以下全文均已发布至「和鲸社区」,复制下面链接,可一键fork跑通:https://www.heywhale.com/mw/project/6240547988d07a00177fe0a2
导入相关库
代码语言:javascript复制import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from itertools import combinations
from joblib import Parallel, delayed
读取文件
代码语言:javascript复制path = "/home/mw/input/julei8185/julei/julei"
data = {}
for root,dirs,files in os.walk(path):
for file in files:
TCFile=os.path.join(root,file)
track = pd.read_excel(TCFile)
track1 = track [['time','lat','lon']]
track1 = track1[~(track1['time'] == 999)] #去掉数值为999的或缺省值
data[str(file[:-4])] = track1 #用文件名作为字典的键和数据一起写入字典
定义向量之间的距离
代码语言:javascript复制def OneWayHausdorffDistance(ptSetA, ptSetB):
# 计算任意向量之间的距离,假设ptSetA有n个向量,ptSetB有m个向量
# 得到矩阵C(n行m列)Cij代表A中都第i个向量到B中第j向量都距离
dist = cdist(ptSetA, ptSetB, metric='euclidean')
# np.min(dist,axis=1):计算每一行的的最小值
# 即:固定点集A的值,求点集A中到集合B的最小值
return np.max(np.min(dist, axis=1))
# ptSetA:输入的第一个点集
# ptSetB:输入的第二个点集
# Hausdorff距离度量了两个点集间的最大不匹配程度
定义Hausdorff距离距离
代码语言:javascript复制def HausdorffDistance(ptSetA, ptSetB):
res = np.array([
OneWayHausdorffDistance(ptSetA, ptSetB),
OneWayHausdorffDistance(ptSetB, ptSetA)
])
return np.max(res)
计算轨迹段的距离矩阵
代码语言:javascript复制def DistanceMat(data,w=[1]):
'''
功能:计算轨迹段的距离矩阵
输出:距离矩阵
'''
#要计算的组合
ptCom = list(combinations(list(data.keys()),2))
#基于轨迹的距离
distance_tra = Parallel(n_jobs=8,verbose=False)(delayed(HausdorffDistance)(
data[ptSet1][['lat','lon']],data[ptSet2][['lat','lon']]
) for ptSet1,ptSet2 in ptCom)
distancemat_tra = pd.DataFrame(ptCom)
distancemat_tra['distance'] = distance_tra
distancemat_tra = distancemat_tra.pivot(index=0,columns=1,values='distance')
for pt1 in data.keys():
distancemat_tra.loc[str(pt1),str(pt1)] = 0
distancemat_tra = distancemat_tra.fillna(0)
distancemat_tra = distancemat_tra.loc[list(data.keys()),list(data.keys())]
distancemat_tra = distancemat_tra distancemat_tra.T
distancemat_tra = (distancemat_tra-distancemat_tra.min().min())/(distancemat_tra.max().max()-distancemat_tra.min().min())
distancemat = w[0]*distancemat_tra
return distancemat
distancemat = DistanceMat(data,w=[1])
自定义Kmean函数
代码语言:javascript复制class KMeans:
def __init__(self,n_clusters=4,Q=180,max_iter=100): #Q是样本数,max_iter是迭代数
self.n_clusters = n_clusters #聚类数
self.Q = Q
self.max_iter = max_iter # 最大迭代数
def fit(self,distancemat):
#选择初始中心
best_c = random.sample(distancemat.columns.tolist(),1)
for i in range(self.n_clusters-1):
best_c = random.sample(distancemat.loc[(distancemat[best_c[-1]]>self.Q)&(~distancemat.index.isin(best_c))].index.tolist(),1)
center_init = distancemat[best_c] #选择最小的样本组合为初始质心
self._init_center = center_init
#迭代停止条件
iter_ = 0
run = True
#开始迭代
while (iter_<self.max_iter)&(run==True):
#聚类聚类标签更新
labels_ = np.argmin(center_init.values,axis=1)
#聚类中心更新
best_c_ = [distancemat.iloc[labels_== i,labels_==i].sum().idxmin() for i in range(self.n_clusters)]
center_init_ = distancemat[best_c_]
#停止条件
iter_ = 1
if best_c_ == best_c:
run = False
center_init = center_init_.copy()
best_c = best_c_.copy()
#记录数据
self.labels_ = np.argmin(center_init.values,axis=1)
self.center_tra = center_init.columns.values
self.num_iter = iter_
self.sse = sum([sum(center_init.iloc[self.labels_==i,i]) for i in range(self.n_clusters)])
聚类
代码语言:javascript复制SSE = []
for i in range(2,16):
kmeans = KMeans(n_clusters=i,Q=0.01,max_iter=100)
kmeans.fit(distancemat)
SSE.append(kmeans.sse)
#画图
plt.figure(0)
plt.plot(SSE)
plt.show()
使用最好结果进行聚类
代码语言:javascript复制n_clusters=5
kmeans = KMeans(n_clusters=n_clusters,Q=0.01,max_iter=100)
kmeans.fit(distancemat)
kmeans.sse #输出sse
kmeans.labels_ #输出标签
kmeans.center_tra #输出聚类中心
画图
代码语言:javascript复制plt.figure(1)
for i in range(n_clusters):
for name in distancemat.columns[kmeans.labels_==i]:
plt.plot(data[name].loc[:,'lon'],data[name].loc[:,'lat'],c=sns.xkcd_rgb[list(sns.xkcd_rgb.keys())[i]])
plt.show()
结果输出
代码语言:javascript复制#保存每一个轨迹属于哪一类
kmeans_result = pd.DataFrame(columns=['label','id'])
for i in range(n_clusters):
kmeans_result.loc[i] = [i,distancemat.columns[kmeans.labels_==i].tolist()]
f = open(r'result.txt', 'wt', encoding='utf8')
# 写入文件内容
for list_mem in kmeans_result.values:
f.write(str(list_mem).replace("'","") "n")
##关闭文件
f.close()
print('commplete ok!')