点击下方公众号,回复资料,收获惊喜
以下全文代码和数据均已发布至和鲸社区,复制下面链接前往,可一键fork跑通:
https://www.heywhale.com/mw/project/6302faacf31025b7777230c9
本文根据《K-均值聚类法用于西北太平洋热带气旋路径分类》文献中的聚类方法,对印度洋的台风路径进行聚类分析。 其核心原理就是通过计算每条台风路径的经、纬向质心,以及经、纬、对角向的方差,作为聚类的依据,使用KMEAN算法将上述5个特征进行分类。 最后将分类后的结构进行可视化展示。
导入相关库
代码语言:javascript复制import pandas as pd
import numpy as np
import glob
import cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeat
from cartopy.mpl.ticker import LongitudeFormatter,LatitudeFormatter
from cartopy.io.shapereader import Reader, natural_earth
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.image import imread
import shapely.geometry as sgeom
import cartopy.io.shapereader as shpreader
import matplotlib.lines as mlines
from sklearn.cluster import KMeans
读取数据&相应预处理
代码语言:javascript复制files = glob.glob(f'/home/mw/input/india_typhoon6005/indian/*/*')
df_list = []
i = 1
for f in files[:]:
lines = open(f, 'r').readlines()
max_line = max(lines, key=len)
max_line_num = len(max_line.split(','))
df = pd.read_csv(f, sep=',', header=None, error_bad_lines=False, names=range(max_line_num))
df['count'] = i
df = df[df.columns.drop('count').insert(0, 'count')]
df = df.loc[:, :8].drop(columns=[0, 3, 4, 5])
df.columns.values[:6] = ['count', 'num', 'time', 'lat', 'lon', 'speed']
df['time'] = pd.to_datetime(df['time'], format='%Y%m%d%H')
df['year'] = df['time'][0].year
df['month'] = df['time'][0].month
df['day'] = df['time'][0].day
df['y'] = df['lat'].apply(lambda x: float(x[:-1]) / 10)
df['x'] = df['lon'].apply(lambda x: float(x[:-1]) / 10)
df['speed'] = df['speed'] * 0.514
df['w'] = df['speed'] ** 0.5
if (df['time'].iloc[-1] - df['time'].iloc[0]).days >= 1 and df['speed'].max() >= 17.2:
df_list.append(df)
i = i 1
data = pd.concat(df_list)
data['w'] = data['w'].replace(0, np.nan)
data.dropna(axis='rows', how='any', inplace=True)
data.drop(columns=['lat', 'lon'], inplace=True)
代码语言:javascript复制data
根据文献设置相应特征
代码语言:javascript复制tc = data
代码语言:javascript复制tc['wx'] = tc['w'] * tc['x']
tc['wy'] = tc['w'] * tc['y']
tc
代码语言:javascript复制tc['wx_sum'] = tc.groupby(['count'])['wx'].transform('sum')
tc['wy_sum'] = tc.groupby(['count'])['wy'].transform('sum')
tc['w_sum'] = tc.groupby(['count'])['w'].transform('sum')
tc['x_mean'] = tc['wx_sum'] / tc['w_sum']
tc['y_mean'] = tc['wy_sum'] / tc['w_sum']
tc['x_var'] = tc['wx_sum'] / tc['w_sum']
tc['x_var'] = (tc['x'] - tc['x_mean']) ** 2 * tc['w']
tc['y_var'] = (tc['y'] - tc['y_mean']) ** 2 * tc['w']
tc['xy_var'] = (tc['y'] - tc['y_mean']) * (tc['x'] - tc['x_mean']) * tc['w']
tc['x_var_sum'] = tc.groupby(['count'])['x_var'].transform('sum')
tc['y_var_sum'] = tc.groupby(['count'])['y_var'].transform('sum')
tc['xy_var_sum'] = tc.groupby(['count'])['xy_var'].transform('sum')
tc['x_var_mean'] = tc['x_var_sum'] / tc['w_sum']
tc['y_var_mean'] = tc['y_var_sum'] / tc['w_sum']
tc['xy_var_mean'] = tc['xy_var_sum'] / tc['w_sum']
tc
代码语言:javascript复制tc_group = tc.groupby('count').mean()[['x_mean', 'y_mean', 'x_var_mean', 'y_var_mean', 'xy_var_mean']]
tc_group
运行算法
代码语言:javascript复制kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(tc_group)
代码语言:javascript复制KMeans(n_clusters=4, random_state=0)
查看算法输出结果
代码语言:javascript复制print(kmeans.labels_)
代码语言:javascript复制[2 2 0 0 2 3 0 2 3 0 3 2 0 2 2 2 2 2 2 3 3 2 2 3 2 2 1 2 3 2 2 2 2 2 3 2 2
3 2 2 0 0 2 3 2 2 2 2 2 3 2 2 0 2 2 0 2 3 2 2 0 2 2 1 2 0 0 1 2 2 3 2 0 0
3 2 2 2 3 2 2 1 2 2 2 2 2 0 0 2 3 3 3 3 2 2 3 2 3 2 2 2 2 2 1 2 3 3 3 3 2
2 2 2 2 3 2 3 2 3 3 0 2 2 2 2 2 2 0 2 2 3 2 2 2 3 0 2 3 2 2 2 2 2 2 0 2 2
2 3 2 2 3 3 2 0 3 2 3 3 0 2 3 2 2 3 2 2 0 2 0 2 2 3 2 3 2 3 2 3 3 3 2 3 2
2 1 2 2 3 2 3 3 2 2 3 2 0 2 2 3 3 3 3 3 3 2 3 3 2 2]
代码语言:javascript复制print(kmeans.cluster_centers_)
代码语言:javascript复制[[ 73.8071703 11.65878343 39.74780613 7.47642322 -11.02116106]
[ 78.00734193 10.96868728 112.7133901 4.30306716 -2.60074766]
[ 86.76754499 14.82524246 6.22982994 7.68770382 -0.54250094]
[ 63.71074018 15.72378535 8.58366153 3.80791174 -1.27377464]]
代码语言:javascript复制tc_group['label'] = kmeans.labels_ 1
代码语言:javascript复制tc_group['label'].value_counts()
代码语言:javascript复制3 121
4 59
1 25
2 6
Name: label, dtype: int64
代码语言:javascript复制tc_group
代码语言:javascript复制tc.set_index(['count'], inplace=True)
tc['label'] = tc_group['label']
代码语言:javascript复制tc
将结果可视化
代码语言:javascript复制# 分级设色
def get_color(w):
if w <= 13.8:
color='#FFFF00'
elif 13.9<= w <= 17.1:
color='#6495ED'
elif 17.2 <= w <= 24.4:
color='#3CB371'
elif 24.5<= w <= 32.6:
color='#FFA500'
elif 32.7 <= w <= 61.3:
color='#FF00FF'
else:
color='#DC143C'
return color
代码语言:javascript复制def create_map(title, extent):
fig = plt.figure(figsize=(12, 8), dpi=400)
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.imshow(
imread('/home/mw/input/natural_earth8967/NE1_50M_SR_W.tif'),
origin='upper',
transform=ccrs.PlateCarree(),
extent=[-180, 180, -90, 90]
)
ax.set_extent(extent,crs=ccrs.PlateCarree())
gl = ax.gridlines(draw_labels=False, linewidth=1, color='k', alpha=0.5, linestyle='--')
gl.top_labels = gl.right_labels = False
ax.set_xticks(np.arange(extent[0], extent[1] 5, 5))
ax.set_yticks(np.arange(extent[2], extent[3] 5, 5))
ax.xaxis.set_major_formatter(LongitudeFormatter())
ax.xaxis.set_minor_locator(plt.MultipleLocator(1))
ax.yaxis.set_major_formatter(LatitudeFormatter())
ax.yaxis.set_minor_locator(plt.MultipleLocator(1))
ax.tick_params(axis='both', labelsize=10, direction='out')
province = shpreader.Reader('/home/mw/input/cn_shp3641/Province_9.shp')
ax.add_geometries(province.geometries(), crs=ccrs.PlateCarree(), linewidths=0.1,edgecolor='k',facecolor='none')
a = mlines.Line2D([],[],color='#FFFF00',marker='o',markersize=7, label='D',ls='')
b = mlines.Line2D([],[],color='#6495ED', marker='o',markersize=7, label='DD',ls='')
c = mlines.Line2D([],[],color='#3CB371', marker='o',markersize=7, label='CS',ls='')
d = mlines.Line2D([],[],color='#FFA500', marker='o',markersize=7, label='SCS',ls='')
e = mlines.Line2D([],[],color='#FF00FF', marker='o',markersize=7, label='VSCS',ls='')
f = mlines.Line2D([],[],color='#DC143C', marker='o',markersize=7, label='SuperCS',ls='')
ax.legend(handles=[a,b,c,d,e,f], numpoints=1, handletextpad=0, loc='upper left', shadow=True)
plt.title(f'{title} Typhoon Track', fontsize=15)
return ax
代码语言:javascript复制for label in tc_group['label'].value_counts().index[:]:
tc_number = tc_group["label"].value_counts()[label]
print(f'label:{label}, tc number:{tc_number}')
one_type = tc[tc['label']==label].reset_index()
ax = create_map(f'Type {label}', [40, 110, 0, 30])
for num in one_type['count'].value_counts().index:
df = one_type[one_type['count']==num]
for i in range(len(df))[:1]:
ax.scatter(list(df['x'])[i], list(df['y'])[i], marker='o', s=10, color='k')
for i in range(len(df)-1):
pointA = list(df['x'])[i],list(df['y'])[i]
pointB = list(df['x'])[i 1],list(df['y'])[i 1]
ax.add_geometries([sgeom.LineString([pointA, pointB])], color=get_color(list(df['speed'])[i 1]),crs=ccrs.PlateCarree(), linewidth=2)
plt.savefig(f'track_type{label}_typhoon.png')
代码语言:javascript复制label:3, tc number:121
label:4, tc number:59
label:1, tc number:25
label:2, tc number:6
问题讨论
本次复现的工作其实并没有全部完成,在确定台风分类数量上只是随机选择了一个数,但实际文献中是给了一个确定分类个数的方法的:
在这里是抛砖引玉,感兴趣的盆友们可以自行fork本项目,添加后续解决方案。
❝参考文献:K-均值聚类法用于西北太平洋热带气旋路径分类 ❞