kaggle-top50
top50 是 Kaggle 官网上一个关于音乐的数据集。
There are 50 songs and 13 variables to be explored
新知识
数据本身是比较完美的,没有涉及到太多的数据预处理工作,主要是学习到了多种图形的绘制
- 直方图
- 直方图 + 折线（核密度 KDE 曲线）
- 热力图
- 饼图
- 等高线图
<!--MORE-->
属性
分析过程
导入库和包
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import squarify as sq
from pandas.plotting import scatter_matrix
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # 预处理模块
from sklearn.linear_model import LinearRegression # 线性回归
from sklearn.model_selection import train_test_split,cross_val_score, KFold # 数据分离,交叉验证,K折验证
from sklearn import metrics # evaluation metrics module (MAE, MSE, ...)
from sklearn.metrics import confusion_matrix, classification_report # 混淆矩阵,分类报告
%matplotlib inline
# Enable rendering of Chinese (CJK) characters in matplotlib text.
mpl.rcParams["font.family"] = "sans-serif"
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# CJK fonts such as SimHei commonly lack the Unicode minus glyph (U+2212) that
# matplotlib uses for negative tick labels; fall back to the ASCII hyphen so
# negative values (e.g. loudness in dB) are not drawn as empty boxes.
mpl.rcParams["axes.unicode_minus"] = False
数据查看
# Path of the Kaggle "top50" CSV file on the author's machine.
filename = '/Users/piqianchao/data-visualization/top50.csv'

# Read the CSV into a DataFrame:
#   encoding  - works around the UnicodeError hit with the default encoding
#   engine    - use the pure-Python parser
#   index_col - use the file's first column as the index, not as a feature
data = pd.read_csv(
    filename,
    encoding="ISO-8859-1",
    engine='python',
    index_col=0,
)
# Preview the first rows (displayed when run as a notebook cell).
data.head()
属性重命名rename
# Replace the dotted CSV headers with cleaner column names. The rename is
# applied in place; columns that are not listed keep their original name.
data.rename(
    columns={
        'Track.Name': 'track_name',
        'Artist.Name': 'artist_name',
        'Beats.Per.Minute': 'beats_per_minute',
        'Loudness..dB..': 'Loudness(dB)',
        'Valence.': 'Valence',
        'Length.': 'Length',
        'Acousticness..': 'Acousticness',
        'Speechiness.': 'Speechiness',
    },
    inplace=True,
)
Calculating the number of songs of each genre
# Group the songs by genre and count how many songs fall into each group.
songs_by_genre = data.groupby('Genre')
popular_genre = songs_by_genre.size()
print(popular_genre)

# Genre of every song as a plain Python list (one entry per row of `data`).
genre_list = data['Genre'].tolist()
Calculating the number of songs by each of the artists
# Group the songs by artist and count how many songs each artist has.
songs_by_artist = data.groupby('artist_name')
popular_artist = songs_by_artist.size()
print(popular_artist)

# Artist of every song as a plain Python list (one entry per row of `data`).
artist_list = data['artist_name'].tolist()
查看属性的统计信息
# Limit displayed floats to 3 decimal places. The fully qualified key is
# required: the bare 'precision' pattern is ambiguous in recent pandas
# releases and raises an OptionError.
pd.set_option('display.precision', 3)
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
data.describe()
Finding out the skew for each attribute
找出每个属性的偏度skew
# Skewness of each numeric attribute. `numeric_only=True` skips the string
# columns (track name, artist, genre), which recent pandas versions no longer
# drop silently and would otherwise raise on.
skew = data.skew(numeric_only=True)
print(skew)
# Take the Liveness column as a 1-D float ndarray. scipy's Box-Cox transform
# only accepts 1-dimensional input, so select the column as a Series
# (`data['Liveness']`) rather than as a one-column DataFrame.
transform = data['Liveness'].to_numpy(dtype=float)
print(type(transform))
# boxcox returns (transformed_data, fitted_lambda); keep the transformed data.
data_transform = stats.boxcox(transform)[0]

# Histogram of the original data.
plt.hist(data['Liveness'], bins=10)
plt.title("original data")
plt.show()

# Histogram after the skew-correcting Box-Cox transform.
plt.hist(data_transform, bins=10)
plt.title("skew corrected data")
plt.show()
如何在直方图的基础上画出折线趋势
# Take the Popularity column as a 1-D float ndarray; scipy's Box-Cox transform
# only accepts 1-dimensional input, so a one-column DataFrame must not be used.
transform1 = data['Popularity'].to_numpy(dtype=float)
# boxcox returns (transformed_data, fitted_lambda); keep the transformed data.
data_transform1 = stats.boxcox(transform1)[0]

# Histogram with a KDE trend line drawn on top: original data.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here to preserve the rendered figure — confirm against the installed
# seaborn version before migrating to histplot(kde=True).
sns.distplot(data['Popularity'], bins=10, kde=True,
             kde_kws={"color": "k", "lw": 2, "label": "KDE"}, color='blue')
plt.title("original data")
plt.show()

# Same plot for the skew-corrected (Box-Cox transformed) data.
sns.distplot(data_transform1, bins=10, kde=True,
             kde_kws={"color": "k", "lw": 2, "label": "KDE"}, color='green')
plt.title("skew corrected data")
plt.show()
Bar graph to see the number of songs of each genre
# Bar chart of the number of songs per genre.
fig, ax = plt.subplots(figsize=(30, 12))  # canvas size
length = np.arange(len(popular_genre))    # one bar position per genre
plt.bar(length, popular_genre, color='g', edgecolor='black', alpha=0.7)
# Label each bar with the genre it counts. The labels must come from the
# grouped Series' index: `genre_list` has one entry per song, so it neither
# matches the number of bars nor their order.
plt.xticks(length, popular_genre.index)
plt.title("Most popular genre", fontsize=28)
plt.xlabel("Genre", fontsize=25)
plt.ylabel("Number On Songs", fontsize=25)
plt.show()
相关系数 correlation
如何求解相关系数
# Wrap printed output at 100 characters per line.
pd.set_option('display.width', 100)
# Show at most 3 decimal places. The fully qualified key is required: the
# bare 'precision' pattern is ambiguous in recent pandas releases.
pd.set_option('display.precision', 3)
# Spearman rank correlation between the numeric attributes. Other `method`
# options: 'pearson' (linear relationships) and 'kendall' (rank-based).
# Non-numeric columns are excluded explicitly because recent pandas versions
# no longer drop them silently inside corr().
correclation = data.select_dtypes(include='number').corr(method='spearman')
print(correclation)
8.2 根据相关系数画出热力图
# Annotated heatmap of the correlation matrix on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 10))
plt.title("Correclation heatmap")
# Centre the colormap on 0, the neutral value of a correlation coefficient.
# With center=1 (the top of the range) seaborn rescales the colormap around
# that value and only part of its colours end up being used.
sns.heatmap(correclation, annot=True, vmin=-1, vmax=1, cmap="GnBu_r", center=0)
barh of most popular artists
# Horizontal bar chart of the number of songs per artist.
fig, ax = plt.subplots(figsize=(12, 12))
length = np.arange(len(popular_artist))  # one bar position per artist
# Signature: plt.barh(y, width, height=0.8, left=None, *, align='center', **kwargs)
plt.barh(length, popular_artist, color='r', edgecolor='black', alpha=0.7)
# Label each bar with the artist it counts. The labels must come from the
# grouped Series' index: `artist_list` has one entry per song, so it neither
# matches the number of bars nor their order.
plt.yticks(length, popular_artist.index)
plt.title("Most popular artists", fontsize=18)
plt.ylabel("Artists", fontsize=18)
plt.xlabel("Number of songs", fontsize=16)
plt.show()
Analysing the relationship between energy and loudness
# Scatter plot with a fitted regression line: Energy vs. Loudness.
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is the
# Figure, and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(x='Energy', y='Loudness(dB)', data=data, color='black', ax=ax)
Dependence between energy and popularity
# Regression plot of Energy vs. Popularity, overlaid with a bivariate KDE
# (contour) plot of the same two variables.
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is the
# Figure.
fig, ax = plt.subplots(figsize=(10, 10))
plt.title('Dependence between energy and popularity')
sns.regplot(x='Energy', y='Popularity', ci=None, data=data, ax=ax)
# Pass the two variables by keyword: current seaborn releases reject a second
# positional data vector in kdeplot.
sns.kdeplot(x=data.Energy, y=data.Popularity, ax=ax)
# Treemap of genres; each rectangle's area is the number of songs in a genre.
plt.figure(figsize=(14, 8))
# Take sizes and labels from the same value_counts() result so they stay
# aligned: value_counts() is ordered by frequency while unique() is ordered
# by first appearance, so mixing the two attaches labels to the wrong boxes.
genre_counts = data.Genre.value_counts()
sq.plot(sizes=genre_counts.tolist(), label=genre_counts.index.tolist(), alpha=0.8)
plt.axis('off')
plt.show()
Pie charts 饼图
通过每个歌手和其歌曲数目制作饼图
# Pie chart of each artist's share of the songs.
artist_counts = data.artist_name.value_counts()
labels = artist_counts.index   # label of each wedge (artist name)
sizes = artist_counts.values   # size of each wedge (number of songs)
colors = ['red', 'yellowgreen', 'lightcoral', 'lightskyblue', 'cyan', 'green', 'black', 'yellow']
# Format used to print the percentage inside every wedge. It has to be passed
# to plt.pie; assigning it afterwards has no effect on the chart.
autopct = "%1.1f%%"
plt.figure(figsize=(10, 10))
plt.pie(sizes, labels=labels, colors=colors, autopct=autopct)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()
Linear Regression
数据构建和TTS
# Build the feature matrix and the target vector, then split them into a
# training set (70%) and a test set (30%).
x = data.loc[:, ['Energy', 'Danceability', 'Length', 'Loudness(dB)', 'Acousticness']].values
y = data.loc[:, 'Popularity'].values
# Fix the random seed so the split — and every metric computed from it — is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# Fit an ordinary least-squares linear regression on the training set.
reg = LinearRegression()
reg.fit(X_train, y_train)
预测
# Predict on the held-out test set and tabulate the true values next to the
# predicted ones for comparison.
y_pred = reg.predict(X_test)
data_output = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print(data_output)
# Error metrics of the linear regression on the test set:
# MAE = mean absolute error, MSE = mean squared error, RMSE = sqrt(MSE).
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print("MAE", test_mae)
print("MSE", test_mse)
print("Root MSE:", np.sqrt(test_mse))

# Scatter plot of predicted values against the true test values. Markers
# only: the samples have no meaningful order, so joining them with a line
# would just draw a misleading zigzag.
plt.figure(figsize=(10, 10))
plt.plot(y_pred, y_test, color='black', linestyle='none', marker='*',
         markerfacecolor='red', markersize=10)
plt.title("Error analsis")
plt.xlabel("Predicted values")
plt.ylabel("Test values")
交叉验证
# 5-fold cross-validation of a fresh linear regression on the training split.
# NOTE(review): the original snippet first rebound `x`/`y` to a two-feature
# subset (Energy, Danceability) but never used them — the scores below were
# always computed on X_train/y_train. The dead reassignments are dropped;
# confirm that cross-validating on the training split is what was intended.
reg = LinearRegression()
# scoring='neg_mean_squared_error' yields the negated MSE of each fold.
mse = cross_val_score(reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)
print(mean_mse)
# Gap between the test-set MSE and the (positive) mean cross-validated MSE.
diff = metrics.mean_squared_error(y_test, y_pred) - abs(mean_mse)
print(diff)