#!/usr/bin/python
# coding=utf-8
'''
@author: lenovo
@software: 3.6 PyCharm
@file: 8W信贷数据处理.py
@time: 20170531
@function:Credit data processing and preliminary analysis
信贷数据处理与初步分析
@edition :1.0
'''
#导入模块
from __future__ import division, print_function
import os
import pandas as pd
import zipfile
# Folder that holds the dataset (the data ships as a zip archive).
# Forward slashes are used on purpose: in a normal string literal the
# sequence '\U' starts a unicode escape, so 'C:\Users...' does not compile.
dataset_path = 'C:/Users/lenovo/Desktop/.../dataset'
# Name of the archive file (mind the extension).
zip_file_name = 'loan.zip'
# Name of the CSV file; it is joined onto dataset_path before use.
csv_file_name = './loan.csv'
# Default destination for the generated CSV reports (used when run_main() is
# called without an explicit output_dir).
_DEFAULT_OUTPUT_DIR = 'C:/Users/lenovo/Desktop/.../output'

# Columns of the raw loan table that the analysis actually uses.
_USED_COLS = ['loan_amnt', 'term', 'int_rate', 'grade', 'issue_d', 'addr_state']


def _sum_loan_amount_by_month(used_data):
    """Return total ``loan_amnt`` per calendar month of ``issue_d2``."""
    issue_month = used_data['issue_d2'].dt.to_period('M').rename('issue_month')
    # Only loan_amnt is summed; summing every column would also try to
    # "add up" the text columns (term, grade, ...).
    return used_data.groupby(issue_month)['loan_amnt'].sum().reset_index()


def _save_report(frame, output_dir, file_name):
    """Write *frame* to ``output_dir/file_name`` without the index column."""
    frame.to_csv(os.path.join(output_dir, file_name), index=False)


def run_main(data_dir=None, output_dir=None):
    """Load the loan data set, print previews and write three summary CSVs.

    Steps:
      1. Extract the zip archive into the data folder if the CSV is missing.
      2. Read the CSV and print a preview, ``describe()`` and ``info()``.
      3. Write total ``loan_amnt`` per issue month, total ``loan_amnt`` per
         ``addr_state`` and mean ``int_rate`` per (``grade``, ``term``).
      4. Remove the extracted CSV again to free disk space.

    Args:
        data_dir: Folder containing the archive/CSV. Defaults to the
            module-level ``dataset_path``.
        output_dir: Folder the reports are written to; it is created when it
            does not exist. Defaults to ``_DEFAULT_OUTPUT_DIR``.
    """
    if data_dir is None:
        data_dir = dataset_path
    if output_dir is None:
        output_dir = _DEFAULT_OUTPUT_DIR

    zip_file_path = os.path.join(data_dir, zip_file_name)
    csv_file_path = os.path.join(data_dir, csv_file_name)

    # Extract the archive only when the CSV is not already on disk.
    if not os.path.exists(csv_file_path):
        with zipfile.ZipFile(zip_file_path) as zf:
            zf.extractall(data_dir)

    try:
        # Read the data.
        raw_data = pd.read_csv(csv_file_path, engine='python')

        # Inspect the data set. DataFrame.info() prints by itself and returns
        # None, so it is called directly instead of being wrapped in print().
        print('\n数据预览:', raw_data.head())
        print(' \n 数据描述: ')
        print(raw_data.describe())
        print('\n数据集基本信息: ')
        raw_data.info()

        # Work on an explicit copy so that adding a column below does not
        # write into a slice of raw_data.
        used_data = raw_data[_USED_COLS].copy()
        print('\n数据预览', used_data.head())

        # Q: loan amount per month.
        print('\n时间序列转换ING')
        # issue_d holds month-year text; convert it to datetime.
        used_data['issue_d2'] = pd.to_datetime(used_data['issue_d'])
        print('\n数据预览')
        print(used_data.head())
        print('数据基本信息')
        used_data.info()

        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        loan_amount_by_month_df = _sum_loan_amount_by_month(used_data)
        print('\n按月统计借贷总额预览:', loan_amount_by_month_df.head())
        _save_report(loan_amount_by_month_df, output_dir,
                     'load_amout_by_month.csv')

        # Q: loan amount per state.
        loan_amount_by_state_df = (
            used_data.groupby(['addr_state'])['loan_amnt'].sum().reset_index()
        )
        print('\n按州统计预览', loan_amount_by_state_df.head())
        _save_report(loan_amount_by_state_df, output_dir,
                     'load_amout_by_state.csv')

        # Q: relation between grade, term and interest rate:
        # group by (grade, term) and average int_rate.
        int_rate_by_grade_term_df = (
            used_data.groupby(['grade', 'term'])['int_rate'].mean().reset_index()
        )
        print('\n借贷评级、期限和利率关系预览:', int_rate_by_grade_term_df.head())
        _save_report(int_rate_by_grade_term_df, output_dir,
                     'intrate_by_grade_term.csv')
    finally:
        # Free disk space even when processing failed. The CSV is removed
        # only while the archive is still present, so the data can always be
        # re-extracted and the only copy is never deleted.
        if os.path.exists(csv_file_path) and os.path.exists(zip_file_path):
            os.remove(csv_file_path)
if __name__ == '__main__':
    # Run the analysis only when the file is executed as a script.
    run_main()