Python信贷数据处理与初步分析(ZIP解压)

2019-02-14 12:04:29 浏览数 (1)

代码语言:python
#!/usr/bin/python
# coding=utf-8
'''
     
@author: lenovo
@software: 3.6 PyCharm
@file: 8W信贷数据处理.py
@time: 20170531
@function:Credit data processing and preliminary analysis
          信贷数据处理与初步分析
@edition :1.0
'''
#导入模块
from __future__ import division, print_function
import os
import pandas as pd
import zipfile

# Folder that contains the zipped dataset.
# Forward slashes are used on purpose: in a normal (non-raw) Python 3 string
# literal a backslash followed by "U" starts a unicode escape, so a Windows
# path written with single backslashes fails to compile.
# NOTE(review): the "..." segment looks like a placeholder for the real
# sub-folder -- replace it with the actual location before running.
dataset_path = 'C:/Users/lenovo/Desktop/.../dataset'
# Name of the zip archive inside dataset_path (extension included).
zip_file_name = 'loan.zip'
# Path of the CSV relative to dataset_path (joined with it in run_main).
csv_file_name = './loan.csv'

def run_main():
    '''
    Extract the loan dataset, print an overview and export three summaries.

    Steps performed:
      1. Unzip ``zip_file_name`` into ``dataset_path`` when the CSV is absent.
      2. Load the CSV and print a preview, ``describe()`` and ``info()``.
      3. Keep six columns and write three aggregates as CSV files:
         total loan amount per issue month, total loan amount per state,
         and mean interest rate per (grade, term).
      4. Delete the extracted CSV again to free disk space.

    Reads the module-level ``dataset_path``, ``zip_file_name`` and
    ``csv_file_name``. Returns None.
    '''
    zip_file_path = os.path.join(dataset_path, zip_file_name)
    csv_file_path = os.path.join(dataset_path, csv_file_name)
    # NOTE(review): "..." looks like a placeholder path segment -- confirm.
    output_dir = 'C:/Users/lenovo/Desktop/.../output'

    # Extract the archive only when the CSV is not already on disk.
    if not os.path.exists(csv_file_path):
        with zipfile.ZipFile(zip_file_path) as zf:
            zf.extractall(dataset_path)

    # Load the data.
    raw_data = pd.read_csv(csv_file_path, engine='python')

    # Overview of the full data set.
    print('\n数据预览:')
    print(raw_data.head())

    print('\n数据描述: ')
    print(raw_data.describe())

    print('\n数据集基本信息: ')
    # info() prints by itself and returns None, so it is not wrapped in print().
    raw_data.info()

    # Select the columns used below. copy() makes an independent frame so
    # that adding a column does not trigger SettingWithCopyWarning.
    used_cols = ['loan_amnt', 'term', 'int_rate', 'grade', 'issue_d', 'addr_state']
    used_data = raw_data[used_cols].copy()

    print('\n数据预览')
    print(used_data.head())

    # Q: total loan amount per month.
    print('\n时间序列转换ING')
    # issue_d is stored as text; convert it to datetime.
    used_data['issue_d2'] = pd.to_datetime(used_data['issue_d'])
    print('\n数据预览')
    print(used_data.head())
    print('数据基本信息')
    used_data.info()

    # Group directly by month period and sum only loan_amnt; summing every
    # column would also try to aggregate the text columns.
    issue_month = used_data['issue_d2'].dt.to_period('M').rename('issue_month')
    load_amout_group_by_month_df = (
        used_data.groupby(issue_month)['loan_amnt'].sum().reset_index()
    )
    print('\n按月统计借贷总额预览:')
    print(load_amout_group_by_month_df.head())

    # Make sure the output folder exists before writing any result.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    load_amout_group_by_month_df.to_csv(
        os.path.join(output_dir, 'load_amouta_by_month.csv'), index=False)

    # Q: total loan amount per state.
    load_amout_group_by_state_df = (
        used_data.groupby('addr_state')['loan_amnt'].sum().reset_index()
    )
    print('\n按州统计预览')
    print(load_amout_group_by_state_df.head())
    load_amout_group_by_state_df.to_csv(
        os.path.join(output_dir, 'load_amout_by_state.csv'), index=False)

    # Q: relation between grade, term and interest rate
    # (mean int_rate per (grade, term) pair).
    data_group_by_grade_term_df = (
        used_data.groupby(['grade', 'term'])['int_rate'].mean().reset_index()
    )
    print('\n借贷评级、期限和利率关系预览:')
    print(data_group_by_grade_term_df.head())
    data_group_by_grade_term_df.to_csv(
        os.path.join(output_dir, 'intrate_by_grade_term.csv'), index=False)

    # Remove the extracted CSV to free disk space, but only when the zip is
    # still available to restore it from; otherwise the data would be lost.
    if os.path.exists(zip_file_path) and os.path.exists(csv_file_path):
        os.remove(csv_file_path)
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run_main()

0 人点赞