import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# 读取apply_demo.csv数据
link_csv = '/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/apply_demo.csv'
df = pd.read_csv(link_csv).head()
df
| | time | data |
---|---|---|
0 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 |
1 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 |
2 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 |
3 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 |
4 | 1473411963 | Symbol: APPL Seqno: 1 Price: 1649 |
df.size
10
# 新加一列Series
s1 = Series(['a']*10)
s1
0 a
1 a
2 a
3 a
4 a
5 a
6 a
7 a
8 a
9 a
dtype: object
df['A'] = s1
df.head()
| | time | data | A |
---|---|---|---|
0 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | a |
1 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | a |
2 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | a |
3 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | a |
4 | 1473411963 | Symbol: APPL Seqno: 1 Price: 1649 | a |
# 将A列小写全变为大写(函数.apply(str.upper))
df['A'] = df['A'].apply(str.upper)
df
| | time | data | A |
---|---|---|---|
0 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | A |
1 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | A |
2 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | A |
3 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | A |
4 | 1473411963 | Symbol: APPL Seqno: 1 Price: 1649 | A |
# 切分去除data数据
df['data'][0]
' Symbol: APPL Seqno: 0 Price: 1623'
# 去除头尾strip,且空格分割split
l1 = df['data'][0].strip().split(' ')
l1
['Symbol:', 'APPL', 'Seqno:', '0', 'Price:', '1623']
# 想要的是字典值
l1[1],l1[3],l1[5]
('APPL', '0', '1623')
# 写分割返回函数
def foo(line):
    """Extract the symbol, sequence number and price from one raw record.

    The record is expected to look like
    ``' Symbol: APPL Seqno: 0 Price: 1623'``: three ``label value`` pairs
    separated by whitespace.

    Args:
        line: The raw record string.

    Returns:
        A Series holding the three values as strings at positions 0, 1
        and 2 (symbol, sequence number, price).

    Raises:
        IndexError: If the record contains fewer than six
            whitespace-separated tokens.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so doubled spaces cannot shift the value
    # positions the way split(' ') would.
    items = line.split()
    return Series([items[1], items[3], items[5]])
# 分割完生成新的数框
df_tmp = df['data'].apply(foo)
df_tmp
| | 0 | 1 | 2 |
---|---|---|---|
0 | APPL | 0 | 1623 |
1 | APPL | 0 | 1623 |
2 | APPL | 0 | 1623 |
3 | APPL | 0 | 1623 |
4 | APPL | 1 | 1649 |
# 新的数框 重命名
df_tmp = df_tmp.rename(columns={0:'Symbol',1:'Seqno',2:'Price'})
df_tmp
| | Symbol | Seqno | Price |
---|---|---|---|
0 | APPL | 0 | 1623 |
1 | APPL | 0 | 1623 |
2 | APPL | 0 | 1623 |
3 | APPL | 0 | 1623 |
4 | APPL | 1 | 1649 |
df
| | time | data | A |
---|---|---|---|
0 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | A |
1 | 1473411962 | Symbol: APPL Seqno: 0 Price: 1623 | A |
2 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | A |
3 | 1473411963 | Symbol: APPL Seqno: 0 Price: 1623 | A |
4 | 1473411963 | Symbol: APPL Seqno: 1 Price: 1649 | A |
# 新旧两个数框 结合
df_new = df.combine_first(df_tmp)
df_new
| | A | Price | Seqno | Symbol | data | time |
---|---|---|---|---|---|---|
0 | A | 1623.0 | 0.0 | APPL | Symbol: APPL Seqno: 0 Price: 1623 | 1473411962 |
1 | A | 1623.0 | 0.0 | APPL | Symbol: APPL Seqno: 0 Price: 1623 | 1473411962 |
2 | A | 1623.0 | 0.0 | APPL | Symbol: APPL Seqno: 0 Price: 1623 | 1473411963 |
3 | A | 1623.0 | 0.0 | APPL | Symbol: APPL Seqno: 0 Price: 1623 | 1473411963 |
4 | A | 1649.0 | 1.0 | APPL | Symbol: APPL Seqno: 1 Price: 1649 | 1473411963 |
# 去掉多余已经处理的data
del df_new['data']
del df_new['A']
df_new
| | Price | Seqno | Symbol | time |
---|---|---|---|---|
0 | 1623.0 | 0.0 | APPL | 1473411962 |
1 | 1623.0 | 0.0 | APPL | 1473411962 |
2 | 1623.0 | 0.0 | APPL | 1473411963 |
3 | 1623.0 | 0.0 | APPL | 1473411963 |
4 | 1649.0 | 1.0 | APPL | 1473411963 |
# 转存到外部继续用
df_new.to_csv('/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/demo_duplicate.csv')
!ls /Users/bennyrhys/Desktop/数据分析可视化-数据集/homework
AMZN.csv apply_demo.csv iris.csv top5.csv
BABA.csv city_weather.csv movie_metadata.csv train.csv
Pokemon.csv demo_duplicate.csv sales-funnel.xlsx usa_flights.csv