【数据分析可视化】通过apply进行数据预处理

2020-07-07 19:56:04 浏览数 (1)

代码语言:javascript复制
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
代码语言:javascript复制
# Load apply_demo.csv and keep only the first five rows (head() defaults to 5).
link_csv = '/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/apply_demo.csv'
df = pd.read_csv(link_csv).head()
df

time

data

0

1473411962

Symbol: APPL Seqno: 0 Price: 1623

1

1473411962

Symbol: APPL Seqno: 0 Price: 1623

2

1473411963

Symbol: APPL Seqno: 0 Price: 1623

3

1473411963

Symbol: APPL Seqno: 0 Price: 1623

4

1473411963

Symbol: APPL Seqno: 1 Price: 1649

代码语言:javascript复制
# Total number of cells in the frame (rows x columns), not the row count.
df.size
代码语言:javascript复制
10
代码语言:javascript复制
# Build a Series to add as a new column.
# NOTE: it has 10 elements while df shows only 5 rows (df.size counts cells,
# not rows); the column assignment that follows keeps just index labels 0-4.
s1 = Series(['a']*10)
s1
代码语言:javascript复制
0    a
1    a
2    a
3    a
4    a
5    a
6    a
7    a
8    a
9    a
dtype: object
代码语言:javascript复制
# Attach s1 as column 'A'; values are matched to df's rows by index label.
df['A'] = s1
df.head()

time

data

A

0

1473411962

Symbol: APPL Seqno: 0 Price: 1623

a

1

1473411962

Symbol: APPL Seqno: 0 Price: 1623

a

2

1473411963

Symbol: APPL Seqno: 0 Price: 1623

a

3

1473411963

Symbol: APPL Seqno: 0 Price: 1623

a

4

1473411963

Symbol: APPL Seqno: 1 Price: 1649

a

代码语言:javascript复制
# Upper-case every value in column A by applying str.upper element-wise.
df['A'] = df['A'].apply(str.upper)
df

time

data

A

0

1473411962

Symbol: APPL Seqno: 0 Price: 1623

A

1

1473411962

Symbol: APPL Seqno: 0 Price: 1623

A

2

1473411963

Symbol: APPL Seqno: 0 Price: 1623

A

3

1473411963

Symbol: APPL Seqno: 0 Price: 1623

A

4

1473411963

Symbol: APPL Seqno: 1 Price: 1649

A

代码语言:javascript复制
# Inspect one raw entry of the data column before splitting it up.
df['data'][0]
代码语言:javascript复制
' Symbol: APPL Seqno: 0 Price: 1623'
代码语言:javascript复制
# Strip leading/trailing whitespace, then split on single spaces.
l1 = df['data'][0].strip().split(' ')
l1
代码语言:javascript复制
['Symbol:', 'APPL', 'Seqno:', '0', 'Price:', '1623']
代码语言:javascript复制
# The values we want sit at the odd positions, right after each "Key:" token.
l1[1],l1[3],l1[5]
代码语言:javascript复制
('APPL', '0', '1623')
代码语言:javascript复制
# Parse one "Symbol: X Seqno: Y Price: Z" record into its three values.
def foo(line):
    """Extract the symbol, sequence number and price from one record.

    Args:
        line: Text of the form ``'Symbol: APPL Seqno: 0 Price: 1623'``;
            surrounding whitespace is ignored.

    Returns:
        A Series with the default index 0, 1, 2 holding the three values as
        strings, so that ``Series.apply(foo)`` expands into three columns.

    Raises:
        IndexError: If the line has fewer than six whitespace-separated
            tokens.
    """
    # split() with no argument trims the ends and collapses runs of
    # whitespace, so repeated spaces or tabs between tokens cannot shift the
    # value positions the way split(' ') would.
    items = line.split()
    return Series([items[1], items[3], items[5]])
代码语言:javascript复制
# Apply foo to every record; each returned Series becomes one row of a new frame.
df_tmp = df['data'].apply(foo)
df_tmp

0

1

2

0

APPL

0

1623

1

APPL

0

1623

2

APPL

0

1623

3

APPL

0

1623

4

APPL

1

1649

代码语言:javascript复制
# Replace the positional column labels 0/1/2 with descriptive names.
df_tmp = df_tmp.rename(columns={0:'Symbol',1:'Seqno',2:'Price'})
df_tmp

Symbol

Seqno

Price

0

APPL

0

1623

1

APPL

0

1623

2

APPL

0

1623

3

APPL

0

1623

4

APPL

1

1649

代码语言:javascript复制
# Show the original frame again for comparison before merging.
df

time

data

A

0

1473411962

Symbol: APPL Seqno: 0 Price: 1623

A

1

1473411962

Symbol: APPL Seqno: 0 Price: 1623

A

2

1473411963

Symbol: APPL Seqno: 0 Price: 1623

A

3

1473411963

Symbol: APPL Seqno: 0 Price: 1623

A

4

1473411963

Symbol: APPL Seqno: 1 Price: 1649

A

代码语言:javascript复制
# Merge the original frame with the parsed columns.
# NOTE(review): combine_first is a null-patching merge; as the output below
# shows, it returns the union of columns in sorted order and Price/Seqno come
# back as floats. pd.concat([df, df_tmp], axis=1) would presumably keep the
# column order and the string values - confirm before switching.
df_new = df.combine_first(df_tmp)
df_new

A

Price

Seqno

Symbol

data

time

0

A

1623.0

0.0

APPL

Symbol: APPL Seqno: 0 Price: 1623

1473411962

1

A

1623.0

0.0

APPL

Symbol: APPL Seqno: 0 Price: 1623

1473411962

2

A

1623.0

0.0

APPL

Symbol: APPL Seqno: 0 Price: 1623

1473411963

3

A

1623.0

0.0

APPL

Symbol: APPL Seqno: 0 Price: 1623

1473411963

4

A

1649.0

1.0

APPL

Symbol: APPL Seqno: 1 Price: 1649

1473411963

代码语言:javascript复制
# Remove the columns that are no longer needed: the raw data text has already
# been parsed, and A was only a demo column. Both are dropped in place.
df_new.drop(columns=['data', 'A'], inplace=True)
df_new

Price

Seqno

Symbol

time

0

1623.0

0.0

APPL

1473411962

1

1623.0

0.0

APPL

1473411962

2

1623.0

0.0

APPL

1473411963

3

1623.0

0.0

APPL

1473411963

4

1649.0

1.0

APPL

1473411963

代码语言:javascript复制
# Save the cleaned frame to disk for later use.
# NOTE(review): to_csv writes the row index as an extra unnamed column by
# default; pass index=False if that column is not wanted downstream.
df_new.to_csv('/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/demo_duplicate.csv')
代码语言:javascript复制
# IPython shell escape: list the dataset directory to confirm the file was written.
!ls /Users/bennyrhys/Desktop/数据分析可视化-数据集/homework
代码语言:javascript复制
AMZN.csv           apply_demo.csv     iris.csv           top5.csv
BABA.csv           city_weather.csv   movie_metadata.csv train.csv
Pokemon.csv        demo_duplicate.csv sales-funnel.xlsx  usa_flights.csv

0 人点赞