本文重点知识:
- 创建带有日期的索引:
dates = pd.date_range('20190924', periods=6)
- head()、tail()
- 按轴排序:索引排序 `sort_index`,默认是 `ascending=True` 升序
  - axis=0:行索引,可以用 `index`
  - axis=1:列索引,可以用 `columns`
- 按值排序:`df.sort_values(by='columns')`,默认升序(`by` 为列名或列名组成的列表)
创建数据
import numpy as np
import pandas as pd
s = pd.Series([1, 3, 5, np.nan, 6, 89])
s
代码语言:javascript复制0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 89.0
dtype: float64
dates = pd.date_range('20190924', periods=6)
dates
代码语言:javascript复制DatetimeIndex(['2019-09-24', '2019-09-25', '2019-09-26', '2019-09-27',
'2019-09-28', '2019-09-29'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
# 同时创建多个不同的列
df2 = pd.DataFrame({'A': 1., # 某列的值相同
'B': pd.Timestamp('20130102'), # 时间戳的创建
'C': pd.Series(1, index=list(range(4)), dtype='float32'), # 某列的值可以是 Series 类型的数据
'D': np.array([3] * 4, dtype='int32'), # 使用numpy数组
'E': pd.Categorical(["test", "train", "test", "train"]), # 分类(Categorical)数据
'F': 'foo'}) # 使用字符串,该列每一行的值都相同
df2
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
df2.dtypes
代码语言:javascript复制A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
查看数据
查看数据的相关信息
- 头、尾几行数据
- index、columns
- describe()(统计摘要)、T(转置)
# 前几行数据,默认是5行
df.head(3)
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
# 查看末几行,默认是5行
df.tail(2)
A | B | C | D | |
---|---|---|---|---|
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
# 查看数据的行索引
df.index
代码语言:javascript复制DatetimeIndex(['2019-09-24', '2019-09-25', '2019-09-26', '2019-09-27',
'2019-09-28', '2019-09-29'],
dtype='datetime64[ns]', freq='D')
代码语言:javascript复制# 查看数据的列属性
df.columns
代码语言:javascript复制Index(['A', 'B', 'C', 'D'], dtype='object')
代码语言:javascript复制df.to_numpy()
代码语言:javascript复制array([[ 0.50000505, 0.16657823, -0.75851253, -0.67917279],
[ 0.09020909, 0.11790626, -0.40218323, 2.26118249],
[-0.80905211, -0.17314443, 0.32491161, -1.88510066],
[ 0.31037417, -1.50208882, 1.52440064, 0.26995285],
[-0.8464884 , -0.23587797, 1.39889574, -0.22957318],
[ 0.9758533 , -0.99839501, -0.51448041, -0.88270375]])
代码语言:javascript复制df.describe()
A | B | C | D | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | 0.036817 | -0.437504 | 0.262172 | -0.190903 |
std | 0.730717 | 0.668113 | 0.997562 | 1.400993 |
min | -0.846488 | -1.502089 | -0.758513 | -1.885101 |
25% | -0.584237 | -0.807766 | -0.486406 | -0.831821 |
50% | 0.200292 | -0.204511 | -0.038636 | -0.454373 |
75% | 0.452597 | 0.045144 | 1.130400 | 0.145071 |
max | 0.975853 | 0.166578 | 1.524401 | 2.261182 |
df.T
2019-09-24 00:00:00 | 2019-09-25 00:00:00 | 2019-09-26 00:00:00 | 2019-09-27 00:00:00 | 2019-09-28 00:00:00 | 2019-09-29 00:00:00 | |
---|---|---|---|---|---|---|
A | 0.500005 | 0.090209 | -0.809052 | 0.310374 | -0.846488 | 0.975853 |
B | 0.166578 | 0.117906 | -0.173144 | -1.502089 | -0.235878 | -0.998395 |
C | -0.758513 | -0.402183 | 0.324912 | 1.524401 | 1.398896 | -0.514480 |
D | -0.679173 | 2.261182 | -1.885101 | 0.269953 | -0.229573 | -0.882704 |
# sort_index
# 索引排序`sort_index`,默认是`ascending=True`升序
# axis=0:行索引,可以用`index`
# axis=1:列索引,可以用`columns`
df.sort_index(axis=1)
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
# 按照行索引进行降序
df.sort_index(axis=0, ascending=False)
A | B | C | D | |
---|---|---|---|---|
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
# 按照某个属性进行排序
# 按照B属性的升序排列
df.sort_values(by="B")
A | B | C | D | |
---|---|---|---|---|
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
df.sort_values(by=["B","C"])
A | B | C | D | |
---|---|---|---|---|
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
选择数据
查看指定的行列数据
代码语言:javascript复制# 指定列属性查看数据
df[["B","C"]]
B | C | |
---|---|---|
2019-09-24 | 0.166578 | -0.758513 |
2019-09-25 | 0.117906 | -0.402183 |
2019-09-26 | -0.173144 | 0.324912 |
2019-09-27 | -1.502089 | 1.524401 |
2019-09-28 | -0.235878 | 1.398896 |
2019-09-29 | -0.998395 | -0.514480 |
# 通过位置切片查看指定的行数据(左闭右开,不包含结束位置)
df[1:3]
A | B | C | D | |
---|---|---|---|---|
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
# 通过行标签切片查看指定的行数据(起止标签都包含在内)
df["20190924":"20190927"]
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
loc
根据标签(不是自带的数字索引)查看数据
代码语言:javascript复制df.loc[dates[0]]
代码语言:javascript复制A 0.500005
B 0.166578
C -0.758513
D -0.679173
Name: 2019-09-24 00:00:00, dtype: float64
代码语言:javascript复制dates[0]
代码语言:javascript复制Timestamp('2019-09-24 00:00:00', freq='D')
代码语言:javascript复制# 选择行和列
df.loc[:, ["A","B"]] # 选择所有行和AB两个列
A | B | |
---|---|---|
2019-09-24 | 0.500005 | 0.166578 |
2019-09-25 | 0.090209 | 0.117906 |
2019-09-26 | -0.809052 | -0.173144 |
2019-09-27 | 0.310374 | -1.502089 |
2019-09-28 | -0.846488 | -0.235878 |
2019-09-29 | 0.975853 | -0.998395 |
# 索引通过标签来实现
df.loc['20190924':'20190927', ['A', 'B']]
A | B | |
---|---|---|
2019-09-24 | 0.500005 | 0.166578 |
2019-09-25 | 0.090209 | 0.117906 |
2019-09-26 | -0.809052 | -0.173144 |
2019-09-27 | 0.310374 | -1.502089 |
# 指定的行或者列可以是切片形式
df.loc['20190924':'20190927', 'A':'B']
A | B | |
---|---|---|
2019-09-24 | 0.500005 | 0.166578 |
2019-09-25 | 0.090209 | 0.117906 |
2019-09-26 | -0.809052 | -0.173144 |
2019-09-27 | 0.310374 | -1.502089 |
iloc数字索引
记忆方法:`iloc` 记为 `intloc`,int 为整型,表示通过数字(位置)来进行索引
df
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
df.iloc[1:3]
A | B | C | D | |
---|---|---|---|---|
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
df.iloc[1:3, 0:2] # 切片形式,连续性
A | B | |
---|---|---|
2019-09-25 | 0.090209 | 0.117906 |
2019-09-26 | -0.809052 | -0.173144 |
df.iloc[[1, 2, 4], [0, 2]] # 行、列索引都可以是离散的值(用列表指定)
A | C | |
---|---|---|
2019-09-25 | 0.090209 | -0.402183 |
2019-09-26 | -0.809052 | 0.324912 |
2019-09-28 | -0.846488 | 1.398896 |
df.iloc[:, 1:3]
B | C | |
---|---|---|
2019-09-24 | 0.166578 | -0.758513 |
2019-09-25 | 0.117906 | -0.402183 |
2019-09-26 | -0.173144 | 0.324912 |
2019-09-27 | -1.502089 | 1.524401 |
2019-09-28 | -0.235878 | 1.398896 |
2019-09-29 | -0.998395 | -0.514480 |
获取具体位置的元素
代码语言:javascript复制df.iloc[1,2]
代码语言:javascript复制-0.4021832300071616
代码语言:javascript复制df.iat[1,2] # 等同上面
代码语言:javascript复制-0.4021832300071616
布尔索引
代码语言:javascript复制df
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-26 | -0.809052 | -0.173144 | 0.324912 | -1.885101 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-28 | -0.846488 | -0.235878 | 1.398896 | -0.229573 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
df[df.A > 0] # 将属性A中大于0的行全部选择出来
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | -0.758513 | -0.679173 |
2019-09-25 | 0.090209 | 0.117906 | -0.402183 | 2.261182 |
2019-09-27 | 0.310374 | -1.502089 | 1.524401 | 0.269953 |
2019-09-29 | 0.975853 | -0.998395 | -0.514480 | -0.882704 |
df[df > 0]
A | B | C | D | |
---|---|---|---|---|
2019-09-24 | 0.500005 | 0.166578 | NaN | NaN |
2019-09-25 | 0.090209 | 0.117906 | NaN | 2.261182 |
2019-09-26 | NaN | NaN | 0.324912 | NaN |
2019-09-27 | 0.310374 | NaN | 1.524401 | 0.269953 |
2019-09-28 | NaN | NaN | 1.398896 | NaN |
2019-09-29 | 0.975853 | NaN | NaN | NaN |