"""Walk through the basic descriptive-statistics helpers of pandas.

The script builds two small DataFrames and one Series and prints the result
of the common reductions (mean, count, min/max, quantile, sum, median, std,
var, skew, kurt), the cumulative operations, ``unique``, ``value_counts``
and ``isin``.

The "Output" comments show what the original run printed.  Exact repr
details (dtype names, array types) can differ between pandas versions.
"""
import pandas as pd
import numpy as np

# key1/key2 are float columns containing one NaN each; key3 mixes ints and
# strings and therefore ends up with dtype ``object``.
df = pd.DataFrame({
    'key1': [4, 5, 3, np.nan, 2],
    'key2': [1, 2, np.nan, 4, 5],
    'key3': [1, 2, 3, 'j', 'k'],
}, index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype, df['key2'].dtype, df['key3'].dtype)
print('-------')
# Output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# float64 float64 object
# -------

# Column-wise mean.  NaN is skipped by default.  ``numeric_only=True`` is
# passed explicitly: pandas 2.x no longer drops non-numeric columns silently,
# so without it the mixed int/str column key3 raises TypeError.
print(df.mean(numeric_only=True))
# Output:
# key1    3.5
# key2    3.0
# dtype: float64

# Mean without skipping NaN: ``skipna`` defaults to True; with False every
# column that contains a NaN yields NaN.
m3 = df.mean(skipna=False, numeric_only=True)
print(m3)
# Output:
# key1   NaN
# key2   NaN
# dtype: float64

# Mean of a single column.
print('计算单一列的均值', df['key2'].mean())
# Output:
# 计算单一列的均值 3.0

df2 = pd.DataFrame({
    'key1': [1, 3, 5],
    'key2': [2, 4, 6],
    'key3': [3, 5, 7],
}, index=['a', 'b', 'c'])
# Row-wise mean (axis=1) of df2, stored as a new column.
df2['mean'] = df2.mean(axis=1)
print(df2)
# Output:
#    key1  key2  key3  mean
# a     1     2     3   2.0
# b     3     4     5   4.0
# c     5     6     7   6.0

# count(): number of non-NaN values per column.
print(df)
print('-' * 6)
print(df.count())
# Output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# key1    4
# key2    4
# key3    5
# dtype: int64

# Further reductions.  Every DataFrame-level call restricts itself to the
# numeric columns explicitly, because key3 holds both ints and strings and
# cannot be ordered or summed.
print(df)
print('-' * 6)
print('df的最小值', df.min(numeric_only=True))
print('df的最大值', df.max(numeric_only=True))
print('df的key2列的最大值', df['key2'].max())
print('统计df的分位数,参数q确定位置', df.quantile(q=0.75, numeric_only=True))
print('对df求和', df.sum(numeric_only=True))
print('求df的中位数,median(),50%分位数', df.median(numeric_only=True))
print('求df的标准差,std()', df.std(numeric_only=True))
print('求df的方差,var()', df.var(numeric_only=True))
print('求skew样本的偏度,skew()', df.skew(numeric_only=True))
print('求kurt样本的峰度,kurt()', df.kurt(numeric_only=True))
# Cumulative operations on one column; the NaN position stays NaN in the
# result and is skipped in the running value.
print('df累计求和,cumsum()', df['key2'].cumsum())
print('df累计求积,cumprod()', df['key2'].cumprod())
print('求df的累计最大值,cummax()', df['key2'].cummax())
print('求df的累计最小值,cummin()', df['key2'].cummin())
# Output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
# ------
# df的最小值 key1    2.0
# key2    1.0
# dtype: float64
# df的最大值 key1    5.0
# key2    5.0
# dtype: float64
# df的key2列的最大值 5.0
# 统计df的分位数,参数q确定位置 key1    4.25
# key2    4.25
# Name: 0.75, dtype: float64
# 对df求和 key1    14.0
# key2    12.0
# dtype: float64
# 求df的中位数,median(),50%分位数 key1    3.5
# key2    3.0
# dtype: float64
# 求df的标准差,std() key1    1.290994
# key2    1.825742
# dtype: float64
# 求df的方差,var() key1    1.666667
# key2    3.333333
# dtype: float64
# 求skew样本的偏度,skew() key1    0.0
# key2    0.0
# dtype: float64
# 求kurt样本的峰度,kurt() key1   -1.2
# key2   -3.3
# dtype: float64
# df累计求和,cumsum() a     1.0
# b     3.0
# c     NaN
# d     7.0
# e    12.0
# Name: key2, dtype: float64
# df累计求积,cumprod() a     1.0
# b     2.0
# c     NaN
# d     8.0
# e    40.0
# Name: key2, dtype: float64
# 求df的累计最大值,cummax() a    1.0
# b    2.0
# c    NaN
# d    4.0
# e    5.0
# Name: key2, dtype: float64
# 求df的累计最小值,cummin() a    1.0
# b    1.0
# c    NaN
# d    1.0
# e    1.0
# Name: key2, dtype: float64

# unique(): distinct values in order of first appearance.
s = pd.Series(list('kjdhsakjdhjfh'))
sq = s.unique()
print(s)
print(sq)
print('sq的类型:', type(sq))
# unique() does not sort; wrap the result in a Series to sort it.
print('对sq进行重新排序:', pd.Series(sq).sort_values())
# Output:
# 0     k
# 1     j
# 2     d
# 3     h
# 4     s
# 5     a
# 6     k
# 7     j
# 8     d
# 9     h
# 10    j
# 11    f
# 12    h
# dtype: object
# ['k' 'j' 'd' 'h' 's' 'a' 'f']
# sq的类型: <class 'numpy.ndarray'>
# 对sq进行重新排序: 5    a
# 2    d
# 6    f
# 3    h
# 1    j
# 0    k
# 4    s
# dtype: object

# value_counts() on one column: occurrences of each distinct value (NaN is
# excluded by default).  DataFrame.value_counts also exists (pandas >= 1.1)
# and counts distinct rows rather than single values.
print(df['key2'].value_counts())

# isin(): element-wise test of whether each cell equals one of the given
# values; the result is a boolean DataFrame of the same shape.
print(df)
df_isin = df.isin([1, 3])
print(df_isin)
# Output:
#    key1  key2 key3
# a   4.0   1.0    1
# b   5.0   2.0    2
# c   3.0   NaN    3
# d   NaN   4.0    j
# e   2.0   5.0    k
#
#     key1   key2   key3
# a  False   True   True
# b  False  False  False
# c   True  False   True
# d  False  False  False
# e  False  False  False