NaN means "Not a Number"
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# 创建NaN
n = np.nan
# 类型
type(n)
float
# 任何数字和nan做计算,结果永远是nan
m = 1
m + n
nan
NaN in Series
# 创建含nan的Series
s1 = Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])
s1
A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64
# 判断是否为nan
s1.isnull()
A False
B False
C True
D False
E False
dtype: bool
s1.notnull()
A True
B True
C False
D True
E True
dtype: bool
# 删除掉nan
s1.dropna()
A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64
NaN in DataFrame
# 创建含有nan的DataFrame
df1 = DataFrame(np.random.rand(25).reshape(5,5))
df1.ix[2,4] = np.nan
df1.ix[1,3] = np.nan
df1
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
This is separate from the ipykernel package so we can avoid doing imports until
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
after removing the cwd from sys.path.
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.912220 | 0.932765 | 0.827517 | 0.031858 | 0.749619 |
1 | 0.957043 | 0.857664 | 0.616395 | NaN | 0.562609 |
2 | 0.686575 | 0.016802 | 0.030477 | 0.609545 | NaN |
3 | 0.543484 | 0.555226 | 0.138279 | 0.979043 | 0.460136 |
4 | 0.870316 | 0.141909 | 0.567168 | 0.116696 | 0.204007 |
# 判断nan
df1.isnull()
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | False | False | False | False | False |
1 | False | False | False | True | False |
2 | False | False | False | False | True |
3 | False | False | False | False | False |
4 | False | False | False | False | False |
df1.notnull()
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | True | True | True | True | True |
1 | True | True | True | False | True |
2 | True | True | True | True | False |
3 | True | True | True | True | True |
4 | True | True | True | True | True |
# 删除的使用(df二维的,因此略有不同)
# axis=0所有带nan的行全部删除
df2 = df1.dropna(axis=0)
df2
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.912220 | 0.932765 | 0.827517 | 0.031858 | 0.749619 |
3 | 0.543484 | 0.555226 | 0.138279 | 0.979043 | 0.460136 |
4 | 0.870316 | 0.141909 | 0.567168 | 0.116696 | 0.204007 |
# axis=1所有带nan的列全部删除
df2 = df1.dropna(axis=1)
df2
| | 0 | 1 | 2 |
---|---|---|---|
0 | 0.912220 | 0.932765 | 0.827517 |
1 | 0.957043 | 0.857664 | 0.616395 |
2 | 0.686575 | 0.016802 | 0.030477 |
3 | 0.543484 | 0.555226 | 0.138279 |
4 | 0.870316 | 0.141909 | 0.567168 |
# 删除方式由参数how控制
# any 只要有一个为nan就删掉 当前行或列
df2 = df1.dropna(axis=0,how='any')
df2
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.912220 | 0.932765 | 0.827517 | 0.031858 | 0.749619 |
3 | 0.543484 | 0.555226 | 0.138279 | 0.979043 | 0.460136 |
4 | 0.870316 | 0.141909 | 0.567168 | 0.116696 | 0.204007 |
# 删除方式由参数how控制
# all 只有全部为nan才删掉 当前行或列
df2 = df1.dropna(axis=0,how='all')
df2
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.912220 | 0.932765 | 0.827517 | 0.031858 | 0.749619 |
1 | 0.957043 | 0.857664 | 0.616395 | NaN | 0.562609 |
2 | 0.686575 | 0.016802 | 0.030477 | 0.609545 | NaN |
3 | 0.543484 | 0.555226 | 0.138279 | 0.979043 | 0.460136 |
4 | 0.870316 | 0.141909 | 0.567168 | 0.116696 | 0.204007 |
# 为测试thresh参数新建数据
df2 = DataFrame(np.random.rand(25).reshape(5,5))
df2.ix[2,:] = np.nan
df2.ix[1,3] = np.nan
df2.ix[3,3] = np.nan
df2.ix[3,4] = np.nan
df2
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
This is separate from the ipykernel package so we can avoid doing imports until
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
after removing the cwd from sys.path.
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.371901 | 0.140453 | 0.576335 | 0.895684 | 0.233522 |
1 | 0.896337 | 0.719907 | 0.647172 | NaN | 0.698708 |
2 | NaN | NaN | NaN | NaN | NaN |
3 | 0.415230 | 0.601340 | 0.694270 | NaN | NaN |
4 | 0.926047 | 0.913255 | 0.586473 | 0.442759 | 0.238776 |
# thresh参数指定保留一行(或一列)所需的最少非nan值个数(这里thresh=2:当前行的非nan值少于2个则删除,因此只有全为nan的第2行被删除)
df3 = df2.dropna(thresh=2)
df3
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.371901 | 0.140453 | 0.576335 | 0.895684 | 0.233522 |
1 | 0.896337 | 0.719907 | 0.647172 | NaN | 0.698708 |
3 | 0.415230 | 0.601340 | 0.694270 | NaN | NaN |
4 | 0.926047 | 0.913255 | 0.586473 | 0.442759 | 0.238776 |
# nan填充值(传入标量时,所有nan统一填充为该值)
df2.fillna(value=1)
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.371901 | 0.140453 | 0.576335 | 0.895684 | 0.233522 |
1 | 0.896337 | 0.719907 | 0.647172 | 1.000000 | 0.698708 |
2 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
3 | 0.415230 | 0.601340 | 0.694270 | 1.000000 | 1.000000 |
4 | 0.926047 | 0.913255 | 0.586473 | 0.442759 | 0.238776 |
# 也可以传入字典,为每一列分别指定nan填充值(字典的键为列名,值为该列的填充值)
df2.fillna(value={0:0,1:1,2:2,3:3,4:4})
| | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 0.371901 | 0.140453 | 0.576335 | 0.895684 | 0.233522 |
1 | 0.896337 | 0.719907 | 0.647172 | 3.000000 | 0.698708 |
2 | 0.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 |
3 | 0.415230 | 0.601340 | 0.694270 | 3.000000 | 4.000000 |
4 | 0.926047 | 0.913255 | 0.586473 | 0.442759 | 0.238776 |
fillna 和 dropna 默认不会修改原始对象,而是返回一个新对象;需要把返回值赋给变量保存(或者传入 inplace=True 直接修改原对象)