【数据分析可视化】谈一谈NaN

2020-07-07 20:01:04 浏览数 (1)

NaN 即 Not a Number(非数字)

代码语言:javascript复制
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
代码语言:javascript复制
# 创建NaN
n = np.nan
代码语言:javascript复制
# 类型
type(n)
代码语言:javascript复制
float
代码语言:javascript复制
# 任何数字和nan做计算永远是nan
m = 1
m + n
代码语言:javascript复制
nan

NaN in Series

代码语言:javascript复制
# 创建含nan情况
s1 = Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])
s1
代码语言:javascript复制
A    1.0
B    2.0
C    NaN
D    3.0
E    4.0
dtype: float64
代码语言:javascript复制
# 判断是否nan
s1.isnull()
代码语言:javascript复制
A    False
B    False
C     True
D    False
E    False
dtype: bool
代码语言:javascript复制
s1.notnull()
代码语言:javascript复制
A     True
B     True
C    False
D     True
E     True
dtype: bool
代码语言:javascript复制
# 删除掉nan
s1.dropna()
代码语言:javascript复制
A    1.0
B    2.0
D    3.0
E    4.0
dtype: float64

NaN in DataFrame

代码语言:javascript复制
# 创建含有nan情况
df1 = DataFrame(np.random.rand(25).reshape(5,5))
df1.ix[2,4] = np.nan
df1.ix[1,3] = np.nan
df1
代码语言:javascript复制
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.

0

1

2

3

4

0

0.912220

0.932765

0.827517

0.031858

0.749619

1

0.957043

0.857664

0.616395

NaN

0.562609

2

0.686575

0.016802

0.030477

0.609545

NaN

3

0.543484

0.555226

0.138279

0.979043

0.460136

4

0.870316

0.141909

0.567168

0.116696

0.204007

代码语言:javascript复制
# 判断nan
df1.isnull()

0

1

2

3

4

0

False

False

False

False

False

1

False

False

False

True

False

2

False

False

False

False

True

3

False

False

False

False

False

4

False

False

False

False

False

代码语言:javascript复制
df1.notnull()

0

1

2

3

4

0

True

True

True

True

True

1

True

True

True

False

True

2

True

True

True

True

False

3

True

True

True

True

True

4

True

True

True

True

True

代码语言:javascript复制
# 删除的使用(df二维的,因此略有不同)
# axis=0所有带nan的行全部删除
df2 = df1.dropna(axis=0)
df2

0

1

2

3

4

0

0.912220

0.932765

0.827517

0.031858

0.749619

3

0.543484

0.555226

0.138279

0.979043

0.460136

4

0.870316

0.141909

0.567168

0.116696

0.204007

代码语言:javascript复制
# axis=1所有带nan的列全部删除
df2 = df1.dropna(axis=1)
df2

0

1

2

0

0.912220

0.932765

0.827517

1

0.957043

0.857664

0.616395

2

0.686575

0.016802

0.030477

3

0.543484

0.555226

0.138279

4

0.870316

0.141909

0.567168

代码语言:javascript复制
# 如何删除:参数how
# any 只要有一个为nan,就删掉当前行或列
df2 = df1.dropna(axis=0,how='any')
df2

0

1

2

3

4

0

0.912220

0.932765

0.827517

0.031858

0.749619

3

0.543484

0.555226

0.138279

0.979043

0.460136

4

0.870316

0.141909

0.567168

0.116696

0.204007

代码语言:javascript复制
# 如何删除:参数how
# all 只有全部为nan,才删掉当前行或列
df2 = df1.dropna(axis=0,how='all')
df2

0

1

2

3

4

0

0.912220

0.932765

0.827517

0.031858

0.749619

1

0.957043

0.857664

0.616395

NaN

0.562609

2

0.686575

0.016802

0.030477

0.609545

NaN

3

0.543484

0.555226

0.138279

0.979043

0.460136

4

0.870316

0.141909

0.567168

0.116696

0.204007

代码语言:javascript复制
# 为测试thresh参数新建数据
df2 = DataFrame(np.random.rand(25).reshape(5,5))
df2.ix[2,:] = np.nan
df2.ix[1,3] = np.nan
df2.ix[3,3] = np.nan
df2.ix[3,4] = np.nan
df2
代码语言:javascript复制
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:5: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
/Users/bennyrhys/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated

0

1

2

3

4

0

0.371901

0.140453

0.576335

0.895684

0.233522

1

0.896337

0.719907

0.647172

NaN

0.698708

2

NaN

NaN

NaN

NaN

NaN

3

0.415230

0.601340

0.694270

NaN

NaN

4

0.926047

0.913255

0.586473

0.442759

0.238776

代码语言:javascript复制
# thresh参数是保留界限:当前行(或列)中非nan值的个数 >= thresh 才保留,否则删除(这里thresh=2,即至少要有2个非nan值)
df3 = df2.dropna(thresh=2)
df3

0

1

2

3

4

0

0.371901

0.140453

0.576335

0.895684

0.233522

1

0.896337

0.719907

0.647172

NaN

0.698708

3

0.415230

0.601340

0.694270

NaN

NaN

4

0.926047

0.913255

0.586473

0.442759

0.238776

代码语言:javascript复制
# 用指定的值填充nan(这里把所有nan都填充为1)
df2.fillna(value=1)

0

1

2

3

4

0

0.371901

0.140453

0.576335

0.895684

0.233522

1

0.896337

0.719907

0.647172

1.000000

0.698708

2

1.000000

1.000000

1.000000

1.000000

1.000000

3

0.415230

0.601340

0.694270

1.000000

1.000000

4

0.926047

0.913255

0.586473

0.442759

0.238776

代码语言:javascript复制
# 可以用字典为每一列分别指定nan的填充值(键为列名,值为该列的填充值)
df2.fillna(value={0:0,1:1,2:2,3:3,4:4})

0

1

2

3

4

0

0.371901

0.140453

0.576335

0.895684

0.233522

1

0.896337

0.719907

0.647172

3.000000

0.698708

2

0.000000

1.000000

2.000000

3.000000

4.000000

3

0.415230

0.601340

0.694270

3.000000

4.000000

4

0.926047

0.913255

0.586473

0.442759

0.238776

fillna 和 dropna 默认不会修改原始对象,而是返回一个新对象,需要把返回值保存下来(或者传入 inplace=True 直接修改原对象)

nan

0 人点赞