统计系列(四)利用Python进行假设检验

2023-03-16 19:54:13 浏览数 (1)

统计系列(四)利用Python进行假设检验

z检验

  • 主要应用场景:大样本量下的总体比例检验
  • 核心:检验样本比例与给定值之间、或两个样本的总体比例之间是否存在差异

单样本比例检验

代码语言:javascript复制
# 检验样本合格率与0.38是否有差异
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Sample under test: 200 passing items out of 500 observations.
counts = 200
nobs = 500
# Pass rate assumed under the null hypothesis.
value = 0.38

# Evaluates to a (z statistic, p-value) pair. As a bare expression the result
# is only shown when the snippet is run interactively (REPL / notebook).
proportions_ztest(count=counts, nobs=nobs, value=value)
代码语言:javascript复制
(0.9128709291752777, 0.36131042852617834)

双样本比例差异检验

代码语言:javascript复制
# 两样本合格率是否有差异
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Passing items and sample size for each of the two groups.
count1, nobs1 = 200, 500
count2, nobs2 = 150, 500

# Parallel arrays: position i describes sample i.
counts = np.array([count1, count2])
nobs = np.array([nobs1, nobs2])

# Evaluates to a (z statistic, p-value) pair for the difference between the
# two pass rates; shown only when run interactively.
proportions_ztest(count=counts, nobs=nobs)
代码语言:javascript复制
(3.3149677206589807, 0.0009165370761145276)

t检验

  • 主要应用场景:在小样本量或总体方差未知的情况下,进行单样本或双样本检验
  • 核心:一个两分类自变量与一个连续型因变量。如检验性别(男、女)在薪资上的差异

单样本均值检验

代码语言:javascript复制
# 检验样本均值与500是否有差异
import pandas as pd
import numpy as np
from scipy import stats as ss

sale = [
    506, 503, 489, 501, 498, 497, 491, 502, 490, 511,
    510, 504, 512, 499, 487, 507, 503, 488, 521, 517,
]
df = pd.DataFrame({'sale': sale})

# Double-bracket selection keeps the data as an (n, 1) column, so the test
# statistic and p-value come back as one-element arrays.
x = df[['sale']].to_numpy()
u0 = 500  # hypothesised population mean
mu = x.mean()
print(mu, ss.ttest_1samp(a=x, popmean=u0))
代码语言:javascript复制
501.8 Ttest_1sampResult(statistic=array([0.83092969]), pvalue=array([0.41633356]))

两独立样本均值差异检验

代码语言:javascript复制
# 检验两样本均值是否显著差异(不要求样本量一致)
import pandas as pd
import numpy as np
from scipy import stats

a = [
    145, 147, 139, 138, 135, 133, 135, 138, 144, 143, 98,
    108, 109, 124, 134, 129, 133, 139, 141, 142, 143, 145,
]
b = [
    101, 98, 87, 106, 105, 108, 114, 112, 110, 103, 105, 101,
    98, 87, 106, 105, 108, 114, 112, 110, 103, 105, 101, 98,
]

x = np.array(a)
y = np.array(b)

# Levene's test checks homogeneity of variance (H0: both samples have the
# same variance). For this data p is about 0.15, which is not significant,
# so the hypothesis of equal variances cannot be rejected.
levene_result = stats.levene(x, y)
print(levene_result)

# Let the Levene outcome choose the t-test variant instead of silently
# assuming equal variances: the pooled-variance test is used when equal
# variances are not rejected at the 5% level, Welch's test
# (equal_var=False) otherwise.
alpha = 0.05
equal_var = bool(levene_result.pvalue > alpha)
t, p = stats.ttest_ind(x, y, equal_var=equal_var)
print(t, p)
代码语言:javascript复制
LeveneResult(statistic=2.1445010235453714, pvalue=0.1501903760252867)
9.678388988357316 1.8131877062436242e-12

配对样本均值差检验

代码语言:javascript复制
# 配对样本校验政策前后是否有显著差异
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel

# Measurements before ("qian") and after ("hou") the policy, paired by position.
qian = [
    88.60, 85.20, 75.20, 78.40, 76.00, 75.30, 89.90, 82.70, 82.40, 75.60,
    88.60, 85.20, 75.20, 78.40, 76.00, 75.30, 89.90, 82.70, 82.40, 75.60,
    88.60, 85.20, 75.20, 78.40, 76.00, 75.30, 89.90, 82.70, 82.40, 75.60,
]
hou = [
    75.60, 76.50, 68.20, 67.20, 69.90, 74.30, 71.30, 78.10, 75.30, 69.90,
    75.60, 76.50, 68.20, 67.20, 69.90, 74.30, 71.30, 78.10, 75.30, 69.90,
    75.60, 76.50, 68.20, 67.20, 69.90, 74.30, 71.30, 78.10, 75.30, 69.90,
]

dic = {'qian': qian, 'hou': hou}
df = pd.DataFrame(dic)

# Double-bracket selection yields (n, 1) columns, so t and p are returned as
# one-element arrays.
x = df[['qian']].to_numpy()
y = df[['hou']].to_numpy()

t, p = ttest_rel(x, y)
print(t, p)
代码语言:javascript复制
[9.5629333] [1.80117663e-10]

卡方检验

主要应用场景:通过分析不同类别数据的相对选择频数和占比情况进行差异判断。

核心:一个多分类自变量与另一个多分类因变量。如检验学历(低、中、高)在收入等级(低、中、高)上的差异

拟合优度检验

代码语言:javascript复制
# 拟合优度检验 观察实验数据与期望数据是否有差异
import pandas as pd
import numpy as np
from scipy import stats
from random import randint
from collections import Counter

# 模拟120次掷骰子
def getData(N, times, sides=6):
    """Simulate dice throws and return every face value that came up.

    Args:
        N: Number of dice thrown together in one round.
        times: Number of rounds thrown.
        sides: Number of faces on each die (default 6).

    Returns:
        A list of ``N * times`` integers, each drawn with
        ``randint(1, sides)``. The list is empty when ``N`` or ``times``
        is not positive.
    """
    # One draw per (die, round) pair. The loop bound previously read
    # ``range(1,N 1)``: the ``+`` was missing, which is a syntax error.
    return [
        randint(1, sides)
        for _die in range(N)
        for _round in range(times)
    ]

# Simulate 120 throws of a single six-sided die.
n_dice, n_rounds, n_sides = 1, 120, 6
roll_result = getData(n_dice, n_rounds, n_sides)

# Tally every face, including any face that never came up (a Counter yields
# 0 for a missing key), so observed and expected totals always agree.
tally = Counter(roll_result)
faces = list(range(1, n_sides + 1))
df = pd.DataFrame({'point': faces, 'cnt': [tally[face] for face in faces]})

# Under a fair die every face is expected equally often. Derive that from the
# number of throws instead of hard-coding 20 (= 120 / 6).
df['exp'] = len(roll_result) / n_sides

# Chi-square goodness-of-fit test of observed against expected counts.
print(stats.chisquare(df['cnt'], df['exp']))
代码语言:javascript复制
Power_divergenceResult(statistic=3.3999999999999995, pvalue=0.6385699231037951)

独立性检验

代码语言:javascript复制
# 独立性检验 构造收入等级与学历列联表
import pandas as pd
import numpy as np
from scipy import stats

# Education-by-income counts: the outer keys (income level) become the
# columns, the inner keys (education level) become the row index.
ep_dic = {
    '高收入': {'高中及以下': 25, '大学': 21, '研究生及以上': 10},
    '中等收入': {'高中及以下': 82, '大学': 88, '研究生及以上': 30},
    '低收入': {'高中及以下': 223, '大学': 16, '研究生及以上': 5},
}
df = pd.DataFrame(ep_dic)

# The result unpacks into the four values the report template expects:
# statistic, p-value, degrees of freedom and the expected-frequency table.
chi2_stat, p_value, dof, expected = stats.chi2_contingency(df)
report = '卡方值=%.4f, p值=%.4f, 自由度=%i 预期频率=%s'
print(report % (chi2_stat, p_value, dof, expected))
代码语言:javascript复制
卡方值=138.2050, p值=0.0000, 自由度=4 预期频率=[[ 36.96 132.   161.04]
 [ 14.    50.    61.  ]
 [  5.04  18.    21.96]]

方差分析

主要应用场景:分析多个分类变量对连续型因变量的影响。

核心:一个或多个多分类自变量与一个连续型因变量

单因素方差分析

代码语言:javascript复制
# 单因素方差分析 学历对收入的影响
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Income observations for three education groups (A, B, C), six per group.
ep_dic = {
    '学历': ['A'] * 6 + ['B'] * 6 + ['C'] * 6,
    '收入': [
        9, 14, 11, 8, 7, 5,
        13, 9, 11, 14, 13, 14,
        15, 17, 15, 18, 11, 25,
    ],
}
df = pd.DataFrame(ep_dic)

# Income explained by education, with education marked as categorical via C().
formula = '收入~C(学历)'
model = ols(formula=formula, data=df).fit()

# Evaluates to the ANOVA table; shown only when run interactively.
anova_lm(model)

            df      sum_sq    mean_sq         F    PR(>F)
C(学历)     2.0  185.444444  92.722222  7.806361  0.004747
Residual   15.0  178.166667  11.877778       NaN       NaN

多因素方差分析

代码语言:javascript复制
# 多因素方差分析 学历性别对收入的影响
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Income observations together with each person's education group and gender.
ep_dic = {
    '学历': ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C'],
    '性别': ['f', 'f', 'm', 'm', 'm', 'm', 'm', 'm', 'f', 'm', 'f', 'm', 'm', 'f', 'f', 'm', 'm', 'm'],
    '收入': [9, 14, 11, 8, 7, 5, 13, 9, 11, 14, 13, 14, 15, 17, 15, 18, 11, 25],
}
df = pd.DataFrame(ep_dic)

# Additive two-factor model: income explained by education plus gender.
# The `+` joining the two terms was missing ('收入~ 学历   性别'), which is
# not a valid formula.
formula = '收入~ 学历 + 性别'
model = ols(formula=formula, data=df).fit()

# NOTE(review): anova_lm defaults to Type I (sequential) sums of squares, so
# with an unbalanced design the order of the terms affects the table -
# confirm that is the intended analysis.
# Evaluates to the ANOVA table; shown only when run interactively.
anova_lm(model)

            df      sum_sq    mean_sq         F    PR(>F)
学历        2.0  185.444444  92.722222  7.359370  0.006542
性别        1.0    1.777778   1.777778  0.141102  0.712818
Residual   14.0  176.388889  12.599206       NaN       NaN

样本方差检验

主要应用场景:检验样本的方差波动

单样本方差检验

代码语言:javascript复制
# 检验样本方差是否等于1%(95%置信度)
import pandas as pd
import numpy as np

# The chi-square distribution is needed for the p-value; this snippet's
# import lines only bring in pandas and numpy.
from scipy import stats

x = np.array([
    0.564409196, 0.264802098, 0.947742641, 0.276915401, 0.118015848,
    0.40797025, -0.72194916, 0.871691048, 0.461142898, 0.421672612,
])
n = len(x)

# The statistic (n - 1) * s^2 / sigma0^2 is defined with the unbiased sample
# variance, so ddof=1 is required. np.var defaults to ddof=0, which shrinks
# the statistic by a factor (n - 1) / n (about 171.95 instead of 191.05 here).
s2 = np.var(x, ddof=1)
s0 = 0.01  # hypothesised variance (H0: variance == 0.01)

# Chi-square statistic with n - 1 degrees of freedom.
chi2 = (n - 1) * s2 / s0
dof = n - 1

# Two-sided p-value: twice the smaller tail probability.
p_value = 2 * min(stats.chi2.cdf(chi2, dof), stats.chi2.sf(chi2, dof))
alpha = 0.05  # 95% confidence level

print(chi2)
print(p_value, p_value < alpha)  # True means H0 is rejected
代码语言:javascript复制
171.94566597969543

双样本方差检验

代码语言:javascript复制
# 检验两样本的波动是否一致
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

dic = {
    'x': [
        0.424155712, 0.898346186, 0.521924734, 0.841409445, 0.211007655,
        0.564409196, 0.264802098, 0.061063781, 0.555238843, 0.871449053,
        0.255149871, 0.044518802, 0.424155712, 0.898346186, 0.521924734,
        0.841409445, 0.211007655, 0.564409196, 0.264802098, 0.947742641,
    ],
    'y': [
        0.261074632, 0.165020704, 0.760604024, 0.371380478, 0.379540861,
        0.967873454, 0.582328379, 0.102436937, 0.577388406, 0.109178342,
        0.974608779, 0.216238976, 0.261074632, 0.165020704, 0.760604024,
        0.371380478, 0.379540861, 0.967873454, 0.582328379, 0.795299947,
    ],
}
df = pd.DataFrame(dic)

x = df['x'].to_numpy()
y = df['y'].to_numpy()

# Regressing x on y and running anova_lm tests whether y linearly predicts x,
# not whether the two samples have the same variance, so the variances are
# compared directly here.

# Classical two-sided F test on the ratio of unbiased sample variances
# (H0: the two variances are equal).
var_x = np.var(x, ddof=1)
var_y = np.var(y, ddof=1)
f_stat = var_x / var_y
dfn = len(x) - 1
dfd = len(y) - 1
f_pvalue = 2 * min(stats.f.cdf(f_stat, dfn, dfd), stats.f.sf(f_stat, dfn, dfd))

# Levene's test as a second check of the same hypothesis.
levene_result = stats.levene(x, y)

results = pd.DataFrame(
    {
        'statistic': [f_stat, levene_result.statistic],
        'pvalue': [f_pvalue, levene_result.pvalue],
    },
    index=['F-test', 'Levene'],
)
# A large p-value means equal variances cannot be rejected; it does not prove
# that the variances are equal.
print(results)
代码语言:javascript复制
            df    sum_sq   mean_sq         F   PR(>F)
y          1.0  0.000709  0.000709  0.007744  0.93085
Residual  18.0  1.648029  0.091557       NaN      NaN

总结

共勉~

0 人点赞