import numpy as np
import pandas as pd
# Dataset used for trying out the code: dir(datasets) lists what
# sklearn.datasets offers. (This note was a bare line of text before, which
# made the whole file fail to parse.)
#
# Bare expressions below (e.g. `iris_raw`, `iris`) only display a value when
# run interactively / in a notebook cell; they have no effect in a script.

# Load the data (iris)
from sklearn import datasets
dir(datasets)
iris_raw = datasets.load_iris()
iris_raw
iris_raw.target  # target holds the class labels; pull it out to inspect
iris_raw.data
# Convert the array to a DataFrame (columns get default integer names)
pd.DataFrame(iris_raw.data)
# Same conversion, this time naming the columns after the dataset's features
iris = pd.DataFrame(iris_raw.data, columns=iris_raw.feature_names)
iris
# Alternative with hand-picked column names:
# pd.DataFrame(iris_raw.data,columns=['a','b','c','d'])
# Add a column with the class label of each row
iris['Species'] = iris_raw.target  # target is originally an array
iris
# One-sample z-test on the petal length column.
import statsmodels.stats.weightstats as sw

# Sample mean, for comparison with the hypothesised value below
iris.loc[:, 'petal length (cm)'].mean()
# H0: the mean iris petal length is 4.2
sw.ztest(x1=iris.loc[:, 'petal length (cm)'], value=4.2)
# Result noted by the author:
# (-3.066548320028344, 0.0021654580512200875)
# i.e. the test statistic and the p-value, respectively
# One-sample t-test on the petal length column.
# H0: the mean petal length is 4.0
import scipy.stats as ss

ss.ttest_1samp(iris.loc[:, 'petal length (cm)'], popmean=4.0)
# Run the test again, unpacking the result into the test statistic and
# the p-value so the numbers can be used directly
stats_val, p_val = ss.ttest_1samp(
    iris.loc[:, 'petal length (cm)'],
    popmean=4.0,
)
print(stats_val, p_val)
# Two-sample t-test: do setosa and versicolor differ in petal length?
import scipy.stats as ss

# Distinct class labels present in the Species column
iris['Species'].unique()
iris['petal length (cm)']
# Compare the petal lengths of the rows labelled 0 with those labelled 1
# (the author's note names these two groups as setosa and versicolor)
ss.ttest_ind(
    iris.loc[iris['Species'] == 0, 'petal length (cm)'],
    iris.loc[iris['Species'] == 1, 'petal length (cm)'],
)
# Author's conclusion: the p-value is below 0.025, so H0 is rejected.
# NOTE(review): ttest_ind reports a two-sided p-value by default, so it is
# normally compared with the significance level itself (e.g. 0.05) rather
# than with half of it - confirm the intended threshold.