本文主要记录了使用 Python 对数据进行相关性统计检验的实现方法。
主要是:
1、连续变量VS连续变量:皮尔逊相关系数
2、分类变量VS分类变量:卡方检验
3、连续变量VS多分类变量:单因素方差分析(F检验)
4、连续变量VS二分类变量:若两组连续变量均满足正态分布,则用t检验(先用Levene检验判断方差齐性,方差不齐时使用Welch校正的t检验),否则应用曼-惠特尼U检验
主要的python脚本如下:
import numpy as np
from scipy import stats
from scipy.stats import levene, pearsonr, ttest_ind


def _distinct_levels(labels):
    """Return the distinct non-NaN labels in order of first appearance."""
    # ``level == level`` is False only for NaN, so missing labels are dropped
    # before they can become a spurious category.
    return list(dict.fromkeys(level for level in labels if level == level))


def corr_continue_continue(series_1, series_2):
    """Test the linear association between two continuous variables.

    Runs ``scipy.stats.pearsonr`` on the two samples.

    Args:
        series_1: First sample of continuous values.
        series_2: Second sample, same length as ``series_1``.

    Returns:
        The p-value of the Pearson correlation test.

    NOTE(review): despite the function name, the value returned is the
    p-value (index 1 of the ``pearsonr`` result), not the coefficient r.
    This is kept unchanged for backward compatibility -- confirm with callers
    which of the two they actually expect.
    """
    p_value = pearsonr(series_1, series_2)[1]
    return p_value


def corr_cate_cate(series_1, series_2):
    """Score the association between two categorical variables with chi-square.

    ``series_1`` is reshaped into a single feature column and scored against
    ``series_2`` with scikit-learn's ``chi2`` through ``SelectKBest``.

    Args:
        series_1: Feature values, one per observation.
        series_2: Class labels, same length as ``series_1``.

    Returns:
        A ``(chi2_statistic, p_value)`` tuple for the single feature.

    NOTE(review): scikit-learn's ``chi2`` is documented for non-negative
    count/frequency features, so the statistic here depends on how the
    categories of ``series_1`` are numerically encoded. If a classical
    contingency-table test of independence is wanted, consider
    ``scipy.stats.chi2_contingency`` instead -- behaviour left unchanged here.
    """
    # Imported lazily so the scipy-only helpers in this script remain usable
    # when scikit-learn is not installed.
    from sklearn.feature_selection import SelectKBest, chi2

    features = np.array(series_1).reshape(-1, 1)
    selector = SelectKBest(chi2, k=1)
    # Only the fitted scores are needed; the transformed matrix is not used.
    selector.fit(features, series_2)
    return selector.scores_[0], selector.pvalues_[0]


def corr_continue_multicate(fenlei_series_1, lianxu_series_2):
    """Run a one-way ANOVA of a continuous variable across categories.

    The continuous values are split into one sample per distinct category
    label and those samples are compared with ``scipy.stats.f_oneway``.
    (Previously the label column itself was passed to ``f_oneway`` as if it
    were a sample, which does not test the association between the two.)

    Args:
        fenlei_series_1: Category label of each observation. NaN labels are
            ignored together with their observations.
        lianxu_series_2: Continuous value of each observation, same length as
            ``fenlei_series_1``.

    Returns:
        A ``(f_statistic, p_value)`` tuple.

    Raises:
        ValueError: If the two inputs differ in shape or fewer than two
            distinct categories are present.
    """
    labels = np.asarray(fenlei_series_1)
    values = np.asarray(lianxu_series_2, dtype=float)
    if labels.shape != values.shape:
        raise ValueError(
            "fenlei_series_1 and lianxu_series_2 must have the same shape"
        )

    groups = [values[labels == level] for level in _distinct_levels(labels)]
    if len(groups) < 2:
        raise ValueError("at least two distinct categories are required")

    f, p = stats.f_oneway(*groups)
    return f, p


def normal_test(series):
    """Kolmogorov-Smirnov test of a sample against a fitted normal distribution.

    Args:
        series: Sample exposing ``mean()`` and ``std()`` (for example a pandas
            Series or a NumPy array).

    Returns:
        A ``(statistic, pvalue)`` tuple from ``scipy.stats.kstest``.

    Note:
        The normal distribution's location and scale are estimated from the
        sample itself, using whatever default ``std()`` the container
        provides, so the p-value should be read as approximate.
    """
    mean = series.mean()
    std = series.std()
    statistic, pvalue = stats.kstest(series, 'norm', (mean, std))
    return statistic, pvalue


def corr_continue_twocate(data, fenlei_var, lianxu_var, *, alpha=0.05):
    """Compare a continuous variable between the two levels of a binary variable.

    Each group is first checked for normality with ``normal_test``. If either
    group is rejected at ``alpha`` a two-sided Mann-Whitney U test is used.
    Otherwise Levene's test decides between Student's t-test (equal
    variances) and Welch's t-test (unequal variances).

    Args:
        data: Mapping-like table (for example a pandas DataFrame) indexable by
            column name, whose columns support boolean-mask selection.
        fenlei_var: Name of the binary categorical column.
        lianxu_var: Name of the continuous column.
        alpha: Significance level for the normality and Levene checks.

    Returns:
        A ``(statistic, p_value, normality_p_1, normality_p_2)`` tuple, where
        group 1 is the category level that appears first in the data and
        group 2 the other one.

    Raises:
        ValueError: If the categorical column does not contain exactly two
            distinct non-NaN levels.
    """
    # First-appearance order keeps the group assignment -- and therefore the
    # sign of the statistic -- reproducible, unlike iterating over a set.
    levels = _distinct_levels(data[fenlei_var])
    if len(levels) != 2:
        raise ValueError(
            "expected exactly two categories in %r, found %d"
            % (fenlei_var, len(levels))
        )

    series_1 = data[lianxu_var][data[fenlei_var] == levels[0]]
    series_2 = data[lianxu_var][data[fenlei_var] == levels[1]]

    _, pvalue_1 = normal_test(series_1)
    _, pvalue_2 = normal_test(series_2)

    if pvalue_1 < alpha or pvalue_2 < alpha:
        # At least one group departs from normality: use the rank-based test.
        t_value, p_value = stats.mannwhitneyu(
            series_1, series_2, alternative='two-sided'
        )
    else:
        _, levene_p_value = levene(series_1, series_2)
        # Variances are treated as equal only when Levene's test does not
        # reject; otherwise fall back to Welch's correction.
        equal_var = bool(levene_p_value > alpha)
        t_value, p_value = ttest_ind(series_1, series_2, equal_var=equal_var)

    return t_value, p_value, pvalue_1, pvalue_2