700字范文 > 数据分析: 线性回归分析之研究二手房价的影响因素建立房价预测模型

数据分析: 线性回归分析之研究二手房价的影响因素，建立房价预测模型

时间：2018-09-11 14:36:45

# -*- coding: utf-8 -*-
"""Study what drives second-hand housing prices and build a price model.

The data is read from ``sndHsPr.csv``.

Approach: before modelling, every variable is described on its own to get a
first impression of what influences the price; the regression models are
built afterwards.

Variables:
    dist     district
    roomnum  number of bedrooms
    halls    number of living rooms
    AREA     floor area
    floor    floor level
    subway   whether the home is close to a subway
    school   whether the home is in a school district
    price    price per square metre

Steps:
    1. Dependent variable: price per unit area.
    2. Independent variables:
       2.1 distribution of each variable;
       2.2 effect of each variable on the dependent variable.
    3. Price models:
       3.1 linear regression;
       3.2 linear model on the log of the dependent variable;
       3.3 log-linear model with an interaction term.
    4. Prediction: a family of three wants a two-bedroom home of 70 square
       metres, close to a subway, on a middle floor, in Dongcheng district
       so that the child can go to school there -- roughly what will the
       price be?
"""
import math
import os
import random

import numpy as np
import pandas as pd
from scipy import stats  # only needed by the disabled t-test section in main()

# Location of the input data.
DATA_DIR = r'F:\python_data_analysis\data_file'
DATA_FILE = 'sndHsPr.csv'

# Raw district codes in the CSV -> display names.
DIST_NAMES = {
    'chaoyang': '朝阳区',
    'haidian': '海淀区',
    'fengtai': '丰台区',
    'xicheng': '西城区',
    'dongcheng': '东城区',
    'shijingshan': '石景山区',
}

# Display order of the districts in plots and tables.
DIST_ORDER = ['石景山区', '丰台区', '朝阳区', '海淀区', '东城区', '西城区']

# The six categorical variables (AREA and price are continuous).
CATEGORICAL_COLUMNS = ['dist', 'roomnum', 'halls', 'floor', 'subway', 'school']

_SAMPLING_METHODS = ('simple_random', 'systematic', 'stratified')


def _stratified_sample(df, k, sampling_by_n, stratified_col):
    """Sample every stratum of *df* and return the stacked result.

    *k* is a per-stratum row count when *sampling_by_n* is true, otherwise
    the fraction of each stratum to keep (rounded up).
    """
    if stratified_col is None:
        raise AssertionError('请确保输入的分层列名不为空')
    if isinstance(stratified_col, str):
        # Accept a bare column name as well as a list of names.
        stratified_col = [stratified_col]
    if not all(col in df.columns for col in stratified_col):
        raise AssertionError('请检查输入的包含分层列名的列表')

    # Group by a scalar when there is a single key so that pandas does not
    # have to build one-element tuple keys.
    by = stratified_col[0] if len(stratified_col) == 1 else list(stratified_col)
    strata = [group for _, group in df.groupby(by=by)]
    if sampling_by_n and k * len(strata) >= len(df):
        raise AssertionError('请确保抽样数乘分层数不得超过总样本量')

    print('进行分层抽样')
    samples = []
    for group in strata:
        n_rows = k if sampling_by_n else math.ceil(len(group) * k)
        idx = random.sample(range(len(group)), int(n_rows))
        samples.append(group.iloc[idx, :])
    if not samples:
        return df.iloc[0:0].copy()
    # DataFrame.append no longer exists in pandas 2.x; concat once instead.
    return pd.concat(samples)


def get_sample(df, sampling, k, stratified_col=None):
    """Draw a sample from *df* with one of three sampling schemes.

    Args:
        df: DataFrame to sample from.
        sampling: 'simple_random', 'systematic' or 'stratified'.
        k: Sample size. An integer >= 1 is a number of rows (per stratum
            for stratified sampling); a value strictly between 0 and 1 is
            a fraction of the rows (of each stratum for stratified
            sampling), rounded up.
        stratified_col: Column name, or list of column names, defining the
            strata. Required for stratified sampling, ignored otherwise.

    Returns:
        A new DataFrame with the sampled rows, keeping their original index
        labels. Systematic sampling takes every ``len(df) // n + 1``-th row
        starting at the first one, so it can return fewer than ``n`` rows.

    Raises:
        AssertionError: if *k* is not positive, if *k* >= 1 is not an
            integer, if *sampling* is unknown, if the strata columns are
            missing, or if ``k * number_of_strata`` is not smaller than
            ``len(df)`` for count-based stratified sampling.
    """
    # Explicit raises instead of assert statements: asserts vanish under -O.
    if k <= 0:
        raise AssertionError('请确保输入的抽样数k大于零')
    sampling_by_n = k >= 1
    if sampling_by_n and not isinstance(k, int):
        raise AssertionError('请确保输入的抽样数k为整数')
    if sampling not in _SAMPLING_METHODS:
        raise AssertionError('sampling is illegal')

    if sampling == 'stratified':
        return _stratified_sample(df, k, sampling_by_n, stratified_col)

    len_df = len(df)
    n_rows = k if sampling_by_n else math.ceil(k * len_df)
    if sampling == 'simple_random':
        print('进行简单随机抽样')
        idx = random.sample(range(len_df), n_rows)
    else:
        print('进行系统抽样')
        # n_rows is 0 only for an empty frame; avoid dividing by zero then.
        step = len_df // n_rows + 1 if n_rows else 1
        idx = list(range(0, len_df, step))
    return df.iloc[idx, :].copy()


def main():
    """Run the whole analysis: describe, plot, model and predict."""
    # The plotting / modelling stack is imported here so that importing this
    # module (for example to reuse get_sample) does not require it.
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    # Make matplotlib render Chinese text and the minus sign correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # ---- Load and clean the data -------------------------------------
    os.chdir(DATA_DIR)
    data = pd.read_csv(DATA_FILE)
    print('数据预览: \n{}'.format(data.head()))
    data['dist'] = data['dist'].map(DIST_NAMES)
    data['price'] = data['price'] / 10000
    print(data.head())

    def mean_price(dist, school):
        """Mean price of the homes in *dist* with the given school flag."""
        mask = (data['dist'] == dist) & (data['school'] == school)
        return data.loc[mask, 'price'].mean()

    # ---- Descriptive statistics --------------------------------------
    # Frequency tables of the categorical variables, addressed by name
    # rather than by column position.
    for col in CATEGORICAL_COLUMNS:
        print(col, ':')
        print(data[col].agg(['value_counts']).T)
        print('===================================================')

    # The two continuous variables, AREA and price.
    print('AREA :')
    print(pd.DataFrame(data.AREA.agg(['mean', 'min', 'max', 'median', 'std'])).T)
    print('===================================================')
    print('连续型因变量price的描述性统计分析 :')
    print(data[['price']].describe().T)

    # Share of each district (pie chart).
    data['dist'].value_counts().plot(kind='pie', autopct='%6.3f%%')
    plt.show()

    # Distribution of the price (histogram).
    plt.hist(data['price'], bins=20)
    plt.show()

    # Mean price per district (horizontal bars).
    data.groupby('dist')['price'].mean().sort_values(ascending=True).plot(kind='barh')
    plt.xlabel('平均房价')
    plt.ylabel('地区', rotation=0)
    plt.show()

    # Price distribution per district (box plot). Work on a copy so the
    # categorical conversion does not touch `data`.
    dist_price_df = data[['dist', 'price']].copy()
    dist_price_df['dist'] = pd.Categorical(dist_price_df['dist'], categories=DIST_ORDER)
    sns.boxplot(x='dist', y='price', data=dist_price_df)
    plt.show()

    # Effect of subway proximity on the price.
    data.groupby('subway')['price'].mean().plot(kind='bar')
    plt.show()

    # Effect of the school district on the price.
    data.groupby('school')['price'].mean().plot(kind='bar')
    plt.show()

    # Cross-tabulation of subway and school.
    sub_sch = pd.crosstab(data.subway, data.school)
    print('有无地铁,是否学区房的交叉分析:')
    print(sub_sch)
    sub_sch.div(sub_sch.sum(1), axis=0).plot(kind='bar', stacked=True)
    plt.show()

    # Effect of the number of bedrooms, of living rooms, and of the floor.
    for col in ('roomnum', 'halls', 'floor'):
        data.groupby(col)['price'].mean().plot(kind='bar')
        plt.show()
        sns.boxplot(x=col, y='price', data=data)
        plt.show()

    # ---- Stratified sample: 400 rows per district --------------------
    data_sampled = get_sample(data, sampling='stratified', k=400, stratified_col=['dist'])

    # Two-sample t-tests for the binary variables subway and school.
    # This section was disabled in the original script and is kept for
    # reference only.
    # sub_1 = data[data['subway'] == 1]['price']
    # sub_0 = data[data['subway'] == 0]['price']
    # sch_1 = data[data['school'] == 1]['price']
    # sch_0 = data[data['school'] == 0]['price']
    # # Test for equal variances first.
    # w_statistic, p_value = stats.levene(sub_1, sub_0, center='median')
    # print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
    # t_statistic, p_value = stats.ttest_ind(sub_1, sub_0, equal_var=True)
    # print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))
    # w_statistic, p_value = stats.levene(sch_1, sch_0, center='median')
    # print('w_statistic: {}, p_value: {}'.format(w_statistic, p_value))
    # t_statistic, p_value = stats.ttest_ind(sch_1, sch_0, equal_var=True)
    # print('t_statistic: {}, p_value: {}'.format(t_statistic, p_value))

    # ---- One-way ANOVA for the six categorical variables -------------
    for col in CATEGORICAL_COLUMNS:
        anova_model = ols('price ~ C({})'.format(col), data=data_sampled).fit()
        # Row 0 is the factor, column 4 of the ANOVA table is the p-value.
        p_value = sm.stats.anova_lm(anova_model).values[0, 4]
        print('price ~ {}方差分析的P值: {}'.format(col, p_value))

    # According to the ANOVA, roomnum and halls have no significant effect
    # on price, so halls is collapsed into a binary variable.
    data_sampled['style_new'] = data_sampled['halls'].map(
        lambda x: '无厅' if x == 0 else '有厅')
    print(data_sampled.head())

    # Dummy variables for dist and floor. dtype=int keeps them numeric on
    # every pandas version (pandas 2.x would otherwise produce booleans,
    # which the formula interface treats as categorical).
    data_dummy = pd.get_dummies(data_sampled[['dist', 'floor']], dtype=int)
    print(data_dummy.head())
    # Drop the two reference levels.
    data_dummy = data_dummy.drop(columns=['dist_石景山区', 'floor_high'])

    # Combine the dummies with the remaining sampled columns.
    data_concated = pd.concat(
        [data_dummy,
         data_sampled[['AREA', 'subway', 'roomnum', 'school', 'style_new', 'price']]],
        axis=1)
    print(data_concated.head())
    print(data_concated.columns)

    def report_r2(formula, model):
        """Print the R squared of *model* together with its formula."""
        print('{} 线性回归模型的R2值: {}'.format(formula, model.rsquared))

    def plot_residuals(predicted_col, resid_col):
        """Scatter the residuals against the fitted values."""
        data_concated.plot(x=predicted_col, y=resid_col, kind='scatter')
        plt.show()

    dist_terms = 'dist_东城区 + dist_丰台区 + dist_朝阳区 + dist_海淀区 + dist_西城区'
    other_terms = 'floor_middle + floor_low + style_new + subway'

    # ---- 3.1 Linear regression ---------------------------------------
    formula_0 = 'price ~ C(dist) + C(style_new) + C(floor) + subway + school + AREA'
    lm_0 = ols(formula_0, data=data_sampled).fit()
    print('不对分类型自变量进行哑变量处理:')
    print(lm_0.summary())
    report_r2(formula_0, lm_0)
    print('================================================')

    formula_1 = 'price ~ {} + {} + school + AREA'.format(dist_terms, other_terms)
    lm_1 = ols(formula_1, data=data_concated).fit()
    print('对分类型自变量进行哑变量处理:')
    print(lm_1.summary())
    report_r2(formula_1, lm_1)
    print('================================================')
    data_concated['predict_1'] = lm_1.predict(data_concated)
    data_concated['resid_1'] = lm_1.resid
    # The diagnostic scatter plot shows heteroscedasticity.
    plot_residuals('predict_1', 'resid_1')

    # ---- 3.2 Log-log model -------------------------------------------
    data_concated['price_ln'] = np.log(data_concated['price'])
    data_concated['AREA_ln'] = np.log(data_concated['AREA'])
    formula_2 = 'price_ln ~ {} + {} + school + AREA_ln'.format(dist_terms, other_terms)
    lm_2 = ols(formula_2, data=data_concated).fit()
    print('对price取对数对连续型自变量AREA取对数对分类型自变量进行哑变量处理:')
    print(lm_2.summary())
    report_r2(formula_2, lm_2)
    data_concated['predict_2'] = lm_2.predict(data_concated)
    data_concated['resid_2'] = lm_2.resid
    # The diagnostic scatter plot shows the heteroscedasticity is gone.
    plot_residuals('predict_2', 'resid_2')

    # ---- District x school interaction -------------------------------
    # In Shijingshan the mean price of school-district homes is lower than
    # that of the others, hence the interaction between dist and school.
    sns.barplot(x='dist', y='price', hue='school', data=data)
    plt.show()

    # Mean price without / with school district: Shijingshan first, then
    # the other five districts.
    for dist in ['石景山区', '丰台区', '海淀区', '西城区', '东城区', '朝阳区']:
        print('{}非学区房 : {:.2f}万元/每平方米, {}学区房 : {:.2f}万元/每平方米'.format(
            dist, mean_price(dist, 0), dist, mean_price(dist, 1)))

    # Number of homes without / with school district in Shijingshan.
    in_shijingshan = data['dist'] == '石景山区'
    sch0_account = data[in_shijingshan & (data['school'] == 0)].shape[0]
    sch1_account = data[in_shijingshan & (data['school'] == 1)].shape[0]
    # Share of school-district homes among all homes in Shijingshan.
    sch_ratio = sch1_account / (sch0_account + sch1_account)
    print('石景山区非学区房数量 : {}, 石景山区学区房数量 : {}, 学区房占比: {:.4f}%'.format(
        sch0_account, sch1_account, sch_ratio * 100))

    school_means = pd.DataFrame({
        'dist': DIST_ORDER,
        'no_school': [mean_price(dist, 0) for dist in DIST_ORDER],
        'school': [mean_price(dist, 1) for dist in DIST_ORDER],
    })
    print(school_means)

    # Side-by-side bars of the two means per district.
    plt.figure(figsize=(10, 6))
    x_no_school = range(len(school_means))
    x_school = [x + 0.3 for x in x_no_school]
    plt.bar(x_no_school, school_means['no_school'].values,
            color='r', width=0.3, alpha=0.6, label='非学区房')
    plt.bar(x_school, school_means['school'].values,
            color='b', width=0.3, alpha=0.6, label='学区房')
    plt.xticks(range(len(DIST_ORDER)), DIST_ORDER)
    plt.legend()
    plt.show()

    for dist in DIST_ORDER:
        data[data['dist'] == dist][['school', 'price']].boxplot(
            by='school', patch_artist=True)
        plt.xlabel(dist + '学区房')
        plt.show()

    # ---- 3.3 Log-log model with the interaction term -----------------
    formula_3 = 'price_ln ~ ({})*school + {} + AREA_ln'.format(dist_terms, other_terms)
    lm_3 = ols(formula_3, data=data_concated).fit()
    print('考虑交互项')
    report_r2(formula_3, lm_3)
    print('========================================================================')

    # ---- 4. Prediction -----------------------------------------------
    # Start from one existing row to get the right columns, then clear all
    # dummies so that no district/floor flag of that row leaks into the
    # scenario before the wanted ones are switched on.
    df_predict = data_concated.head(1).copy()
    print(df_predict)
    for col in data_dummy.columns:
        df_predict[col] = 0
    df_predict['dist_东城区'] = 1
    df_predict['floor_middle'] = 1
    df_predict['school'] = 1
    df_predict['subway'] = 1
    df_predict['AREA_ln'] = np.log(70)
    df_predict['style_new'] = '有厅'
    df_predict['roomnum'] = 2
    # The model predicts log(price); take the single prediction explicitly.
    price = math.exp(lm_3.predict(df_predict).iloc[0])
    print('单位面积房价: {:.4f}万元/平方米'.format(price))
    print('总价: {}万元'.format(price * 70))


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。

数据分析: 线性回归分析之研究二手房价的影响因素 建立房价预测模型

数据分析: 线性回归分析之研究二手房价的影响因素，建立房价预测模型