房价数据分析

"数据"

Posted by zwt on February 3, 2021

引包

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 导入需要的模块
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 用来绘图的,封装了matplot
# 要注意的是一旦导入了seaborn,
# matplotlib的默认作图风格就会被覆盖成seaborn的格式
import seaborn as sns       

from scipy import stats
from scipy.stats import  norm
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
data_train = pd.read_csv("../KaggleDataset/train.csv")

分析各个特征

输出房价的数量、均值、最大值、最小值、方差等特征

1
data_train['SalePrice'].describe()

绘制直方图

1
2
sns.distplot(data_train['SalePrice'])
plt.show()

CentralAir中央空调

1
2
3
4
5
# CentralAir 
var = 'CentralAir' 
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1) 
fig = sns.boxplot(x=var, y="SalePrice", data=data) 
fig.axis(ymin=0, ymax=800000);

OverallQual 总体评价

1
2
3
4
5
# OverallQual
var = 'OverallQual'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);

YearBuilt 建造年份

1
2
3
4
# 散点图
var = 'YearBuilt'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
data.plot.scatter(x=var, y="SalePrice", ylim=(0, 800000))

Neighborhood 地段

1
2
3
4
5
var = 'Neighborhood'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
f, ax = plt.subplots(figsize=(26, 12))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);

LotArea 地表面积

1
2
3
var  = 'LotArea'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

GrLivArea 不含车库的室内面积

1
2
3
var  = 'GrLivArea'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

TotalBsmtSF 地下室面积

1
2
3
var  = 'TotalBsmtSF'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))Out [15]:

MiscVal 附加资产

1
2
3
var  = 'MiscVal'
data = pd.concat([data_train['SalePrice'], data_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

GarageArea/GarageCars 车库面积/车库车辆

1
2
3
4
var  = ['GarageArea', 'GarageCars']
for index in range(2):
    data = pd.concat([data_train['SalePrice'], data_train[var[index]]], axis=1)
    data.plot.scatter(x=var[index], y='SalePrice', ylim=(0, 800000))

关系矩阵分析特征

1
2
3
corrmat = data_train.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)

加入Neighborhood这种离散型数据

1
2
3
4
5
6
7
8
from sklearn import preprocessing
f_names = ['CentralAir', 'Neighborhood']
for x in f_names:
    label = preprocessing.LabelEncoder()
    data_train[x] = label.fit_transform(data_train[x])
corrmat = data_train.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)

房价关系矩阵

1
2
3
4
5
6
7
k  = 10 # 关系矩阵中将显示10个特征
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(data_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, \
                 square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

绘制关系图

1
2
3
4
sns.set()
cols = ['SalePrice','OverallQual','GrLivArea', 'GarageCars','TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt']
sns.pairplot(data_train[cols], size = 2.5)
plt.show()

参考

  1. Kaggle项目之房价预测