서울 연립다세대 매매
linear regression
import pandas as pd
import numpy as np
# 한글 폰트 적용
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus']=False
path='c:/Windows/Fonts/malgun.ttf'
font_name=font_manager.FontProperties(fname=path).get_name()
rc('font',family=font_name)
import os

# Directory that holds the CSV exports to merge into one DataFrame.
path = 'C:/Users/박연재/Desktop/데청캠/0715/고우주-1일차/02_Regression/data/서울_연립다세대_매매/'
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test splits further down) reproducible.
file_list = sorted(os.listdir(path))
file_list_py = [file for file in file_list if file.endswith('.csv')]
if not file_list_py:
    raise FileNotFoundError('No .csv files found in ' + path)

# Read every CSV and concatenate once at the end. Growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration.
# header=15 takes the column names from the 16th line of each file, and the
# files are decoded as CP949.
frames = [
    pd.read_csv(os.path.join(path, file_name), header=15, encoding='CP949')
    for file_name in file_list_py
]
# ignore_index=True gives a fresh 0..n-1 index, like reset_index(drop=True).
df = pd.concat(frames, ignore_index=True)
# Show how the first address splits on whitespace (notebook display only).
df['시군구'][0].split()

# District ('구'): the second whitespace-separated token of the address.
# Rows whose address has fewer than two tokens get NaN here and are removed
# by the dropna() below instead of raising IndexError.
df['구'] = df['시군구'].str.split().str[1]

# Contract year: floor-divide 계약년월 by 100 to drop its last two digits
# (presumably YYYYMM -> YYYY; confirm against the data).
df['계약년도'] = df['계약년월'] // 100

# Transaction amount: remove the thousands separators and convert to float.
# astype(str) lets the same expression handle values that were already parsed
# as numbers; a missing value becomes the text 'nan', i.e. float NaN.
df['거래금액'] = (
    df['거래금액(만원)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Keep only the modelling columns and drop incomplete rows.
df = df[['전용면적(㎡)','계약년도','거래금액','층','건축년도','구']].dropna()
from sklearn import preprocessing

# Print column dtypes and non-null counts of the cleaned frame.
df.info()

label_encoder = preprocessing.LabelEncoder()
# NOTE(review): the one-hot encoder is instantiated but never used in the
# visible code; the district column is label-encoded to integer codes instead.
onehot_encoder = preprocessing.OneHotEncoder()

# Map every district name to an integer code.
onehot_labeled = label_encoder.fit(df['구']).transform(df['구'])
print(onehot_labeled)

# Append the codes as '구2' and remove the original text column.
df = df.assign(**{'구2': onehot_labeled}).drop(columns=['구'])
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# pairplot
sns.pairplot(df)
plt.figure(figsize = (5,5))
sns.heatmap(df.corr(),annot = True)
df.describe()
df.groupby(pd.qcut(df['전용면적(㎡)'],25))['거래금액'].mean()
a = pd.cut(df['전용면적(㎡)'],5)
b = df['거래금액'].groupby(a)
b.agg(['count','mean','std','min','max'])
# Response (transaction amount) and the five predictor columns.
y = df['거래금액']
X = df[['전용면적(㎡)','계약년도','층','건축년도','구2']]

# Scatter plot of each feature against the response.
# `height` replaces the old `size` keyword, which seaborn deprecated and
# later removed; passing `size` fails on current seaborn releases.
sns.pairplot(df, x_vars=['전용면적(㎡)','계약년도','층','건축년도','구2'], y_vars='거래금액',
             height=10, aspect=0.7, kind='scatter')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# 70/30 train/test split; a fixed random_state reproduces the same split.
# An earlier split (test_size=0.3, random_state=0) was overwritten by this one
# before being used, so only this split is kept.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=100)

# Fit ordinary least squares on the training rows and predict the held-out rows.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluate on the test set.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print('Mean_Asolute_Error:', mae)
print('Mean_Squared_Error :', mse)
print('r_square_value :', r_squared)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# df1 is another name for the same object as df (not a copy).
df1 = df
# Restrict to floor area <= 100 and transaction amount < 30000.
df2 = df.loc[df['전용면적(㎡)']<=100]
# Build the mask from df2 itself rather than from df, so the filter does not
# rely on index alignment between two different frames. copy() makes df3 an
# independent frame that can be modified later without chained-assignment
# warnings.
df3 = df2.loc[df2['거래금액']<30000].copy()

# Correlation heatmap of the filtered data.
plt.figure(figsize = (5,5))
sns.heatmap(df3.corr(),annot = True)

y = df3['거래금액']
X = df3[['전용면적(㎡)','계약년도','층','건축년도','구2']]

# 70/30 train/test split; a fixed random_state reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=100)

# Fit ordinary least squares on the training rows and predict the held-out rows.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluate on the test set.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print('Mean_Asolute_Error:', mae)
print('Mean_Squared_Error :', mse)
print('r_square_value :', r_squared)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Scale floor area and transaction amount by their respective maxima.
# assign() builds a new frame instead of writing into df3 in place; df3 was
# produced by filtering, and in-place column assignment on it triggers
# pandas' SettingWithCopyWarning.
df3 = df3.assign(**{
    '전용면적(㎡)': df3['전용면적(㎡)'] / df3['전용면적(㎡)'].max(),
    '거래금액': df3['거래금액'] / df3['거래금액'].max(),
})

y = df3['거래금액']
X = df3[['전용면적(㎡)','계약년도','층','건축년도','구2']]

# 70/30 train/test split; a fixed random_state reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=100)

# Fit ordinary least squares on the training rows and predict the held-out rows.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluate on the test set. MAE and MSE are in units of the scaled target, so
# they are not directly comparable with the unscaled runs.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print('Mean_Asolute_Error:', mae)
print('Mean_Squared_Error :', mse)
print('r_square_value :', r_squared)
'데이터 청년 캠퍼스(경남대학교) > 스터디' 카테고리의 다른 글
2021-07-19 (0) | 2021.07.20 |
---|---|
2021-07-14 (0) | 2021.07.14 |
2021-07-13 (0) | 2021.07.14 |
2021-07-12 (0) | 2021.07.12 |
2021 - 07 - 07 (0) | 2021.07.07 |