from IPython.display import display, Image


# Machine-learning workflow diagram
workflow_img = Image(filename='img/machineWorkflow01.png')
display(workflow_img)


from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np


import pandas as pd

# Read the training and test CSV files (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path) for csv_path in ("house_train.csv", "house_test.csv")
)


# Show the column labels of the training frame.
train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


# Preview the first rows of the training frame.
train.head()


# Separate predictors from the target: 'price' becomes y, and every other
# column stays in X_all.
y = train['price']
X_all = train.drop(columns=['price'])

# Sanity check of the container types and their shapes.
print(type(X_all), type(y))
print(X_all.shape, y.shape)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
(15035, 20) (15035,)


# Show the predictor column labels ('price' has been dropped).
X_all.columns

Index(['id', 'date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


# Print the per-column non-null counts and dtypes of the predictor frame.
X_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15035 entries, 0 to 15034
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             15035 non-null  int64  
 1   date           15035 non-null  object 
 2   bedrooms       15035 non-null  int64  
 3   bathrooms      15035 non-null  float64
 4   sqft_living    15035 non-null  int64  
 5   sqft_lot       15035 non-null  int64  
 6   floors         15035 non-null  float64
 7   waterfront     15035 non-null  int64  
 8   view           15035 non-null  int64  
 9   condition      15035 non-null  int64  
 10  grade          15035 non-null  int64  
 11  sqft_above     15035 non-null  int64  
 12  sqft_basement  15035 non-null  int64  
 13  yr_built       15035 non-null  int64  
 14  yr_renovated   15035 non-null  int64  
 15  zipcode        15035 non-null  int64  
 16  lat            15035 non-null  float64
 17  long           15035 non-null  float64
 18  sqft_living15  15035 non-null  int64  
 19  sqft_lot15     15035 non-null  int64  
dtypes: float64(4), int64(15), object(1)
memory usage: 2.3+ MB


# Pairwise correlations between the numeric columns of the training frame.
# The string-typed 'date' column is excluded explicitly: newer pandas releases
# no longer drop non-numeric columns silently and raise when corr() meets one.
# select_dtypes() gives the same result on older releases as well.
train.select_dtypes(include='number').corr()


# Correlation of every numeric column with the target 'price'.
train.select_dtypes(include='number').corr()['price']

id               0.020899
price            1.000000
bedrooms         0.323672
bathrooms        0.525479
sqft_living      0.702899
sqft_lot         0.096793
floors           0.262588
waterfront       0.265738
view             0.400806
condition        0.039740
grade            0.667211
sqft_above       0.608577
sqft_basement    0.322218
yr_built         0.047290
yr_renovated     0.140808
zipcode         -0.051498
lat              0.301604
long             0.023547
sqft_living15    0.586419
sqft_lot15       0.086384
Name: price, dtype: float64


from sklearn.preprocessing import MinMaxScaler


# Pick three predictors and rescale them with a min-max scaler.
# NOTE(review): the scaler is fitted on every row before the train/test split
# is made in the following cell, so held-out rows influence the scaling —
# confirm this is intended.
sel = ['sqft_living', 'sqft_lot', 'bedrooms']  # another candidate: 'bathrooms'
y = train['price']
X = X_all[sel]

scaler = MinMaxScaler()
nor_X = scaler.fit_transform(X)  # normalized input features
print("정규화 : ", nor_X.shape, y.shape)

정규화 :  (15035, 3) (15035,)


# Split the normalized features and the target with a fixed seed.
split_parts = train_test_split(nor_X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Alternative that skips normalization:
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                    random_state=42)


# Random forest made of 5 trees.  The fit() result is printed directly, which
# shows the fitted estimator's repr.
model = RandomForestRegressor(n_estimators=5, random_state=2)
print(model.fit(X_train, y_train))

# Score on the training split first, then on the test split.
print("학습용 데이터 셋, 테스트용 데이터 셋")
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score)
print(test_score)

RandomForestRegressor(n_estimators=5, random_state=2)
학습용 데이터 셋, 테스트용 데이터 셋
0.8922137121180739
0.37937640288308927


# Inspect the fitted forest: its individual trees, the bound score method,
# and the base estimator the forest is configured with.
print(model.estimators_, end="\n\n")
print(model.score, end="\n\n")

# The base estimator is exposed as `estimator` in current scikit-learn and as
# `base_estimator` in older releases (which have no `estimator` attribute), so
# try the new name first and fall back to the old one.
tree_template = getattr(model, "estimator", None)
if tree_template is None:
    tree_template = model.base_estimator
print(tree_template)

[DecisionTreeRegressor(max_features='auto', random_state=1872583848), DecisionTreeRegressor(max_features='auto', random_state=794921487), DecisionTreeRegressor(max_features='auto', random_state=111352301), DecisionTreeRegressor(max_features='auto', random_state=1853453896), DecisionTreeRegressor(max_features='auto', random_state=213298710)]

<bound method RegressorMixin.score of RandomForestRegressor(n_estimators=5, random_state=2)>

DecisionTreeRegressor()


# Report whether bootstrap sampling is enabled and which node-split criterion
# the regressor uses (a mean-squared-error criterion for regression).
bootstrap_line = "부트스트랩 : {}".format(model.bootstrap)
criterion_line = "노드 분할 기준 : {}".format(model.criterion)
print(bootstrap_line)
print(criterion_line)

부트스트랩 : True
노드 분할 기준 : mse


# Feature importances of the fitted forest and the number of features it was
# fitted on.
print(model.feature_importances_)

# `n_features_` has been removed from scikit-learn in favour of
# `n_features_in_`; prefer the new attribute and fall back to the old one on
# releases that predate it.
n_seen = getattr(model, "n_features_in_", None)
if n_seen is None:
    n_seen = model.n_features_
print(n_seen)

[0.69818654 0.25989234 0.04192112]
3


# Horizontal bar chart of a model's feature importances.
#   model         : object exposing `feature_importances_`
#   n_features    : number of features (bars) to draw
#   feature_names : tick labels for the y axis, one per feature
def plot_feature_important_up(model, n_features, feature_names):
    """Draw ``model.feature_importances_`` as horizontal bars.

    Only pyplot drawing calls are issued; the function neither creates a
    figure nor calls ``plt.show()``, and it returns ``None``.

    Args:
        model: Object with a ``feature_importances_`` attribute.
        n_features: Number of bars; also sets the upper y-axis limit.
        feature_names: Labels placed at y ticks ``0 .. n_features - 1``.
    """
    importances = model.feature_importances_
    tick_positions = np.arange(n_features)

    # One bar per feature, centred on integer y positions.
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(tick_positions, feature_names)

    # Axis titles.
    plt.xlabel("feature importance")
    plt.ylabel("feature")

    # Leave one unit of padding below the first bar and end at n_features.
    plt.ylim(-1, n_features)


# Plot the importances for the selected features.
feature_names = sel              # names of the selected features
n_features = len(X.columns)      # number of selected features
plot_feature_important_up(model, n_features, feature_names)


%%time
X_train, X_test, y_train, y_test = train_test_split(nor_X, y, random_state=42)
model_5 = RandomForestRegressor(n_estimators=5, random_state=2)  # 5개의 트리 
model_5.fit(X_train, y_train)

CPU times: user 156 ms, sys: 6.49 ms, total: 163 ms
Wall time: 215 ms

RandomForestRegressor(n_estimators=5, random_state=2)


# Show the repr of the 5-tree forest.
model_5

RandomForestRegressor(n_estimators=5, random_state=2)


# Coefficient of determination of the 5-tree forest: training split first,
# test split second.
train_r2 = model_5.score(X_train, y_train)
test_r2 = model_5.score(X_test, y_test)
print(train_r2)
print(test_r2)

0.8922137121180739
0.37937640288308927


# Feature importances of the 5-tree forest fitted in this section.  model_5 is
# passed explicitly so the plot does not depend on whatever the separate
# variable `model` currently refers to.
n_features = X.shape[1]
plot_feature_important_up(model_5, n_features, feature_names)


%%time
model_100 = RandomForestRegressor(n_estimators=100, random_state=2)  # 100개의 트리 
model_100.fit(X_train, y_train)
print( model_100.score(X_train, y_train))
print( model_100.score(X_test, y_test))

0.9341193749716377
0.45105945624970734
CPU times: user 2.41 s, sys: 49.3 ms, total: 2.46 s
Wall time: 2.76 s


# Number of feature columns in the training matrix (second entry of its shape).
n_rows, n_features = X_train.shape
n_features

3


# Feature importances of the 100-tree forest fitted just above.  The original
# call passed `model`, the separate 5-tree forest, so the chart did not
# reflect model_100.
plot_feature_important_up(model_100, n_features, feature_names)


# Score every tree of the 5-tree forest on its own (score() is the
# coefficient of determination for these regressors).
# The trees in `estimators_` were already fitted by model_5.fit(); calling
# fit() on them again would retrain them in place and change the forest, so
# they are only scored here.  The loop variable is `tree` rather than `model`
# so the forest stored in `model` is not overwritten.
for tree in model_5.estimators_:
    print("훈련 세트 정확도 : {:.3f}".format(tree.score(X_train, y_train)))
    print("테스트 세트 정확도 : {:.3f}".format(tree.score(X_test, y_test)))

훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.074
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.056
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.087
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.059
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.069


# Score the trees of the 100-tree forest on their own, reporting every 10th
# tree only (trees 10, 20, ..., 100).
# The trees were already fitted by model_100.fit(); they are not re-fitted
# here because that would retrain them in place and change the forest.
# enumerate() replaces the manual counter, and the loop variable is `tree` so
# the forest stored in `model` is not overwritten.
for tree_no, tree in enumerate(model_100.estimators_, start=1):
    if tree_no % 10 != 0:
        continue
    print("훈련 세트 정확도 : {:.3f}".format(tree.score(X_train, y_train)))
    print("테스트 세트 정확도 : {:.3f}".format(tree.score(X_test, y_test)))

훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.039
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.071
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.090
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.050
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.044
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.092
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.066
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.074
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.058
훈련 세트 정확도 : 0.997
테스트 세트 정확도 : 0.061

컬럼명	의미	값(기타)
ID	집을 구분하는 번호
date	집을 구매한 날짜
price	집의 가격(Target variable)
bedrooms	침실의 수
bathrooms	화장실의 수
sqft_living	주거 공간의 평방 피트(면적)
sqft_lot	부지의 평방 피트(면적)
floors	집의 층 수
waterfront	집의 전방에 강이 흐르는지 여부 (a.k.a. 리버뷰)
view	집이 얼마나 좋아 보이는지의 정도
condition	집의 전반적인 상태
grade	King County grading 시스템 기준으로 매긴 집의 등급
sqft_above	지하실을 제외한 평방 피트(면적)
sqft_basement	지하실의 평방 피트(면적)
yr_built	지어진 연도
yr_renovated	집을 재건축한 연도
zipcode	우편번호
lat	위도
long	경도
sqft_living15	2015년 기준 주거 공간의 평방 피트(면적, 집을 재건축했다면, 변화가 있을 수 있음)
sqft_lot15	2015년 기준 부지의 평방 피트(면적, 집을 재건축했다면, 변화가 있을 수 있음)

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	yr_built	zipcode	lat	long	sqft_living15	sqft_lot15
0	0	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	1955	98178	47.5112	-122.257	1340	5650
1	1	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	1933	98028	47.7379	-122.233	2720	8062
2	2	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	1987	98074	47.6168	-122.045	1800	7503
3	3	20140627T000000	257500.0	3	2.25	1715	6819	2.0	...	7	1715	1995	98003	47.3097	-122.327	2238	6819
4	4	20150115T000000	291850.0	3	1.50	1060	9711	1.0	...	7	1060	1963	98198	47.4095	-122.315	1650	9711

	id	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
id	1.000000	0.020899	0.010520	0.104030	0.041725	-0.034077	0.182848	-0.011775	-0.024360	-0.101618	0.078622	0.073086	-0.050634	0.202477	-0.029810	-0.005761	0.002588	0.014757	0.029248	-0.032269
price	0.020899	1.000000	0.323672	0.525479	0.702899	0.096793	0.262588	0.265738	0.400806	0.039740	0.667211	0.608577	0.322218	0.047290	0.140808	-0.051498	0.301604	0.023547	0.586419	0.086384
bedrooms	0.010520	0.323672	1.000000	0.530548	0.596974	0.033475	0.189532	-0.004819	0.085703	0.034885	0.375286	0.494867	0.315183	0.158799	0.022729	-0.162081	-0.011190	0.135802	0.407394	0.027242
bathrooms	0.104030	0.525479	0.530548	1.000000	0.755853	0.089308	0.508649	0.075452	0.187488	-0.125907	0.666278	0.688255	0.282642	0.503964	0.065423	-0.207500	0.018110	0.227669	0.573541	0.088120
sqft_living	0.041725	0.702899	0.596974	0.755853	1.000000	0.176500	0.363193	0.108137	0.282821	-0.054213	0.762543	0.878736	0.434017	0.315927	0.064893	-0.200745	0.051609	0.245429	0.760271	0.184176
sqft_lot	-0.034077	0.096793	0.033475	0.089308	0.176500	1.000000	0.001535	0.025584	0.080441	-0.002099	0.119906	0.186242	0.017818	0.058686	-0.001451	-0.127709	-0.082234	0.227451	0.147562	0.728458
floors	0.182848	0.262588	0.189532	0.508649	0.363193	0.001535	1.000000	0.031159	0.034511	-0.261016	0.462598	0.529476	-0.239350	0.490436	0.009752	-0.059107	0.049004	0.126983	0.287125	-0.010287
waterfront	-0.011775	0.265738	-0.004819	0.075452	0.108137	0.025584	0.031159	1.000000	0.389669	0.011613	0.088061	0.081968	0.071576	-0.026523	0.104168	0.028632	-0.014772	-0.037922	0.091810	0.028255
view	-0.024360	0.400806	0.085703	0.187488	0.282821	0.080441	0.034511	0.389669	1.000000	0.045255	0.247924	0.172693	0.265880	-0.061670	0.107605	0.089247	0.005285	-0.073151	0.278267	0.076501
condition	-0.101618	0.039740	0.034885	-0.125907	-0.054213	-0.002099	-0.261016	0.011613	0.045255	1.000000	-0.143599	-0.152856	0.175064	-0.366590	-0.062342	0.001106	-0.015974	-0.110742	-0.091407	-0.003873
grade	0.078622	0.667211	0.375286	0.666278	0.762543	0.119906	0.462598	0.088061	0.247924	-0.143599	1.000000	0.759240	0.162657	0.440608	0.030155	-0.186541	0.112319	0.202130	0.715321	0.119734
sqft_above	0.073086	0.608577	0.494867	0.688255	0.878736	0.186242	0.529476	0.081968	0.172693	-0.152856	0.759240	1.000000	-0.048623	0.422431	0.031441	-0.258474	0.001074	0.347226	0.737795	0.194226
sqft_basement	-0.050634	0.322218	0.315183	0.282642	0.434017	0.017818	-0.239350	0.071576	0.265880	0.175064	0.162657	-0.048623	1.000000	-0.136214	0.076452	0.067782	0.105969	-0.141792	0.198380	0.018813
yr_built	0.202477	0.047290	0.158799	0.503964	0.315927	0.058686	0.490436	-0.026523	-0.061670	-0.366590	0.440608	0.422431	-0.136214	1.000000	-0.215760	-0.354280	-0.156563	0.414565	0.324221	0.076009
yr_renovated	-0.029810	0.140808	0.022729	0.065423	0.064893	-0.001451	0.009752	0.104168	0.107605	-0.062342	0.030155	0.031441	0.076452	-0.215760	1.000000	0.068307	0.034844	-0.070134	0.007995	-0.000348
zipcode	-0.005761	-0.051498	-0.162081	-0.207500	-0.200745	-0.127709	-0.059107	0.028632	0.089247	0.001106	-0.186541	-0.258474	0.067782	-0.354280	0.068307	1.000000	0.261312	-0.563455	-0.274176	-0.145710
lat	0.002588	0.301604	-0.011190	0.018110	0.051609	-0.082234	0.049004	-0.014772	0.005285	-0.015974	0.112319	0.001074	0.105969	-0.156563	0.034844	0.261312	1.000000	-0.133460	0.048924	-0.081725
long	0.014757	0.023547	0.135802	0.227669	0.245429	0.227451	0.126983	-0.037922	-0.073151	-0.110742	0.202130	0.347226	-0.141792	0.414565	-0.070134	-0.563455	-0.133460	1.000000	0.333673	0.256201
sqft_living15	0.029248	0.586419	0.407394	0.573541	0.760271	0.147562	0.287125	0.091810	0.278267	-0.091407	0.715321	0.737795	0.198380	0.324221	0.007995	-0.274176	0.048924	0.333673	1.000000	0.183599
sqft_lot15	-0.032269	0.086384	0.027242	0.088120	0.184176	0.728458	-0.010287	0.028255	0.076501	-0.003873	0.119734	0.194226	0.018813	0.076009	-0.000348	-0.145710	-0.081725	0.256201	0.183599	1.000000

ch02 앙상블 기법- RandomForest(3)¶

학습 내용¶

01. 랜덤 포레스트의 배경과 원리에 대해 알아본다.¶

02. 랜덤 포레스트의 파라미터에 대해 알아본다.¶

03. 트리에서 사용하는 변수의 중요도에 대해서 체크해보고 알아본다.¶

04. 의사결정트리와 선형회귀를 그래프를 통해 확인하고 알아본다.¶

목차¶

01 앙상블(Ensemble) 기법

앙상블(ensemble)이란?¶

의사결정트리는 과적합되는 현상이 발생한다. 어떻게 해야 할까?

가. 랜덤 포레스트(random forest)¶

기본 아이디어¶

나. 랜덤 포레스트 구축¶

02 집값 데이터를 활용한 랜덤 포레스트 분석

5개의 트리로 구성된 랜덤 포레스트 모델을 생성¶

캐글 코리아 2차 대회 데이터 셋¶

Price와의 상관계수 확인¶

3개의 특징을 선택 후, 데이터 전처리¶

5개의 트리¶

5개의 모델 각각에 대한 정확도 평가¶

100개 모델 각각에 대한 정확도 평가 - 10개 단위 출력으로 확인¶

확인 : 각각의 모델 자체는 높은 성능을 내는 것이 아니다.¶

실습¶

Conclusion¶

실습해 보기 3¶

도전 실습¶