from sklearn.model_selection import train_test_split
import mglearn
import sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


# Print the installed mglearn and scikit-learn versions so the recorded
# outputs in this notebook can be tied to a specific environment.
print(mglearn.__version__)
print(sklearn.__version__)

0.1.9
0.23.2


# Load the iris data set and pull out the feature matrix and label vector.
iris = load_iris()
X, y = iris.data, iris.target

# Split into train and test sets (train_test_split defaults, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

# Show the shapes of the four resulting arrays.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(112, 4) (112,) (38, 4) (38,)


# Baseline: a shallow decision tree with hand-picked hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
model = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=3,
).fit(X_train, y_train)

# Accuracy on the training set, then on the test set.
print(model.score(X_train, y_train), model.score(X_test, y_test))

0.9642857142857143 0.8947368421052632


# Naive grid search: every combination is trained on the training set and
# scored directly on the TEST set. This is the deliberately simple version;
# because the test set drives the selection, the reported score is optimistic.
best_score = 0

for depth in [2, 3, 4, 5, 6]:
    for min_samples in [5, 10, 30, 50, 100]:
        # Train a decision tree for this parameter combination.
        tree = DecisionTreeClassifier(
            max_depth=depth,
            min_samples_split=min_samples,
        )
        tree.fit(X_train, y_train)

        # Evaluate the tree on the test set.
        score = tree.score(X_test, y_test)

        # Nothing to record unless this combination is strictly better.
        if not score > best_score:
            continue

        best_score = score
        best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}

        # Trace each improvement as it is found.
        print(best_score)
        print(best_parameters)

print("최고 점수 : {:.2f}".format(best_score))
print("최적 매개변수 :", best_parameters)

0.8947368421052632
{'max_depth': 2, 'min_samples_split': 5}
0.9736842105263158
{'max_depth': 3, 'min_samples_split': 5}
최고 점수 : 0.97
최적 매개변수 : {'max_depth': 3, 'min_samples_split': 5}


# Draw mglearn's "threefold split" figure (presumably the train / validation /
# test partition illustrated in the following cells -- confirm in mglearn docs).
mglearn.plots.plot_threefold_split()


from sklearn.tree import DecisionTreeClassifier

# First split: hold out the test set; the remainder (train + validation)
# is kept together as X_trainval / y_trainval.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# Second split: divide the train+validation portion into the actual
# training set and a validation set used for parameter selection.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=0
)

# Report the number of samples in each of the three partitions.
print("훈련 세트: {}, 검증 세트: {}, 테스트 세트: {}".format(
    len(X_train), len(X_valid), len(X_test)))

훈련 세트: 84, 검증 세트: 28, 테스트 세트: 38


# Grid search using a held-out validation set.
#
# Parameters are selected on (X_valid, y_valid) only; the test set is touched
# exactly once, at the very end, so the final score is an unbiased estimate.
# (Scoring candidates on the test set here would leak test information into
# the selection and make "best validation score" a misnomer.)
best_score = 0

for depth in [2, 3, 4, 5, 6]:
    for min_samples in [5, 10, 30, 50, 100]:
        # Train a decision tree for each parameter combination.
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_samples)
        tree.fit(X_train, y_train)

        # Evaluate the candidate on the validation set (NOT the test set).
        score = tree.score(X_valid, y_valid)

        # Keep the combination if it is strictly better than the best so far.
        if score > best_score:
            best_score = score
            best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}

            # Trace each improvement as it is found.
            print(best_score)
            print(best_parameters)

# Rebuild the model on train + validation data with the selected parameters,
# then evaluate it once on the untouched test set.
tree = DecisionTreeClassifier(**best_parameters)
tree.fit(X_trainval, y_trainval)
test_score = tree.score(X_test, y_test)

print("검증 세트에서 최고 점수 : {:.2f}".format(best_score))
print("최적 매개변수 : ", best_parameters)
print("최적 매개변수에서 테스트 세트 점수 : {:.2f}".format(test_score))

0.8947368421052632
{'max_depth': 2, 'min_samples_split': 5}
0.9736842105263158
{'max_depth': 3, 'min_samples_split': 5}
검증 세트에서 최고 점수 : 0.97
최적 매개변수 :  {'max_depth': 3, 'min_samples_split': 5}
최적 매개변수에서 테스트 세트 점수 : 0.97


import numpy as np


# Grid search scored by 5-fold cross-validation on the train+validation data.
#
# best_score must be reset here: otherwise the value left over from an earlier
# search is used as the starting threshold, no cross-validation mean may ever
# exceed it, and best_parameters silently keeps its stale value.
best_score = 0

for depth in [2, 3, 4, 5, 6]:
    for min_samples in [5, 10, 30, 50, 100]:
        # Build (but do not fit) a tree for this parameter combination;
        # cross_val_score fits its own clones on each fold.
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_samples)

        # Apply 5-fold cross-validation.
        scores = cross_val_score(tree, X_trainval, y_trainval, cv=5)

        # Mean cross-validation accuracy for this combination.
        score = np.mean(scores)

        # Keep the combination if it is strictly better than the best so far.
        if score > best_score:
            best_score = score
            best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}


# Refit on all train+validation data with the selected parameters,
# then evaluate once on the test set.
tree = DecisionTreeClassifier(**best_parameters)
tree.fit(X_trainval, y_trainval)
test_score = tree.score(X_test, y_test)

print("최적 매개변수 : ", best_parameters)
print("최적 매개변수에서 테스트 세트 점수 : {:.2f}".format(test_score))

최적 매개변수 :  {'max_depth': 3, 'min_samples_split': 5}
최적 매개변수에서 테스트 세트 점수 : 0.97


# Disabled: mglearn's cross-validation selection figure.
# mglearn.plots.plot_cross_val_selection()


# Draw mglearn's grid-search overview figure (presumably a diagram of the
# overall parameter-selection workflow -- confirm in mglearn docs).
mglearn.plots.plot_grid_search_overview()


# Search space for the decision tree: every max_depth value is combined
# with every min_samples_split value.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_split': [5, 10, 30, 50, 100],
}
print("매개변수 그리드 :\n", param_grid)

매개변수 그리드 :
 {'max_depth': [2, 3, 4, 5, 6], 'min_samples_split': [5, 10, 30, 50, 100]}


from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Configure (but do not yet fit) a 5-fold cross-validated grid search over
# the decision-tree grid; training-fold scores are recorded as well.
grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=5,
    return_train_score=True,
)
grid_search

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [5, 10, 30, 50, 100]},
             return_train_score=True)


# Split the data; the test portion is kept out of the search entirely.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)


# Run the grid search using the training portion only.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [5, 10, 30, 50, 100]},
             return_train_score=True)


# Report the parameter combination selected by the grid search and its
# best cross-validation score (four decimal places).
print("최적 매개변수 :", grid_search.best_params_)
print("최고 교차 검증 점수 : {:.4f}".format(grid_search.best_score_))

최적 매개변수 : {'max_depth': 5, 'min_samples_split': 5}
최고 교차 검증 점수 : 0.9731


# After the search has been fitted, score the fitted grid search on the
# held-out test set.
grid_search.score(X_test, y_test)

0.9736842105263158


# Report the selected parameters and the best cross-validation score
# (two decimal places this time).
print("최적 매개변수 :", grid_search.best_params_)
print("최고 교차 검증 점수 : {:.2f}".format(grid_search.best_score_))

# Show the best-performing estimator found during cross-validation.
print("최고 성능 모델 :\n", grid_search.best_estimator_)

최적 매개변수 : {'max_depth': 5, 'min_samples_split': 5}
최고 교차 검증 점수 : 0.97
최고 성능 모델 :
 DecisionTreeClassifier(max_depth=5, min_samples_split=5)


import pandas as pd
# Do not truncate columns when pandas renders a DataFrame.
pd.options.display.max_columns = None

# Convert the grid search's per-candidate results into a DataFrame.
results = pd.DataFrame(grid_search.cv_results_)
results.shape

(25, 22)


# Show the first 25 candidates, transposed so each candidate becomes a
# column and each result field a row.
display(results.head(25).T)


# Arrange the mean cross-validation scores as a 2-D grid for the heatmap.
#
# cv_results_ lists the candidates with max_depth varying slowest and
# min_samples_split varying fastest, so a plain reshape puts max_depth on the
# rows. The heatmap draws rows along the y axis and columns along the x axis,
# so the matrix is transposed to match the axis labels used below
# (x = max_depth, y = min_samples_split). Without the transpose the labels
# would be attached to the wrong axes.
scores = np.array(results.mean_test_score).reshape(
    len(param_grid['max_depth']),
    len(param_grid['min_samples_split']),
).T

# Heatmap of the mean cross-validation scores.
mglearn.tools.heatmap(scores,
                      xlabel='max_depth', xticklabels=param_grid['max_depth'],
                      ylabel='min_samples_split', yticklabels=param_grid['min_samples_split'],
                      cmap='viridis'
                      )

<matplotlib.collections.PolyCollection at 0x2afcf7b32b0>


import matplotlib.pyplot as plt
from sklearn.svm import SVC


# Search space for the SVC: C and gamma each span 0.001 to 100 in powers of ten.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Configure (but do not yet fit) a 5-fold cross-validated grid search over
# the SVC grid; training-fold scores are recorded as well.
grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    return_train_score=True,
)
grid_search

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)


# Split the data; the test portion is kept out of the search entirely.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# Run the grid search using the training portion only.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)


# One subplot per candidate search grid, side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(13, 5))

# Grid 1: both parameters on a narrow linear scale.
param_grid_linear = {
    'C': np.linspace(1.0, 1.1, 6),
    'gamma': np.linspace(1.0, 1.1, 6),
}

# Grid 2: C on a linear scale, gamma on a logarithmic scale.
param_grid_one_log = {
    'C': np.linspace(1, 2, 6),
    'gamma': np.logspace(-3, 2, 6),
}

# Grid 3: both logarithmic, with gamma restricted to very small values.
param_grid_range = {
    'C': np.logspace(-3, 2, 6),
    'gamma': np.logspace(-7, -2, 6),
}

for param_grid, ax in zip(
        [param_grid_linear, param_grid_one_log, param_grid_range], axes):
    # Run a 5-fold cross-validated search over this grid.
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # 6 x 6 matrix of mean scores: rows follow C, columns follow gamma.
    scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6)

    # Heatmap of the mean cross-validation scores on this subplot.
    scores_image = mglearn.tools.heatmap(
        scores,
        xlabel='gamma', xticklabels=param_grid['gamma'],
        ylabel='C', yticklabels=param_grid['C'],
        cmap='viridis', ax=ax,
    )

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24
mean_fit_time	0	0.000197697	0	0.000399971	0.000200129	0.000399876	0.000401688	0.000201321	0.000200033	0.000200033	0.000200033	0	0.000200129	0.000200033	0.000398874	0.000601768	0.000400066	0.000200033	0.000216246	0	0	0	0	0	0
std_fit_time	0	0.000395393	0	0.000489863	0.000400257	0.000489746	0.000491969	0.000402641	0.000400066	0.000400066	0.000400066	0	0.000400257	0.000400066	0.000488523	0.00049138	0.000489979	0.000400066	0.000432491	0	0	0	0	0	0
mean_score_time	0.000594616	0.000602818	0.00100026	0	0.000199986	0.000198507	0.000200176	0	0.000198746	0.000400019	0.000200033	0.000198317	0.000199938	0.000200129	0.000200033	0.000200033	0.000200033	0.000200033	0	0	0	0	0	0	0
std_score_time	0.000485556	0.000492211	5.80998e-06	0	0.000399971	0.000397015	0.000400352	0	0.000397491	0.000489921	0.000400066	0.000396633	0.000399876	0.000400257	0.000400066	0.000400066	0.000400066	0.000400066	0	0	0	0	0	0	0
param_max_depth	2	2	2	2	2	3	3	3	3	3	4	4	4	4	4	5	5	5	5	5	6	6	6	6	6
param_min_samples_split	5	10	30	50	100	5	10	30	50	100	5	10	30	50	100	5	10	30	50	100	5	10	30	50	100
params	{'max_depth': 2, 'min_samples_split': 5}	{'max_depth': 2, 'min_samples_split': 10}	{'max_depth': 2, 'min_samples_split': 30}	{'max_depth': 2, 'min_samples_split': 50}	{'max_depth': 2, 'min_samples_split': 100}	{'max_depth': 3, 'min_samples_split': 5}	{'max_depth': 3, 'min_samples_split': 10}	{'max_depth': 3, 'min_samples_split': 30}	{'max_depth': 3, 'min_samples_split': 50}	{'max_depth': 3, 'min_samples_split': 100}	{'max_depth': 4, 'min_samples_split': 5}	{'max_depth': 4, 'min_samples_split': 10}	{'max_depth': 4, 'min_samples_split': 30}	{'max_depth': 4, 'min_samples_split': 50}	{'max_depth': 4, 'min_samples_split': 100}	{'max_depth': 5, 'min_samples_split': 5}	{'max_depth': 5, 'min_samples_split': 10}	{'max_depth': 5, 'min_samples_split': 30}	{'max_depth': 5, 'min_samples_split': 50}	{'max_depth': 5, 'min_samples_split': 100}	{'max_depth': 6, 'min_samples_split': 5}	{'max_depth': 6, 'min_samples_split': 10}	{'max_depth': 6, 'min_samples_split': 30}	{'max_depth': 6, 'min_samples_split': 50}	{'max_depth': 6, 'min_samples_split': 100}
split0_test_score	0.956522	0.956522	0.956522	0.956522	0.347826	1	1	0.956522	0.956522	0.347826	1	1	0.956522	0.956522	0.347826	1	1	0.956522	0.956522	0.347826	1	1	0.956522	0.956522	0.347826
split1_test_score	0.913043	0.913043	0.913043	0.913043	0.347826	0.913043	0.913043	0.913043	0.913043	0.347826	0.913043	0.913043	0.913043	0.913043	0.347826	0.956522	0.913043	0.913043	0.913043	0.347826	0.956522	0.913043	0.913043	0.913043	0.347826
split2_test_score	1	1	1	1	0.363636	1	1	1	1	0.363636	1	1	1	1	0.363636	1	1	1	1	0.363636	1	1	1	1	0.363636
split3_test_score	0.909091	0.909091	0.909091	0.909091	0.363636	0.954545	0.954545	0.909091	0.909091	0.363636	0.954545	0.954545	0.909091	0.909091	0.363636	0.954545	0.954545	0.909091	0.909091	0.363636	0.954545	0.954545	0.909091	0.909091	0.363636
split4_test_score	0.954545	0.954545	0.954545	0.954545	0.409091	0.954545	0.954545	0.954545	0.954545	0.409091	0.954545	0.954545	0.954545	0.954545	0.409091	0.954545	0.954545	0.954545	0.954545	0.409091	0.954545	0.954545	0.954545	0.954545	0.409091
mean_test_score	0.94664	0.94664	0.94664	0.94664	0.366403	0.964427	0.964427	0.94664	0.94664	0.366403	0.964427	0.964427	0.94664	0.94664	0.366403	0.973123	0.964427	0.94664	0.94664	0.366403	0.973123	0.964427	0.94664	0.94664	0.366403
std_test_score	0.0333049	0.0333049	0.0333049	0.0333049	0.0224845	0.0327611	0.0327611	0.0333049	0.0333049	0.0224845	0.0327611	0.0327611	0.0333049	0.0333049	0.0224845	0.0219572	0.0327611	0.0333049	0.0333049	0.0224845	0.0219572	0.0327611	0.0333049	0.0333049	0.0224845
rank_test_score	9	9	9	9	21	3	3	9	9	21	3	3	9	9	21	1	3	9	9	21	1	3	9	9	21
split0_train_score	0.966292	0.966292	0.966292	0.966292	0.370787	0.977528	0.977528	0.966292	0.966292	0.370787	0.977528	0.977528	0.966292	0.966292	0.370787	0.977528	0.977528	0.966292	0.966292	0.370787	0.977528	0.977528	0.966292	0.966292	0.370787
split1_train_score	0.966292	0.966292	0.966292	0.966292	0.370787	0.966292	0.966292	0.966292	0.966292	0.370787	0.977528	0.966292	0.966292	0.966292	0.370787	0.988764	0.966292	0.966292	0.966292	0.370787	0.988764	0.966292	0.966292	0.966292	0.370787
split2_train_score	0.955556	0.955556	0.955556	0.955556	0.366667	0.977778	0.977778	0.955556	0.955556	0.366667	0.977778	0.977778	0.955556	0.955556	0.366667	0.977778	0.977778	0.955556	0.955556	0.366667	0.977778	0.977778	0.955556	0.955556	0.366667
split3_train_score	0.977778	0.977778	0.977778	0.977778	0.366667	0.988889	0.988889	0.977778	0.977778	0.366667	0.988889	0.988889	0.977778	0.977778	0.366667	0.988889	0.988889	0.977778	0.977778	0.366667	0.988889	0.988889	0.977778	0.977778	0.366667
split4_train_score	0.955556	0.955556	0.955556	0.955556	0.355556	0.977778	0.977778	0.977778	0.955556	0.355556	0.988889	0.988889	0.977778	0.955556	0.355556	0.988889	0.988889	0.977778	0.955556	0.355556	0.988889	0.988889	0.977778	0.955556	0.355556
mean_train_score	0.964295	0.964295	0.964295	0.964295	0.366092	0.977653	0.977653	0.968739	0.964295	0.366092	0.982122	0.979875	0.968739	0.964295	0.366092	0.98437	0.979875	0.968739	0.964295	0.366092	0.98437	0.979875	0.968739	0.964295	0.366092
std_train_score	0.00827669	0.00827669	0.00827669	0.00827669	0.00558129	0.00714648	0.00714648	0.00835675	0.00827669	0.00558129	0.00552561	0.00844868	0.00835675	0.00827669	0.00558129	0.00548484	0.00844868	0.00835675	0.00827669	0.00558129	0.00548484	0.00844868	0.00835675	0.00827669	0.00558129

모델 최적화 - 그리드 서치¶

학습 내용¶

목차

1-1-1 그리드 서치란 무엇인가?

먼저 하이퍼 파라미터는 무엇인가?¶

하이퍼 파라미터(Hyperparameter) 튜닝을 위한 다양한 방법¶

그리드 서치는 무엇인가?¶

그리드 서치(Grid Search)를 하는 이유¶

그리드 서치의 단점¶

1-1-2 간단한 Grid Search

라이브러리 불러오기¶

모델 선택 및 학습, 그리고 평가¶

의사결정트리의 max_depth, min_samples_split 등의 매개변수를 변경해 보면서 성능 확인하기

for문을 이용한 확인¶

그렇다면 테스트 세트에 과도하게 맞춰지는 하이퍼 파라미터 튜닝을 피하고, 모델을 좀 더 일반화시키기 위한 방법은 어떤 것이 있을까?

좀더 일반화된 모델을 만들기 위해 학습용 데이터셋을 나눠보자.¶

1-1-3 Grid Search 실습 두번째

데이터 셋을 3개(훈련, 검증, 테스트)로 분리한 후 그리드 서치

1-1-4 교차 검증을 사용한 그리드 서치

1-1-5 교차 검증을 사용한 그리드 서치

1-1-6 GridSearchCV를 사용한 최적의 매개변수 찾기

GridSearchCV를 이용한 최적의 매개변수 찾기¶

매개변수를 선택하는데 테스트 세트를 사용하지 않음!!!¶

선택한 매개변수 및 교차 검증의 정확도 확인¶

1-1-7 교차 검증 결과 분석

기타 방법¶

히트맵¶

시각화(히트맵)를 통한 이해¶

1-1-8 매개변수의 검색 범위

적절한 방법¶