from sklearn.model_selection import train_test_split
import mglearn
import sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# Report the installed library versions so the recorded outputs can be matched
# against the environment that produced them.
for _module in (mglearn, sklearn):
    print(_module.__version__)
0.1.9 0.23.2
# Load the iris dataset and keep the feature matrix and label vector at hand.
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and test parts (no explicit size given, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))
(112, 4) (112,) (38, 4) (38,)
# Fit one shallow decision tree, then compare its score on the training split
# with its score on the test split.
model = DecisionTreeClassifier(max_depth=2, min_samples_split=3).fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(train_accuracy, test_accuracy)
0.9642857142857143 0.8947368421052632
# Naive grid search: every (max_depth, min_samples_split) pair is fitted on the
# training split and ranked by its score on the test split.
# NOTE: the test split drives the selection here, so the reported best score
# is not an independent estimate of how the model generalises.
depth_candidates = [2, 3, 4, 5, 6]
min_samples_candidates = [5, 10, 30, 50, 100]

best_score = 0  # best test-split score seen so far
for depth in depth_candidates:
    for min_samples in min_samples_candidates:
        # Train a decision tree for this parameter combination.
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_samples)
        tree.fit(X_train, y_train)
        # Evaluate the tree on the test split.
        score = tree.score(X_test, y_test)
        # Ignore combinations that do not strictly beat the current best.
        if not score > best_score:
            continue
        # New best: remember it with its parameters and report the progress.
        best_score = score
        best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}
        print(best_score)
        print(best_parameters)

print("최고 점수 : {:.2f}".format(best_score))
print("최적 매개변수 :", best_parameters)
0.8947368421052632 {'max_depth': 2, 'min_samples_split': 5} 0.9736842105263158 {'max_depth': 3, 'min_samples_split': 5} 최고 점수 : 0.97 최적 매개변수 : {'max_depth': 3, 'min_samples_split': 5}
# Render mglearn's built-in "threefold split" figure (presumably an
# illustration of the train / validation / test partition used below).
mglearn.plots.plot_threefold_split()
from sklearn.tree import DecisionTreeClassifier
# First split: separate the test set; the remainder is train+validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)
# Second split: carve a validation set out of the train+validation portion.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=0
)
# Report the number of samples in each of the three parts.
split_sizes = (X_train.shape[0], X_valid.shape[0], X_test.shape[0])
print("훈련 세트: {}, 검증 세트: {}, 테스트 세트: {}".format(*split_sizes))
훈련 세트: 84, 검증 세트: 28, 테스트 세트: 38
# Grid search with a dedicated validation split.
# Each candidate tree is fitted on the training split and ranked by its score
# on the validation split, so the test split is only touched once, for the
# final evaluation of the selected parameters.
# NOTE(review): candidates used to be ranked with X_test / y_test; any output
# recorded below this cell predates the fix and should be regenerated.
best_score = 0  # best validation score seen so far
for depth in [2, 3, 4, 5, 6]:
    for min_samples in [5, 10, 30, 50, 100]:
        # Train a tree for this parameter combination.
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_samples)
        tree.fit(X_train, y_train)
        # Evaluate the tree on the validation split (not the test split).
        score = tree.score(X_valid, y_valid)
        # On a strictly higher score, record it together with its parameters.
        if score > best_score:
            best_score = score
            best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}
            print(best_score)
            print(best_parameters)

# Rebuild the model on training + validation data with the selected
# parameters, then evaluate it on the test split.
tree = DecisionTreeClassifier(**best_parameters)
tree.fit(X_trainval, y_trainval)
test_score = tree.score(X_test, y_test)
print("검증 세트에서 최고 점수 : {:.2f}".format(best_score))
print("최적 매개변수 : ", best_parameters)
print("최적 매개변수에서 테스트 세트 점수 : {:.2f}".format(test_score))
0.8947368421052632 {'max_depth': 2, 'min_samples_split': 5} 0.9736842105263158 {'max_depth': 3, 'min_samples_split': 5} 검증 세트에서 최고 점수 : 0.97 최적 매개변수 : {'max_depth': 3, 'min_samples_split': 5} 최적 매개변수에서 테스트 세트 점수 : 0.97
import numpy as np
# Grid search ranked by the mean of a 5-fold cross-validation on the
# train+validation data.
# The running best must start from zero for this search: without the reset, a
# best_score left over from an earlier search is compared against the
# cross-validation means and can prevent every candidate from being selected,
# silently reusing the earlier best_parameters.
# NOTE(review): any output recorded below this cell predates the reset and
# should be regenerated.
best_score = 0
for depth in [2, 3, 4, 5, 6]:
    for min_samples in [5, 10, 30, 50, 100]:
        # Build a tree for this parameter combination.
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_samples)
        # Apply cross-validation.
        scores = cross_val_score(tree, X_trainval, y_trainval, cv=5)
        # Average the per-fold scores.
        score = np.mean(scores)
        # On a strictly higher mean, record it together with its parameters.
        if score > best_score:
            best_score = score
            best_parameters = {'max_depth': depth, 'min_samples_split': min_samples}

# Rebuild the model on training + validation data with the selected
# parameters, then evaluate it on the test split.
tree = DecisionTreeClassifier(**best_parameters)
tree.fit(X_trainval, y_trainval)
test_score = tree.score(X_test, y_test)
print("최적 매개변수 : ", best_parameters)
print("최적 매개변수에서 테스트 세트 점수 : {:.2f}".format(test_score))
최적 매개변수 : {'max_depth': 3, 'min_samples_split': 5} 최적 매개변수에서 테스트 세트 점수 : 0.97
# Alternative mglearn figure, left disabled:
# mglearn.plots.plot_cross_val_selection()
# Render mglearn's built-in grid-search overview figure.
mglearn.plots.plot_grid_search_overview()
# Candidate values for each decision-tree hyperparameter to be searched.
param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    min_samples_split=[5, 10, 30, 50, 100],
)
print("매개변수 그리드 :\n", param_grid)
매개변수 그리드 : {'max_depth': [2, 3, 4, 5, 6], 'min_samples_split': [5, 10, 30, 50, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Configure a search over param_grid with 5-fold cross-validation; training
# scores are requested as well so they are part of the stored results.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_search  # notebook cell result: shows the configured search object
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), param_grid={'max_depth': [2, 3, 4, 5, 6], 'min_samples_split': [5, 10, 30, 50, 100]}, return_train_score=True)
# Re-create the train/test split from the full iris data (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)
# Run the search on the training split only.
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), param_grid={'max_depth': [2, 3, 4, 5, 6], 'min_samples_split': [5, 10, 30, 50, 100]}, return_train_score=True)
# Report the selected parameter combination and its cross-validation score.
best_params, best_cv_score = grid_search.best_params_, grid_search.best_score_
print("최적 매개변수 :", best_params)
print("최고 교차 검증 점수 : {:.4f}".format(best_cv_score))
최적 매개변수 : {'max_depth': 5, 'min_samples_split': 5} 최고 교차 검증 점수 : 0.9731
# After the search has been fitted, score it on the test split
# (the value is shown as the notebook cell result).
grid_search.score(X_test, y_test)
0.9736842105263158
# Summarise the search: selected parameters and best cross-validation score.
print("최적 매개변수 :", grid_search.best_params_)
cv_score_text = "최고 교차 검증 점수 : {:.2f}".format(grid_search.best_score_)
print(cv_score_text)
# Show the best-performing estimator exposed by the search.
best_model = grid_search.best_estimator_
print("최고 성능 모델 :\n", best_model)
최적 매개변수 : {'max_depth': 5, 'min_samples_split': 5} 최고 교차 검증 점수 : 0.97 최고 성능 모델 : DecisionTreeClassifier(max_depth=5, min_samples_split=5)
import pandas as pd
# Lift the column limit so wide DataFrames are displayed in full.
pd.options.display.max_columns = None
# Turn the cross-validation results into a DataFrame.
results = pd.DataFrame.from_dict(grid_search.cv_results_)
results.shape  # notebook cell result: (rows, columns) of the results table
(25, 22)
# Show the first 25 result rows, transposed so that each parameter combination
# becomes a column.
display(results.head(25).T)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean_fit_time | 0 | 0.000197697 | 0 | 0.000399971 | 0.000200129 | 0.000399876 | 0.000401688 | 0.000201321 | 0.000200033 | 0.000200033 | 0.000200033 | 0 | 0.000200129 | 0.000200033 | 0.000398874 | 0.000601768 | 0.000400066 | 0.000200033 | 0.000216246 | 0 | 0 | 0 | 0 | 0 | 0 |
std_fit_time | 0 | 0.000395393 | 0 | 0.000489863 | 0.000400257 | 0.000489746 | 0.000491969 | 0.000402641 | 0.000400066 | 0.000400066 | 0.000400066 | 0 | 0.000400257 | 0.000400066 | 0.000488523 | 0.00049138 | 0.000489979 | 0.000400066 | 0.000432491 | 0 | 0 | 0 | 0 | 0 | 0 |
mean_score_time | 0.000594616 | 0.000602818 | 0.00100026 | 0 | 0.000199986 | 0.000198507 | 0.000200176 | 0 | 0.000198746 | 0.000400019 | 0.000200033 | 0.000198317 | 0.000199938 | 0.000200129 | 0.000200033 | 0.000200033 | 0.000200033 | 0.000200033 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
std_score_time | 0.000485556 | 0.000492211 | 5.80998e-06 | 0 | 0.000399971 | 0.000397015 | 0.000400352 | 0 | 0.000397491 | 0.000489921 | 0.000400066 | 0.000396633 | 0.000399876 | 0.000400257 | 0.000400066 | 0.000400066 | 0.000400066 | 0.000400066 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
param_max_depth | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 4 | 4 | 4 | 4 | 4 | 5 | 5 | 5 | 5 | 5 | 6 | 6 | 6 | 6 | 6 |
param_min_samples_split | 5 | 10 | 30 | 50 | 100 | 5 | 10 | 30 | 50 | 100 | 5 | 10 | 30 | 50 | 100 | 5 | 10 | 30 | 50 | 100 | 5 | 10 | 30 | 50 | 100 |
params | {'max_depth': 2, 'min_samples_split': 5} | {'max_depth': 2, 'min_samples_split': 10} | {'max_depth': 2, 'min_samples_split': 30} | {'max_depth': 2, 'min_samples_split': 50} | {'max_depth': 2, 'min_samples_split': 100} | {'max_depth': 3, 'min_samples_split': 5} | {'max_depth': 3, 'min_samples_split': 10} | {'max_depth': 3, 'min_samples_split': 30} | {'max_depth': 3, 'min_samples_split': 50} | {'max_depth': 3, 'min_samples_split': 100} | {'max_depth': 4, 'min_samples_split': 5} | {'max_depth': 4, 'min_samples_split': 10} | {'max_depth': 4, 'min_samples_split': 30} | {'max_depth': 4, 'min_samples_split': 50} | {'max_depth': 4, 'min_samples_split': 100} | {'max_depth': 5, 'min_samples_split': 5} | {'max_depth': 5, 'min_samples_split': 10} | {'max_depth': 5, 'min_samples_split': 30} | {'max_depth': 5, 'min_samples_split': 50} | {'max_depth': 5, 'min_samples_split': 100} | {'max_depth': 6, 'min_samples_split': 5} | {'max_depth': 6, 'min_samples_split': 10} | {'max_depth': 6, 'min_samples_split': 30} | {'max_depth': 6, 'min_samples_split': 50} | {'max_depth': 6, 'min_samples_split': 100} |
split0_test_score | 0.956522 | 0.956522 | 0.956522 | 0.956522 | 0.347826 | 1 | 1 | 0.956522 | 0.956522 | 0.347826 | 1 | 1 | 0.956522 | 0.956522 | 0.347826 | 1 | 1 | 0.956522 | 0.956522 | 0.347826 | 1 | 1 | 0.956522 | 0.956522 | 0.347826 |
split1_test_score | 0.913043 | 0.913043 | 0.913043 | 0.913043 | 0.347826 | 0.913043 | 0.913043 | 0.913043 | 0.913043 | 0.347826 | 0.913043 | 0.913043 | 0.913043 | 0.913043 | 0.347826 | 0.956522 | 0.913043 | 0.913043 | 0.913043 | 0.347826 | 0.956522 | 0.913043 | 0.913043 | 0.913043 | 0.347826 |
split2_test_score | 1 | 1 | 1 | 1 | 0.363636 | 1 | 1 | 1 | 1 | 0.363636 | 1 | 1 | 1 | 1 | 0.363636 | 1 | 1 | 1 | 1 | 0.363636 | 1 | 1 | 1 | 1 | 0.363636 |
split3_test_score | 0.909091 | 0.909091 | 0.909091 | 0.909091 | 0.363636 | 0.954545 | 0.954545 | 0.909091 | 0.909091 | 0.363636 | 0.954545 | 0.954545 | 0.909091 | 0.909091 | 0.363636 | 0.954545 | 0.954545 | 0.909091 | 0.909091 | 0.363636 | 0.954545 | 0.954545 | 0.909091 | 0.909091 | 0.363636 |
split4_test_score | 0.954545 | 0.954545 | 0.954545 | 0.954545 | 0.409091 | 0.954545 | 0.954545 | 0.954545 | 0.954545 | 0.409091 | 0.954545 | 0.954545 | 0.954545 | 0.954545 | 0.409091 | 0.954545 | 0.954545 | 0.954545 | 0.954545 | 0.409091 | 0.954545 | 0.954545 | 0.954545 | 0.954545 | 0.409091 |
mean_test_score | 0.94664 | 0.94664 | 0.94664 | 0.94664 | 0.366403 | 0.964427 | 0.964427 | 0.94664 | 0.94664 | 0.366403 | 0.964427 | 0.964427 | 0.94664 | 0.94664 | 0.366403 | 0.973123 | 0.964427 | 0.94664 | 0.94664 | 0.366403 | 0.973123 | 0.964427 | 0.94664 | 0.94664 | 0.366403 |
std_test_score | 0.0333049 | 0.0333049 | 0.0333049 | 0.0333049 | 0.0224845 | 0.0327611 | 0.0327611 | 0.0333049 | 0.0333049 | 0.0224845 | 0.0327611 | 0.0327611 | 0.0333049 | 0.0333049 | 0.0224845 | 0.0219572 | 0.0327611 | 0.0333049 | 0.0333049 | 0.0224845 | 0.0219572 | 0.0327611 | 0.0333049 | 0.0333049 | 0.0224845 |
rank_test_score | 9 | 9 | 9 | 9 | 21 | 3 | 3 | 9 | 9 | 21 | 3 | 3 | 9 | 9 | 21 | 1 | 3 | 9 | 9 | 21 | 1 | 3 | 9 | 9 | 21 |
split0_train_score | 0.966292 | 0.966292 | 0.966292 | 0.966292 | 0.370787 | 0.977528 | 0.977528 | 0.966292 | 0.966292 | 0.370787 | 0.977528 | 0.977528 | 0.966292 | 0.966292 | 0.370787 | 0.977528 | 0.977528 | 0.966292 | 0.966292 | 0.370787 | 0.977528 | 0.977528 | 0.966292 | 0.966292 | 0.370787 |
split1_train_score | 0.966292 | 0.966292 | 0.966292 | 0.966292 | 0.370787 | 0.966292 | 0.966292 | 0.966292 | 0.966292 | 0.370787 | 0.977528 | 0.966292 | 0.966292 | 0.966292 | 0.370787 | 0.988764 | 0.966292 | 0.966292 | 0.966292 | 0.370787 | 0.988764 | 0.966292 | 0.966292 | 0.966292 | 0.370787 |
split2_train_score | 0.955556 | 0.955556 | 0.955556 | 0.955556 | 0.366667 | 0.977778 | 0.977778 | 0.955556 | 0.955556 | 0.366667 | 0.977778 | 0.977778 | 0.955556 | 0.955556 | 0.366667 | 0.977778 | 0.977778 | 0.955556 | 0.955556 | 0.366667 | 0.977778 | 0.977778 | 0.955556 | 0.955556 | 0.366667 |
split3_train_score | 0.977778 | 0.977778 | 0.977778 | 0.977778 | 0.366667 | 0.988889 | 0.988889 | 0.977778 | 0.977778 | 0.366667 | 0.988889 | 0.988889 | 0.977778 | 0.977778 | 0.366667 | 0.988889 | 0.988889 | 0.977778 | 0.977778 | 0.366667 | 0.988889 | 0.988889 | 0.977778 | 0.977778 | 0.366667 |
split4_train_score | 0.955556 | 0.955556 | 0.955556 | 0.955556 | 0.355556 | 0.977778 | 0.977778 | 0.977778 | 0.955556 | 0.355556 | 0.988889 | 0.988889 | 0.977778 | 0.955556 | 0.355556 | 0.988889 | 0.988889 | 0.977778 | 0.955556 | 0.355556 | 0.988889 | 0.988889 | 0.977778 | 0.955556 | 0.355556 |
mean_train_score | 0.964295 | 0.964295 | 0.964295 | 0.964295 | 0.366092 | 0.977653 | 0.977653 | 0.968739 | 0.964295 | 0.366092 | 0.982122 | 0.979875 | 0.968739 | 0.964295 | 0.366092 | 0.98437 | 0.979875 | 0.968739 | 0.964295 | 0.366092 | 0.98437 | 0.979875 | 0.968739 | 0.964295 | 0.366092 |
std_train_score | 0.00827669 | 0.00827669 | 0.00827669 | 0.00827669 | 0.00558129 | 0.00714648 | 0.00714648 | 0.00835675 | 0.00827669 | 0.00558129 | 0.00552561 | 0.00844868 | 0.00835675 | 0.00827669 | 0.00558129 | 0.00548484 | 0.00844868 | 0.00835675 | 0.00827669 | 0.00558129 | 0.00548484 | 0.00844868 | 0.00835675 | 0.00827669 | 0.00558129 |
# The results list the combinations with max_depth varying slowest and
# min_samples_split varying fastest (see the param_* rows of the results
# table), so after reshaping each ROW is one max_depth and each COLUMN one
# min_samples_split.
depth_values = param_grid['max_depth']
min_split_values = param_grid['min_samples_split']
scores = np.array(results.mean_test_score).reshape(len(depth_values),
                                                   len(min_split_values))
# Heatmap of the mean cross-validation scores. Columns run along the x axis and
# rows along the y axis, so x is min_samples_split and y is max_depth.
mglearn.tools.heatmap(scores,
                      xlabel='min_samples_split', xticklabels=min_split_values,
                      ylabel='max_depth', yticklabels=depth_values,
                      cmap='viridis'
                      )
<matplotlib.collections.PolyCollection at 0x2afcf7b32b0>
import matplotlib.pyplot as plt
from sklearn.svm import SVC
# Parameter grid for the SVC search: the same six candidate values are tried
# for both C and gamma.
svc_candidates = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'C': list(svc_candidates), 'gamma': list(svc_candidates)}
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_search  # notebook cell result: shows the configured search object
GridSearchCV(cv=5, estimator=SVC(), param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, return_train_score=True)
# Re-create the train/test split from the full iris data (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)
# Run the SVC search on the training split only.
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=SVC(), param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, return_train_score=True)
# One subplot per candidate grid, side by side.
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

# Three alternative 6x6 grids for (C, gamma):
#   linear  - both parameters on a narrow linear range,
#   one_log - C linear, gamma logarithmic,
#   range   - both logarithmic, gamma over 1e-7 .. 1e-2.
param_grid_linear = {'C': np.linspace(1.0, 1.1, 6),
                     'gamma': np.linspace(1.0, 1.1, 6)}
param_grid_one_log = {'C': np.linspace(1, 2, 6),
                      'gamma': np.logspace(-3, 2, 6)}
param_grid_range = {'C': np.logspace(-3, 2, 6),
                    'gamma': np.logspace(-7, -2, 6)}

candidate_grids = [param_grid_linear, param_grid_one_log, param_grid_range]
for ax, param_grid in zip(axes, candidate_grids):
    # Search this grid with 5-fold cross-validation on the training split.
    grid_search = GridSearchCV(SVC(), param_grid, cv=5).fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6)
    # Heatmap of the mean cross-validation scores for this grid.
    scores_image = mglearn.tools.heatmap(
        scores,
        xlabel='gamma', xticklabels=param_grid['gamma'],
        ylabel='C', yticklabels=param_grid['C'],
        cmap='viridis', ax=ax,
    )
교육용으로 작성된 것으로 배포 및 복제시에 사전 허가가 필요합니다.
Copyright 2022 LIM Co. all rights reserved.