import os, warnings
import numpy as np
# Suppress warning messages with 'ignore'; restore them with 'default'
# warnings.filterwarnings(action='default')
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
import sklearn
import pandas as pd
import mglearn
print(sklearn.__version__)
0.23.2
# Note: load_boston is deprecated since scikit-learn 1.0 and removed in 1.2;
# it still works in the 0.23.2 used here.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['price'] = boston.target
print(df.shape)
(506, 14)
df.head()
|   | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | price |
|---|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|-------|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
X = df.drop(['price'], axis=1)
y = df['price']
lr_model = LinearRegression()
neg_mse_scores = cross_val_score(lr_model, X, y,
                                 scoring='neg_mean_squared_error', cv=5)
rmse = np.sqrt(-1 * neg_mse_scores)
print(rmse)
print("평균 RMSE : {0:.3f}".format( np.mean(rmse) ) )
[3.52991509 5.10378498 5.75101191 8.9867887 5.77179405] 평균 RMSE : 5.829
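As a side note, scikit-learn 0.22 and later also ship a built-in 'neg_root_mean_squared_error' scorer, so the manual square-root step can be skipped; a minimal sketch with the same model and folds:

# RMSE directly via the built-in scorer (available since scikit-learn 0.22);
# the scores come back negated, so flip the sign before averaging.
neg_rmse_scores = cross_val_score(lr_model, X, y,
                                  scoring='neg_root_mean_squared_error', cv=5)
print("Mean RMSE : {0:.3f}".format(-neg_rmse_scores.mean()))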
from sklearn.model_selection import LeaveOneOut
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
loo = LeaveOneOut()
iris = load_iris()
tree = DecisionTreeClassifier()
scores = cross_val_score(tree, iris.data, iris.target, cv=loo)
print("교차 검증 분할 횟수 : ", len(scores))
print("평균 정확도 : {:.2f}".format(scores.mean()))
교차 검증 분할 횟수 : 150 평균 정확도 : 0.95
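To see what LeaveOneOut actually yields, note that each of the 150 splits holds out exactly one sample as the test set; a minimal sketch inspecting the first split:

# Peek at the first LOO split: 149 training indices and a single test index.
train_idx, test_idx = next(loo.split(iris.data))
print(len(train_idx), test_idx)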
mglearn.plots.plot_shuffle_split()
from sklearn.model_selection import ShuffleSplit
# Absolute counts: each split draws 10 training and 5 test samples from the
# 150 iris samples; the remaining 135 are unused in that split.
shuffle_split = ShuffleSplit(train_size=10, test_size=5, n_splits=10)
scores = cross_val_score(tree, iris.data, iris.target,
cv=shuffle_split)
print("교차 검증 점수 : \n{}".format(scores))
avg = scores.mean()
print("평균 : {:.2f}".format( avg ) )
교차 검증 점수 : [1. 0.6 1. 0.8 0.4 1. 1. 1. 1. 1. ] 평균 : 0.88
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=0.5, train_size=0.5, n_splits=10)
scores = cross_val_score(tree, iris.data, iris.target, cv=shuffle_split)
print("교차 검증 점수 : \n{}".format(scores))
scores.mean()
Cross-validation scores :
[0.93333333 0.96       0.93333333 0.94666667 0.97333333 0.94666667
 0.93333333 0.86666667 0.94666667 0.90666667]
0.9346666666666665
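ShuffleSplit draws each split at random, so the class proportions are not guaranteed. For classification, StratifiedShuffleSplit keeps the 50/50/50 iris class balance in every split; a minimal sketch of the same 50/50 experiment (the random_state=0 here is an arbitrary choice):

from sklearn.model_selection import StratifiedShuffleSplit

# Same split sizes, but every random split preserves the class ratios.
stratified_split = StratifiedShuffleSplit(train_size=0.5, test_size=0.5,
                                          n_splits=10, random_state=0)
scores = cross_val_score(tree, iris.data, iris.target, cv=stratified_split)
print("Cross-validation scores :\n{}".format(scores))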
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
iris = load_iris()
logreg = LogisticRegression()
# n_splits : number of folds per repetition (default: 5)
# n_repeats : number of repetitions (default: 10)
# Point : the data is reshuffled before every repetition.
rskfold = RepeatedStratifiedKFold(random_state=42, n_splits=5, n_repeats=10)
scores = cross_val_score(logreg, iris.data, iris.target, cv=rskfold)
print("횟수 : ", len(scores))
print("교차 검증 점수 : \n", scores)
print("교차 검증 평균 점수 : {:.3f}".format(scores.mean() ) )
횟수 : 50 교차 검증 점수 : [1. 0.96666667 0.93333333 1. 0.93333333 0.96666667 0.96666667 0.93333333 1. 0.96666667 0.93333333 1. 1. 0.96666667 0.96666667 0.9 1. 1. 0.93333333 0.96666667 0.93333333 0.96666667 0.96666667 1. 0.96666667 1. 0.96666667 0.96666667 0.9 1. 0.96666667 0.96666667 0.96666667 0.96666667 0.93333333 0.96666667 0.96666667 1. 1. 0.9 0.96666667 1. 0.9 0.96666667 0.96666667 0.9 0.96666667 0.96666667 1. 0.96666667] 교차 검증 평균 점수 : 0.965
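Since cross_val_score returns the scores in the order the splitter yields them (repeat by repeat here), the 50 values can be reshaped into a 10 x 5 array to inspect the mean of each repetition; a minimal sketch under that ordering assumption:

# 10 repeats x 5 folds -> one mean accuracy per repetition.
per_repeat_means = scores.reshape(10, 5).mean(axis=1)
print(per_repeat_means)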
mglearn.plots.plot_group_kfold()
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=12, random_state=0)
print(X.shape, y.shape)
# The first three samples belong to one group, the next four samples
# to another, and so on.
groups = [0,0,0, 1,1,1,1, 2,2, 3,3,3] # 4 groups
scores = cross_val_score(logreg, X, y, groups=groups, cv=GroupKFold(n_splits=3))
print("Cross-validation scores :\n", scores)
(12, 2) (12,)
Cross-validation scores :
 [0.75 0.6 0.66666667]
gkf = GroupKFold(n_splits=3)
groups = [0,0,0, 1,1,1,1, 2,2, 3,3,3]
X, y = make_blobs(n_samples=12, random_state=0)
for train, test in gkf.split(X, y, groups=groups):
print("%s %s" % (train, test))
[ 0  1  2  7  8  9 10 11] [3 4 5 6]
[0 1 2 3 4 5 6] [ 7  8  9 10 11]
[ 3  4  5  6  7  8  9 10 11] [0 1 2]
gkf = GroupKFold(n_splits=4)
groups = [0,0,0, 1,1,1,1, 2,2, 3,3,3]
X, y = make_blobs(n_samples=12, random_state=0)
for train, test in gkf.split(X, y, groups=groups):
print("%s %s" % (train, test))
[ 0  1  2  7  8  9 10 11] [3 4 5 6]
[0 1 2 3 4 5 6 7 8] [ 9 10 11]
[ 3  4  5  6  7  8  9 10 11] [0 1 2]
[ 0  1  2  3  4  5  6  9 10 11] [7 8]
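With exactly four groups, GroupKFold(n_splits=4) puts one group per fold, which amounts to leave-one-group-out; LeaveOneGroupOut makes the same partition explicit (fold order may differ):

from sklearn.model_selection import LeaveOneGroupOut

# Each split holds out exactly one of the four groups as the test set.
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("%s %s" % (train, test))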
from sklearn.model_selection import GroupKFold
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
print("%s %s" % (train, test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
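A quick sanity check: GroupKFold guarantees that no group appears on both sides of a split, which the following minimal sketch verifies for the folds above:

import numpy as np

# The train-side and test-side group sets must be disjoint in every fold.
groups_arr = np.array(groups)
for train, test in gkf.split(X, y, groups=groups):
    assert set(groups_arr[train]).isdisjoint(groups_arr[test])
print("No group straddles a train/test boundary.")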
Written for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2022 LIM Co. all rights reserved.