from IPython.display import display, Image
import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform
# Configure matplotlib with a font that can render Korean labels,
# chosen per platform (Malgun Gothic on Windows, AppleGothic on macOS).
# NOTE: indentation of this if/elif/else was lost in the notebook export;
# restored here so the script is valid Python.
path = "C:/Windows/Fonts/malgun.ttf"
if platform.system() == "Windows":
    # Resolve the font's registered family name from the .ttf file.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == "Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")
display(Image(filename='img/overfitting01.png'))
* learning_rate : 이전 트리의 오차를 얼마나 강하게 보정할 것인가를 제어하는 매개변수이다.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# model         : fitted estimator exposing ``feature_importances_``
# n_features    : number of features
# feature_names : names of the features
def plot_feature_important_up(model, n_features, feature_names):
    """Draw a horizontal bar chart of the model's feature importances.

    Indentation of this body was lost in the notebook export; restored here.
    """
    imp = model.feature_importances_              # importance score per feature
    plt.barh(range(n_features), imp, align='center')   # horizontal bar chart
    plt.yticks(np.arange(n_features), feature_names)   # one y tick label per bar
    plt.xlabel("feature importance")              # x-axis label
    plt.ylabel("feature")                         # y-axis label
    plt.ylim(-1, n_features)                      # y range with a margin below bar 0
from sklearn.ensemble import GradientBoostingClassifier
cancer = load_breast_cancer()
# Inspect the dataset bunch
print(cancer.keys())
print(cancer.target_names) # class names of the target (dependent) variable
print(cancer.target)
print(cancer.feature_names)
print(cancer.data.shape) # 569 rows, 30 columns
print(cancer.data) # explanatory (independent) variables
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename']) ['malignant' 'benign'] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1] ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean smoothness' 'mean compactness' 'mean concavity' 'mean concave points' 'mean symmetry' 'mean fractal dimension' 'radius error' 'texture error' 'perimeter error' 'area error' 'smoothness error' 'compactness error' 'concavity error' 'concave points error' 'symmetry error' 'fractal dimension error' 'worst radius' 'worst texture' 'worst perimeter' 'worst area' 'worst smoothness' 'worst compactness' 'worst concavity' 'worst concave points' 'worst symmetry' 'worst fractal dimension'] (569, 30) [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01] [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02] [1.969e+01 2.125e+01 1.300e+02 ... 
2.430e-01 3.613e-01 8.758e-02] ... [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02] [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01] [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
# Split the data into training and test sets (default 75/25 split)
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Create and train the model with default hyperparameters
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
GradientBoostingClassifier(random_state=0)
# Check accuracy of the default model on both splits
print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X_train, y_train)))  # training-set accuracy
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X_test, y_test)))  # test-set accuracy
훈련 세트 정확도: 1.000 테스트 세트 정확도: 0.965
# Retrain with max_depth=1 (each tree is a stump) to curb overfitting
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)
print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X_train, y_train)))  # training-set accuracy
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X_test, y_test)))  # test-set accuracy
훈련 세트 정확도: 0.991 테스트 세트 정확도: 0.972
# Retrain with a low learning_rate so each tree corrects the previous
# trees' errors less aggressively
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X_train, y_train)))  # training-set accuracy
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X_test, y_test)))  # test-set accuracy
훈련 세트 정확도: 0.988 테스트 세트 정확도: 0.965
### 소스 코드
# model         : fitted estimator exposing ``feature_importances_``
# n_features    : number of features
# feature_names : names of the features
def plot_feature_important_up(model, n_features, feature_names):
    """Draw a horizontal bar chart of the model's feature importances.

    Indentation of this body was lost in the notebook export; restored here.
    """
    imp = model.feature_importances_              # importance score per feature
    plt.barh(range(n_features), imp, align='center')   # horizontal bar chart
    plt.yticks(np.arange(n_features), feature_names)   # one y tick label per bar
    plt.xlabel("feature importance")              # x-axis label
    plt.ylabel("feature")                         # y-axis label
    plt.ylim(-1, n_features)                      # y range with a margin below bar 0
# Train a stump-based boosted model and visualize its feature importances
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1,learning_rate=0.1)
gbrt.fit(X_train, y_train)
feature_num = X_train.shape[1]  # number of features (columns of X_train)
feature_names = cancer.feature_names
plot_feature_important_up(gbrt, feature_num, feature_names)
import pandas as pd
import seaborn as sns
# Train a low-learning-rate model and plot its feature importances with seaborn.
# (A stray notebook output echo that constructed and discarded a second,
# unused estimator was removed here.)
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
fea_imp_values = gbrt.feature_importances_  # importance score per feature
importances = pd.Series(fea_imp_values, index=feature_names)  # label scores by feature name
plt.figure(figsize=(8,6))
plt.title('feature importances')
sns.barplot(x=importances, y=importances.index)  # horizontal bar plot
<AxesSubplot:title={'center':'feature importances'}>
단점 :
장점 :