from IPython.display import display, Image
import matplotlib.pyplot as plt
import mglearn


import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform


# 한글 및 마이너스 표시 설정
path = "C:/Windows/Fonts/malgun.ttf"
if platform.system() == "Windows":
    font_name = font_manager.FontProperties(fname=path).get_name()
    matplotlib.rc('font', family=font_name)
elif platform.system()=="Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")

matplotlib.rcParams['axes.unicode_minus'] = False
%matplotlib inline


# Display the four decisiontree PNG images in order.
for image_file in (
    'img/decisiontree01.png',
    'img/decisiontree02.png',
    'img/decisiontree03.png',
    'img/decisiontree04.png',
):
    display(Image(filename=image_file))


import os
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression


# Read ram_price.csv from mglearn's dataset directory into a DataFrame.
ram_price_csv = os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
ram_prices = pd.read_csv(ram_price_csv)


import numpy as np
import seaborn as sns


# Minimum of the raw prices next to the minimum of their natural logs.
price_column = ram_prices.price
np.min(price_column), np.min(np.log(price_column))

(0.0037, -5.599422459331958)


# Natural log of 0.0037, the minimum price shown in the output above.
np.log(0.0037)

-5.599422459331958


# NOTE: the Korean font cannot render the minus sign in tick exponents; changing
# the y-tick font is a possible workaround, e.g. plt.yticks(fontname="Arial").

fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(14, 8))

# Left panel: price over time on a linear y axis.
ax_linear.plot(ram_prices.date, ram_prices.price)
ax_linear.set_xlabel("year")
ax_linear.set_ylabel("price($/Mbyte)")

# Right panel: the same series with log scaling on the y axis.
ax_log.semilogy(ram_prices.date, ram_prices.price)
ax_log.set_xlabel("year")
ax_log.set_ylabel("price($/Mbyte)")

Text(0, 0.5, 'price($/Mbyte)')

Font 'default' does not have a glyph for '-' [U+2212], substituting with a dummy symbol.
Font 'default' does not have a glyph for '-' [U+2212], substituting with a dummy symbol.
Font 'default' does not have a glyph for '-' [U+2212], substituting with a dummy symbol.
Font 'default' does not have a glyph for '-' [U+2212], substituting with a dummy symbol.


# Compare the distribution of raw prices with that of log-transformed prices.
# sns.distplot is deprecated (it emits a FutureWarning); sns.histplot with a
# KDE overlay and density normalisation is the axes-level replacement.
plt.figure(figsize=(14, 8))

# Left panel: raw prices.
plt.subplot(1, 2, 1)
sns.histplot(ram_prices.price, kde=True, stat="density")

# Right panel: natural log of the prices.
plt.subplot(1, 2, 2)
log_price = np.log(ram_prices.price)
sns.histplot(log_price, kde=True, stat="density")

C:\Users\toto\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
C:\Users\toto\anaconda3\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

<AxesSubplot:xlabel='price', ylabel='Density'>


from sklearn.tree import DecisionTreeRegressor

# Rows dated before 2000 form the training set; rows from 2000 onward form
# the test set.
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]

# Only the date feature is used to predict the price.
# Convert the Series to a NumPy array before adding the feature axis:
# multi-dimensional indexing directly on a Series (obj[:, None]) is deprecated.
X_train = data_train.date.to_numpy()[:, np.newaxis]

# Log-transform the target to simplify the relationship between the data and
# the target.
y_train = np.log(data_train.price)

<ipython-input-78-6de4b1fb924c>:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  X_train = data_train.date[:, np.newaxis]


# Fit a decision tree regressor and a linear regression on the training data.
tree = DecisionTreeRegressor().fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# Predictions are made over the whole period (training and test years).
# Convert the Series to a NumPy array before adding the feature axis:
# multi-dimensional indexing directly on a Series (obj[:, None]) is deprecated.
X_all = ram_prices.date.to_numpy()[:, np.newaxis]

pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# Undo the log transform so the predictions are back on the price scale.
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

<ipython-input-79-95e0b1948a38>:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  X_all = ram_prices.date[:, np.newaxis]


# The Korean font cannot render the minus sign in exponents, so the y-tick
# font is switched to Arial.
plt.yticks(fontname = "Arial")

# Each entry: x values, y values, legend label. Drawn on a log-scaled y axis.
curves = (
    (data_train.date, data_train.price, "Train data"),
    (data_test.date, data_test.price, "Test data"),
    (ram_prices.date, price_tree, "Decision Tree Predict"),
    (ram_prices.date, price_lr, "LinearRegression Predict"),
)
for dates, prices, legend_label in curves:
    plt.semilogy(dates, prices, label=legend_label)
plt.legend()

<matplotlib.legend.Legend at 0x1ae078a8340>


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import seaborn as sns


# Load the breast cancer dataset and bind its feature matrix and target vector.
cancer = load_breast_cancer()
all_X, all_Y = cancer.data, cancer.target


def testTreeModel(TestSize=0.3):
    """Fit an unconstrained decision tree on the breast cancer data and print accuracy.

    The dataset is split with stratification on the target and a fixed
    ``random_state`` of 77; training and test accuracy are printed with
    three decimals.

    Args:
        TestSize: Forwarded to ``train_test_split`` as ``test_size``.
    """
    dataset = load_breast_cancer()
    features, labels = dataset.data, dataset.target
    split = train_test_split(
        features,
        labels,
        stratify=labels,
        test_size=TestSize,
        random_state=77,
    )
    train_X, test_X, train_y, test_y = split

    classifier = DecisionTreeClassifier(random_state=0)
    classifier.fit(train_X, train_y)

    train_acc = classifier.score(train_X, train_y)
    test_acc = classifier.score(test_X, test_y)
    print("훈련 세트 정확도 : {:.3f}".format(train_acc))
    print("테스트 세트 정확도 : {:.3f}".format(test_acc))


# Evaluate with test-set proportions of 30%, 10% and 20%, in that order.
for size in (0.3, 0.1, 0.2):
    testTreeModel(size)

훈련 세트 정확도 : 1.000
테스트 세트 정확도 : 0.918
훈련 세트 정확도 : 1.000
테스트 세트 정확도 : 0.912
훈련 세트 정확도 : 1.000
테스트 세트 정확도 : 0.912


def testTreeModel(TestSize=0.3, treedepth=3):
    """Fit a depth-limited decision tree on the breast cancer data and print accuracy.

    The dataset is loaded inside the function and split with stratification
    on the target and a fixed ``random_state`` of 77, so the function does not
    depend on module-level ``all_X`` / ``all_Y`` being defined.

    Args:
        TestSize: Forwarded to ``train_test_split`` as ``test_size``.
        treedepth: Forwarded to ``DecisionTreeClassifier`` as ``max_depth``.
    """
    cancer = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        cancer.data,
        cancer.target,
        stratify=cancer.target,
        test_size=TestSize,
        random_state=77,
    )
    tree = DecisionTreeClassifier(max_depth=treedepth, random_state=0)
    tree.fit(X_train, y_train)
    print("훈련 세트 정확도 : {:.3f}".format(tree.score(X_train, y_train)))
    print("테스트 세트 정확도 : {:.3f}".format(tree.score(X_test, y_test)))


# Split the breast cancer data and fit a depth-2 tree for the feature
# importance inspection.
# test_size is a fraction here: an int such as 30 would be read by
# train_test_split as an absolute count of 30 test samples, not 30%.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(all_X,
                                             all_Y,
                                             stratify=cancer.target,
                                             test_size = 0.3,
                                             random_state=77)
tree = DecisionTreeClassifier(max_depth=2, random_state=0)
tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2, random_state=0)


## Feature importances of the fitted tree
tree.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.01305268, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.85298388, 0.        , 0.        ,
       0.        , 0.        , 0.13396343, 0.        , 0.        ])


cancer.data.shape[1]  # number of features

30


import matplotlib.pyplot as plt
import numpy as np

import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform


def plot_feature_imp_cancer(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    One bar is drawn per column of the module-level ``cancer`` dataset, and
    the y ticks are labelled with ``cancer.feature_names``.

    Args:
        model: Object exposing a ``feature_importances_`` attribute with one
            value per column of ``cancer.data``.
    """
    feature_count = cancer.data.shape[1]
    positions = np.arange(feature_count)

    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, cancer.feature_names)

    plt.xlabel("특성 중요도")
    plt.ylabel("특성")
    # Leave one unit of margin below the first bar and above the last one.
    plt.ylim(-1, feature_count)


# Plot the feature importances of the tree fitted above.
plot_feature_imp_cancer(tree)

01. 기본- 결정트리(decision tree)¶

학습 내용¶

01. 의사결정트리- 모델 이해¶

설명¶

02. 회귀 문제에서의 의사결정트리(decision tree)¶

실습¶

데이터 로드¶

시각화¶

price 분포 확인¶

모델 선택 및 학습¶

두 모델은 확연한 차이를 보인다.¶

03. 의사결정 트리의 단점 및 장점¶

04. Overfitting(과적합)을 막는 두가지 전략¶

05. 그렇다면 어떻게 사전 가지치기를 할 수 있을까?¶

트리의 최대 깊이 제한 (max_depth)¶

리프의 최대 개수 제한 (max_leaf_nodes)¶

노드 분할을 위한 포인트의 최소 개수 지정 (min_samples_split)¶

사전 가지치기만 지원, DecisionTreeRegressor, DecisionTreeClassifier¶

test_size를 변경해 가면서 모델 생성¶

max_depth를 변경해 가면서 모델 생성¶

06. 트리(tree)의 특성 중요도(feature importance)¶

실습과제 2¶

실습과제 3¶