A decision tree is a model widely used for both classification and regression problems. It works much like the yes/no questions asked in a game of twenty questions.
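To make the analogy concrete, the short sketch below (using scikit-learn's built-in iris toy dataset, which is not part of this tutorial's data) fits a shallow tree and prints it as a series of threshold questions:

```python
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

# Fit a shallow tree on a toy dataset (illustration only)
iris = load_iris()
toy_tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(iris.data, iris.target)

# export_text prints the tree as nested "is feature <= threshold?" questions
print(export_text(toy_tree, feature_names=list(iris.feature_names)))
```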
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
Column | Description
---|---
Pregnancies | Number of pregnancies
Glucose | Plasma glucose
BloodPressure | Blood pressure
SkinThickness | Skin thickness
Insulin | Insulin level
BMI | Body mass index
DiabetesPedigreeFunction | Diabetes pedigree function
Age | Age
Outcome | Result (1 = diabetes, 0 = no diabetes)
pima = pd.read_csv("diabetes.csv")
pima.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'], dtype='object')
pima.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
pima.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
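Before modeling, it can also help to check how the target classes are distributed; the sketch below is an addition for illustration, not part of the original notebook:

```python
# Count how many samples fall into each Outcome class
print(pima['Outcome'].value_counts())

# The same counts as proportions of all 768 rows
print(pima['Outcome'].value_counts(normalize=True))
```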
pima.head(3)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
# Split the dataset into feature and target variables
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age']
X = pima[feature_cols] # Features
y = pima.Outcome # Target variable
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
print(X_test.columns)
print(X_train.columns)
print(y_train.shape)
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')
(537,)
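Since the two classes are not equally common, an optional stratified split keeps the class ratio similar in the training and test sets; the sketch below uses hypothetical `*_s` variable names so it does not overwrite the split used in the rest of this example:

```python
# Optional: stratify on y so train and test keep roughly the same class ratio
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

# Fraction of positive (diabetes) cases in each split should now be close
print(y_train_s.mean(), y_test_s.mean())
```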
# Create and train the decision tree model
model = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train,y_train)
# Predict on the test set
y_pred = model.predict(X_test)
from sklearn import metrics
# Model accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.7575757575757576
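Accuracy alone does not show which kind of mistake the model makes; a short sketch of a confusion matrix and a per-class report (an addition, not part of the original output) follows:

```python
# Confusion matrix: rows are actual classes, columns are predicted classes
print(metrics.confusion_matrix(y_test, y_pred))

# Precision, recall, and F1 score for each class
print(metrics.classification_report(y_test, y_pred))
```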
from sklearn.tree import export_graphviz
export_graphviz(model, out_file="tree.dot",
                class_names=['No diabetes', 'Diabetes'],  # class names in ascending label order (0, 1)
                feature_names=feature_cols,
                impurity=True,   # show the Gini impurity of each node
                filled=True)     # color nodes by majority class
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))
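If Graphviz is not installed, scikit-learn's built-in plot_tree can draw the same tree with matplotlib; this alternative is only a sketch and was not part of the original code:

```python
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw the fitted tree directly with matplotlib (no Graphviz required)
plt.figure(figsize=(20, 10))
plot_tree(model, feature_names=feature_cols,
          class_names=['No diabetes', 'Diabetes'],  # ascending label order (0, 1)
          filled=True, rounded=True)
plt.show()
```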
import pydotplus
from IPython.display import Image
import graphviz

# model     : fitted decision tree model
# class_n   : list of class names (in ascending label order)
# feature_n : list of feature names
def tree_plot(model, class_n, feature_n):
    export_graphviz(model, out_file="tree.dot",
                    class_names=class_n,
                    feature_names=feature_n,
                    impurity=True,   # show the Gini impurity of each node
                    filled=True,     # color nodes by majority class
                    rounded=True,
                    special_characters=True)
    with open("tree.dot") as f:
        dot_graph = f.read()
    display(graphviz.Source(dot_graph))
tree_plot(model, ['No diabetes', 'Diabetes'], feature_cols)
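The fitted tree also records how much each feature contributed to its splits; the short sketch below (added for illustration) ranks the features by importance:

```python
# Impurity-based importance of each feature in the fitted tree
importances = pd.Series(model.feature_importances_, index=feature_cols)
print(importances.sort_values(ascending=False))
```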
model = DecisionTreeClassifier(criterion="entropy",
                               max_depth=3,
                               random_state=0)  # decision tree split on information gain (entropy)
model.fit(X_train, y_train)      # train the model
y_pred = model.predict(X_test)   # predict on the test set
# Check the accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.7705627705627706
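The maximum depth controls the trade-off between under- and over-fitting; as a rough sketch (not from the original notebook), test accuracy can be compared across a few depths:

```python
# Compare test accuracy for several maximum depths (entropy criterion, as above)
for depth in [2, 3, 4, 5, 6, 8]:
    tree = DecisionTreeClassifier(criterion="entropy",
                                  max_depth=depth,
                                  random_state=0).fit(X_train, y_train)
    acc = metrics.accuracy_score(y_test, tree.predict(X_test))
    print(f"max_depth={depth}: accuracy={acc:.4f}")
```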