import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score


# Load the Wisconsin breast cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Put the feature matrix into a DataFrame labelled with the feature names,
# then preview the first rows.
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_df.head()


# Summarise the columns: dtype and non-null count for each feature.
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         569 non-null    float64
 15  compactness error        569 non-null    float64
 16  concavity error          569 non-null    float64
 17  concave points error     569 non-null    float64
 18  symmetry error           569 non-null    float64
 19  fractal dimension error  569 non-null    float64
 20  worst radius             569 non-null    float64
 21  worst texture            569 non-null    float64
 22  worst perimeter          569 non-null    float64
 23  worst area               569 non-null    float64
 24  worst smoothness         569 non-null    float64
 25  worst compactness        569 non-null    float64
 26  worst concavity          569 non-null    float64
 27  worst concave points     569 non-null    float64
 28  worst symmetry           569 non-null    float64
 29  worst fractal dimension  569 non-null    float64
dtypes: float64(30)
memory usage: 133.5 KB


# Report the (rows, columns) size of the feature table.
print(cancer_df.shape)

(569, 30)


# Select the features (every column of the frame) and the labels.
X, y = cancer_df[:], cancer.target

# Sanity check: both must have the same number of samples.
(X.shape, y.shape)

((569, 30), (569,))


from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Shapes of the four resulting pieces, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((455, 30), (114, 30), (455,), (114,))


# Baseline: a random forest with default hyperparameters (seeded for
# reproducibility), fitted on the training split. fit() returns the
# estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=77).fit(X_train, y_train)

# Predict the held-out samples and print the accuracy on the test split.
pred = model.predict(X_test)
print("예측 정확도 : {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도 : 0.9737


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# Hyperparameter grid to search: 1 x 4 x 3 x 3 = 36 combinations.
params = dict(
    n_estimators=[100],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)


# Create the RandomForestClassifier, then run the grid search over `params`
# with 2-fold cross-validation on the training split.
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(estimator=model_rf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Print the best parameter combination and its cross-validated score.
print("최적의 하이퍼 파라미터 : \n", grid_cv.best_params_)
print("최고의 정확도 : {0:.4f}".format(grid_cv.best_score_))

최적의 하이퍼 파라미터 : 
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고의 정확도 : 0.9451


# Build a random forest from the hyperparameters selected by the grid search.
# They are read from grid_cv.best_params_ instead of being retyped by hand,
# so this model stays in sync if the grid, the split, or the library changes.
model_lrf = RandomForestClassifier(random_state=0, **grid_cv.best_params_)

# Train on the training split and predict the held-out samples.
model_lrf.fit(X_train, y_train)
pred = model_lrf.predict(X_test)

# Print the accuracy of the tuned model on the test split.
print("예측 정확도 : {0:.4f}".format(accuracy_score(y_test, pred) ))

예측 정확도 : 0.9649


import seaborn as sns
import matplotlib.pyplot as plt

# Pair each importance score from the tuned forest with its feature name.
f_imp_values = model_lrf.feature_importances_
f_importances = pd.Series(f_imp_values, index=X_train.columns)
# Keep the 10 most important features, highest first.
f_top10 = f_importances.sort_values(ascending=False)[:10]


# Horizontal bar chart of the selected features. The title is derived from
# the number of bars actually drawn, so it cannot disagree with the plot
# (it previously said "Top 20" while only 10 features were shown).
plt.figure(figsize=(8, 6))
plt.title("Feature importances Top {0}".format(len(f_top10)))
sns.barplot(x=f_top10, y=f_top10.index)
plt.show()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

위스콘신 유방암 데이터의 하이퍼 파라미터 튜닝

학습 목표

학습 내용

목차

01 데이터 로드 및 전처리

데이터 설명

데이터 나누기

02 GridSearchCV를 이용한 랜덤 포레스트의 하이퍼 파라미터 튜닝

기본 RandomForest 모델

03 모델 수행 후, feature(피처)의 중요도 확인해보기

정리