import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
# Load scikit-learn's bundled breast cancer dataset and expose its feature
# matrix as a DataFrame whose columns carry the feature names.
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
# Preview the first five rows.
cancer_df.head(n=5)
mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 30 columns
# Summarize the frame: per-column non-null counts, dtypes, and memory usage.
cancer_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mean radius 569 non-null float64 1 mean texture 569 non-null float64 2 mean perimeter 569 non-null float64 3 mean area 569 non-null float64 4 mean smoothness 569 non-null float64 5 mean compactness 569 non-null float64 6 mean concavity 569 non-null float64 7 mean concave points 569 non-null float64 8 mean symmetry 569 non-null float64 9 mean fractal dimension 569 non-null float64 10 radius error 569 non-null float64 11 texture error 569 non-null float64 12 perimeter error 569 non-null float64 13 area error 569 non-null float64 14 smoothness error 569 non-null float64 15 compactness error 569 non-null float64 16 concavity error 569 non-null float64 17 concave points error 569 non-null float64 18 symmetry error 569 non-null float64 19 fractal dimension error 569 non-null float64 20 worst radius 569 non-null float64 21 worst texture 569 non-null float64 22 worst perimeter 569 non-null float64 23 worst area 569 non-null float64 24 worst smoothness 569 non-null float64 25 worst compactness 569 non-null float64 26 worst concavity 569 non-null float64 27 worst concave points 569 non-null float64 28 worst symmetry 569 non-null float64 29 worst fractal dimension 569 non-null float64 dtypes: float64(30) memory usage: 133.5 KB
# Report the (rows, columns) dimensions of the feature table.
print(cancer_df.shape)
(569, 30)
# Designate the features (every column of the frame) and the labels.
X, y = cancer_df[:], cancer.target
# Confirm that both hold the same number of samples.
(X.shape, y.shape)
((569, 30), (569,))
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Show the shape of each of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))
((455, 30), (114, 30), (455,), (114,))
# Baseline: a random forest with default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=77).fit(X_train, y_train)

# Predict on the held-out test set and report the accuracy.
pred = model.predict(X_test)
print("예측 정확도 : {0:.4f}".format(accuracy_score(y_true=y_test, y_pred=pred)))
예측 정확도 : 0.9737
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate hyperparameter values: 1 * 4 * 3 * 3 = 36 combinations in total.
params = dict(
    n_estimators=[100],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Create the RandomForestClassifier, then run an exhaustive grid search over
# the candidates using 2-fold cross-validation on the training split.
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(
    estimator=model_rf,
    param_grid=params,
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)

# Report the best parameter combination and its mean cross-validated score.
print("최적의 하이퍼 파라미터 : \n", grid_cv.best_params_)
print("최고의 정확도 : {0:.4f}".format(grid_cv.best_score_))
최적의 하이퍼 파라미터 : {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100} 최고의 정확도 : 0.9451
UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)
해결 : 위와 같은 UnicodeEncodeError가 발생하면 n_jobs 인자를 제거하고 다시 실행해 본다.
# Train a forest with the hyperparameters reported as best by the grid search
# (max_depth=6, min_samples_leaf=8, min_samples_split=8, n_estimators=100).
model_lrf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=8,
    min_samples_split=8,
    random_state=0,
).fit(X_train, y_train)

# Evaluate the tuned model on the held-out test set.
pred = model_lrf.predict(X_test)
print("예측 정확도 : {0:.4f}".format(accuracy_score(y_true=y_test, y_pred=pred)))
예측 정확도 : 0.9649
import seaborn as sns
import matplotlib.pyplot as plt

# Number of top-ranked features to plot. The title is derived from this value
# so the two cannot drift apart (the title used to say "Top 20" while the
# chart showed only 10 features).
top_n = 10

# Pair each importance score of the tuned forest with its feature name.
f_imp_values = model_lrf.feature_importances_
f_importances = pd.Series(f_imp_values, index=X_train.columns)
# Keep the top_n most important features, highest first. head() is explicitly
# positional, unlike a bare integer slice on a label-indexed Series.
f_top10 = f_importances.sort_values(ascending=False).head(top_n)

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
plt.figure(figsize=(8, 6))
plt.title("Feature importances Top {}".format(top_n))
sns.barplot(x=f_top10, y=f_top10.index)
plt.show()