import pandas as pd
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import matplotlib
import lightgbm as lgbm
import sklearn as sk
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
# Report the versions of the key libraries used below.
for lib_name, module in (("LightGBM", lgbm), ("Scikit-learn", sk), ("pandas", pd)):
    print(lib_name, ": ", module.__version__)
# Output: LightGBM : 3.3.3 Scikit-learn : 1.0.2 pandas : 1.4.4
# Load the Wisconsin breast-cancer dataset and inspect the first rows
# of its 30 numeric features as a DataFrame.
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_df.head()
# Output (cancer_df.head()):
# mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | |
# ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
# 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
# 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
# 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
# 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
# 5 rows × 30 columns
# Confirm the dataset dimensions: (samples, feature columns).
rows_cols = cancer_df.shape
print(rows_cols)
# Output: (569, 30)
# Select the feature matrix X (an independent copy of all 30 columns)
# and the label vector y.
X = cancer_df.copy()
y = cancer.target
X.shape, y.shape
# Output: ((569, 30), (569,))
# Hold out 20% of the samples for testing; fix the seed so the split
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# Output: ((455, 30), (114, 30), (455,), (114,))
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Model: gradient-boosted trees, up to 400 boosting rounds.
model_lgbm = LGBMClassifier(n_estimators=400)

# LightGBM can stop boosting early when the eval-set metric stops improving.
evals = [(X_test, y_test)]

# Train the model. The `early_stopping_rounds=` and `verbose=` fit() keyword
# arguments are deprecated (they emit UserWarnings and will be removed);
# LightGBM's own warning says to pass the equivalent callbacks instead.
model_lgbm.fit(
    X_train, y_train,
    eval_metric='logloss',
    eval_set=evals,
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=1),             # log every round (replaces verbose=True)
    ],
)

# Predictions: hard class labels and positive-class probabilities.
preds = model_lgbm.predict(X_test)
pred_proba = model_lgbm.predict_proba(X_test)[:, 1]
# Output (deprecation warnings from the old fit() keyword arguments):
# C:\Users\withJesus\anaconda3\lib\site-packages\lightgbm\sklearn.py:726: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead. _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. " C:\Users\withJesus\anaconda3\lib\site-packages\lightgbm\sklearn.py:736: UserWarning: 'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead. _log_warning("'verbose' argument is deprecated and will be removed in a future release of LightGBM. "
# Output (per-round validation logloss; best iteration 52, training stopped at round 152):
# [1] valid_0's binary_logloss: 0.602575 [2] valid_0's binary_logloss: 0.536123 [3] valid_0's binary_logloss: 0.482224 [4] valid_0's binary_logloss: 0.439121 [5] valid_0's binary_logloss: 0.399484 [6] valid_0's binary_logloss: 0.360741 [7] valid_0's binary_logloss: 0.329635 [8] valid_0's binary_logloss: 0.300259 [9] valid_0's binary_logloss: 0.279599 [10] valid_0's binary_logloss: 0.259173 [11] valid_0's binary_logloss: 0.239127 [12] valid_0's binary_logloss: 0.222309 [13] valid_0's binary_logloss: 0.209351 [14] valid_0's binary_logloss: 0.194316 [15] valid_0's binary_logloss: 0.185282 [16] valid_0's binary_logloss: 0.173675 [17] valid_0's binary_logloss: 0.165308 [18] valid_0's binary_logloss: 0.155769 [19] valid_0's binary_logloss: 0.14865 [20] valid_0's binary_logloss: 0.140995 [21] valid_0's binary_logloss: 0.134334 [22] valid_0's binary_logloss: 0.126574 [23] valid_0's binary_logloss: 0.119264 [24] valid_0's binary_logloss: 0.114799 [25] valid_0's binary_logloss: 0.110568 [26] valid_0's binary_logloss: 0.106157 [27] valid_0's binary_logloss: 0.100816 [28] valid_0's binary_logloss: 0.0978631 [29] valid_0's binary_logloss: 0.0945873 [30] valid_0's binary_logloss: 0.0916942 [31] valid_0's binary_logloss: 0.0863612 [32] valid_0's binary_logloss: 0.0822965 [33] valid_0's binary_logloss: 0.078824 [34] valid_0's binary_logloss: 0.0752847 [35] valid_0's binary_logloss: 0.0718233 [36] valid_0's binary_logloss: 0.0696371 [37] valid_0's binary_logloss: 0.0676798 [38] valid_0's binary_logloss: 0.0665177 [39] valid_0's binary_logloss: 0.0660656 [40] valid_0's binary_logloss: 0.064516 [41] valid_0's binary_logloss: 0.0634248 [42] valid_0's binary_logloss: 0.0616478 [43] valid_0's binary_logloss: 0.0603263 [44] valid_0's binary_logloss: 0.0598075 [45] valid_0's binary_logloss: 0.058935 [46] valid_0's binary_logloss: 0.0594708 [47] valid_0's binary_logloss: 0.0575433 [48] valid_0's binary_logloss: 0.05764 [49] valid_0's binary_logloss: 0.058022 [50] valid_0's binary_logloss:
# 0.056115 [51] valid_0's binary_logloss: 0.0551967 [52] valid_0's binary_logloss: 0.0546137 [53] valid_0's binary_logloss: 0.055392 [54] valid_0's binary_logloss: 0.0550331 [55] valid_0's binary_logloss: 0.0550406 [56] valid_0's binary_logloss: 0.0551554 [57] valid_0's binary_logloss: 0.0550553 [58] valid_0's binary_logloss: 0.0549359 [59] valid_0's binary_logloss: 0.056094 [60] valid_0's binary_logloss: 0.0571328 [61] valid_0's binary_logloss: 0.0580245 [62] valid_0's binary_logloss: 0.0574484 [63] valid_0's binary_logloss: 0.0577891 [64] valid_0's binary_logloss: 0.0592917 [65] valid_0's binary_logloss: 0.0597466 [66] valid_0's binary_logloss: 0.0597562 [67] valid_0's binary_logloss: 0.0608003 [68] valid_0's binary_logloss: 0.0599771 [69] valid_0's binary_logloss: 0.060662 [70] valid_0's binary_logloss: 0.0610663 [71] valid_0's binary_logloss: 0.0617392 [72] valid_0's binary_logloss: 0.0611657 [73] valid_0's binary_logloss: 0.0607031 [74] valid_0's binary_logloss: 0.0607682 [75] valid_0's binary_logloss: 0.0618635 [76] valid_0's binary_logloss: 0.0632427 [77] valid_0's binary_logloss: 0.0641161 [78] valid_0's binary_logloss: 0.0648781 [79] valid_0's binary_logloss: 0.064908 [80] valid_0's binary_logloss: 0.0650122 [81] valid_0's binary_logloss: 0.0651241 [82] valid_0's binary_logloss: 0.0651113 [83] valid_0's binary_logloss: 0.0650562 [84] valid_0's binary_logloss: 0.0642698 [85] valid_0's binary_logloss: 0.0629834 [86] valid_0's binary_logloss: 0.0632386 [87] valid_0's binary_logloss: 0.0623831 [88] valid_0's binary_logloss: 0.0611975 [89] valid_0's binary_logloss: 0.060535 [90] valid_0's binary_logloss: 0.0604596 [91] valid_0's binary_logloss: 0.0594299 [92] valid_0's binary_logloss: 0.0594986 [93] valid_0's binary_logloss: 0.0599838 [94] valid_0's binary_logloss: 0.05974 [95] valid_0's binary_logloss: 0.0606709 [96] valid_0's binary_logloss: 0.0607891 [97] valid_0's binary_logloss: 0.062165 [98] valid_0's binary_logloss: 0.0608996 [99] valid_0's binary_logloss:
# 0.062041 [100] valid_0's binary_logloss: 0.0608407 [101] valid_0's binary_logloss: 0.0623696 [102] valid_0's binary_logloss: 0.0629992 [103] valid_0's binary_logloss: 0.0645364 [104] valid_0's binary_logloss: 0.0657452 [105] valid_0's binary_logloss: 0.065999 [106] valid_0's binary_logloss: 0.0665913 [107] valid_0's binary_logloss: 0.0645921 [108] valid_0's binary_logloss: 0.0643004 [109] valid_0's binary_logloss: 0.0635534 [110] valid_0's binary_logloss: 0.0639176 [111] valid_0's binary_logloss: 0.0648292 [112] valid_0's binary_logloss: 0.0648756 [113] valid_0's binary_logloss: 0.064164 [114] valid_0's binary_logloss: 0.061978 [115] valid_0's binary_logloss: 0.0630472 [116] valid_0's binary_logloss: 0.0615236 [117] valid_0's binary_logloss: 0.0621549 [118] valid_0's binary_logloss: 0.0618786 [119] valid_0's binary_logloss: 0.060674 [120] valid_0's binary_logloss: 0.0612046 [121] valid_0's binary_logloss: 0.0625152 [122] valid_0's binary_logloss: 0.0626253 [123] valid_0's binary_logloss: 0.0622938 [124] valid_0's binary_logloss: 0.0616508 [125] valid_0's binary_logloss: 0.0628013 [126] valid_0's binary_logloss: 0.0636045 [127] valid_0's binary_logloss: 0.0619977 [128] valid_0's binary_logloss: 0.0633133 [129] valid_0's binary_logloss: 0.0610869 [130] valid_0's binary_logloss: 0.0618683 [131] valid_0's binary_logloss: 0.0601291 [132] valid_0's binary_logloss: 0.0589631 [133] valid_0's binary_logloss: 0.0602436 [134] valid_0's binary_logloss: 0.0598243 [135] valid_0's binary_logloss: 0.0607904 [136] valid_0's binary_logloss: 0.0620596 [137] valid_0's binary_logloss: 0.0619066 [138] valid_0's binary_logloss: 0.0624083 [139] valid_0's binary_logloss: 0.0640835 [140] valid_0's binary_logloss: 0.0648152 [141] valid_0's binary_logloss: 0.0651628 [142] valid_0's binary_logloss: 0.0648746 [143] valid_0's binary_logloss: 0.0631569 [144] valid_0's binary_logloss: 0.0645178 [145] valid_0's binary_logloss: 0.0649077 [146] valid_0's binary_logloss: 0.0653192 [147] valid_0's
# binary_logloss: 0.0644097 [148] valid_0's binary_logloss: 0.0630949 [149] valid_0's binary_logloss: 0.0634037 [150] valid_0's binary_logloss: 0.0638329 [151] valid_0's binary_logloss: 0.0636558 [152] valid_0's binary_logloss: 0.0632649
from sklearn import metrics

def get_clf_eval(y_test, y_pred=None, pred_proba=None):
    """Print a standard set of binary-classification evaluation metrics.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like, optional
        Predicted hard class labels.
    pred_proba : array-like, optional
        Positive-class probabilities used for ROC-AUC. If None, the AUC
        line is skipped instead of raising from roc_auc_score.
    """
    confusion = metrics.confusion_matrix(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    F1_score = metrics.f1_score(y_test, y_pred)

    # Print the metrics (user-facing labels kept in the original Korean:
    # confusion matrix, accuracy, precision, recall).
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1_score))
    if pred_proba is not None:
        AUC = metrics.roc_auc_score(y_test, pred_proba)
        print('AUC: {:.4f}'.format(AUC))

get_clf_eval(y_test, preds, pred_proba)
# Output: 오차행렬: [[44 3] [ 1 66]] 정확도: 0.9649 정밀도: 0.9565 재현율: 0.9851 F1: 0.9706 AUC: 0.9990
from lightgbm import plot_importance
import matplotlib.pyplot as plt

# Draw LightGBM's built-in feature-importance chart on a tall canvas
# so all 30 feature labels stay readable.
fig = plt.figure(figsize=(10, 12))
ax = fig.add_subplot(111)
plot_importance(model_lgbm, ax=ax)
# Output: <AxesSubplot:title={'center':'Feature importance'}, xlabel='Feature importance', ylabel='Features'>