import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import mglearn
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
from mglearn.datasets import make_blobs
### Creating the data
X, y = make_blobs(n_samples=(400, 50),   # deliberately imbalanced: 400 vs. 50 samples
                  centers=2,
                  cluster_std=[7.0, 2],  # standard deviation of each cluster
                  random_state=42)
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
(450, 2) (450,)
X_train[0:10], y_train[0:10]
(array([[-0.18299954,  3.77488037],
        [-2.73847051,  5.21031273],
        [ 4.14376924,  4.97596054],
        [ 5.21160962,  2.64208326],
        [ 1.78996928, 14.3168401 ],
        [ 1.09604925, 12.61078778],
        [-2.16954623,  3.19763531],
        [-8.04251881, 12.31456463],
        [-0.48077362, 23.54209172],
        [-8.287678  ,  6.76458524]]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0]))
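The make_blobs call above deliberately requests an imbalanced dataset (400 samples of class 0, 50 of class 1). As a quick sketch (not part of the original notebook), np.bincount can confirm the class balance on each side of the split:
# Count samples per class; the full set should be exactly [400, 50]
print("full set :", np.bincount(y))
print("train set:", np.bincount(y_train))
print("test set :", np.bincount(y_test))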
plt.scatter(X[:, 0], X[:, 1],
            c=y,
            cmap=plt.cm.autumn, s=60, edgecolors='k')
<matplotlib.collections.PathCollection at 0x7fa72ba8daf0>
mglearn.plots.plot_decision_threshold()
svc = SVC(gamma=.05).fit(X_train, y_train)
pred = svc.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       107
           1       0.42      0.83      0.56         6

    accuracy                           0.93       113
   macro avg       0.70      0.88      0.76       113
weighted avg       0.96      0.93      0.94       113
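The report shows high recall but low precision for the rare class 1. As a sketch (not in the original notebook), printing the underlying confusion matrix shows the raw counts these scores are computed from:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes:
# [[TN, FP],
#  [FN, TP]] with class 1 as the positive class
print(confusion_matrix(y_test, pred))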
decision_values = svc.decision_function(X_test)  # signed distance to the decision boundary
print(decision_values[0:10])
np.min(decision_values), np.max(decision_values)
[-1.0167542   0.72583536 -1.17766946 -1.00425497 -1.0002495  -0.99977182
 -1.07285711 -1.2206812  -1.24018502 -1.30361098]
(-1.509707253620952, 1.6245457437087478)
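A sketch (not part of the original notebook) to visualize what moving the threshold does geometrically: drawing the decision function's level curves at 0 and at -0.8 shows the boundary shifting so that more points fall on the positive side.
# Evaluate the decision function on a grid covering the data
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
zz = svc.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.autumn, s=60, edgecolors='k')
# Solid line: default threshold 0; dashed line: lowered threshold -0.8
plt.contour(xx, yy, zz, levels=[-0.8, 0], linestyles=['--', '-'], colors='k')
plt.show()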
decision_0 = svc.decision_function(X_test) > 0      # threshold at 0 (the default)
decision_m08 = svc.decision_function(X_test) > -.8  # threshold lowered to -0.8
# Lowering the threshold increases TP: more of the actual positives are caught.
print("threshold 0    : count of 1 (positive):", decision_0.sum())
print("threshold -0.8 : count of 1 (positive):", decision_m08.sum())
threshold 0    : count of 1 (positive): 12
threshold -0.8 : count of 1 (positive): 18
print("임계값 0 일때 : 0(음성) 개수 :", len(decision_0) - decision_0.sum())
print("임계값 -0.8 일때 : 0(음성) 개수 :", len(decision_m08) - decision_m08.sum() )
threshold 0    : count of 0 (negative): 101
threshold -0.8 : count of 0 (negative): 95
y_pred_0 = svc.decision_function(X_test) > 0
y_pred_m08 = svc.decision_function(X_test) > -.8
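As a quick sanity check (a sketch, not in the original notebook): for a binary SVC, predict() returns class 1 exactly where the decision function exceeds 0, so thresholding at 0 should reproduce the default predictions.
# Compare the threshold-0 predictions to predict(); expected output: True
print(np.all(y_pred_0 == (svc.predict(X_test) == 1)))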
# threshold 0
print(classification_report(y_test, y_pred_0))
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       107
           1       0.42      0.83      0.56         6

    accuracy                           0.93       113
   macro avg       0.70      0.88      0.76       113
weighted avg       0.96      0.93      0.94       113
# threshold -0.8
print(classification_report(y_test, y_pred_m08))
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       107
           1       0.33      1.00      0.50         6

    accuracy                           0.89       113
   macro avg       0.67      0.94      0.72       113
weighted avg       0.96      0.89      0.92       113
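Rather than hand-picking thresholds one at a time, every threshold can be swept at once. A sketch of this idea (not part of the original notebook) using sklearn's precision_recall_curve:
from sklearn.metrics import precision_recall_curve

# Trace precision and recall over all thresholds of the decision function
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
plt.plot(recall, precision, label="SVC (gamma=0.05)")
plt.xlabel("recall")
plt.ylabel("precision")
plt.legend()
plt.show()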
Recall:
TP / (TP + FN): of the samples that are actually positive, how many did we correctly identify?
It is also called sensitivity, hit rate, or the true positive rate (TPR).
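As a worked check (a sketch, not in the original notebook), the class-1 recall values in the two reports above can be recomputed directly from the thresholded predictions:
from sklearn.metrics import recall_score

# At threshold 0, 5 of the 6 true positives are caught (recall 0.83);
# at threshold -0.8, all 6 are caught (recall 1.00)
print("recall at threshold 0   :", recall_score(y_test, y_pred_0))
print("recall at threshold -0.8:", recall_score(y_test, y_pred_m08))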
Created for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2022 LIM Co. All rights reserved.