import matplotlib
from matplotlib import font_manager, rc
import platform
import warnings
import numpy as np
warnings.filterwarnings(action='ignore')


# Korean font setup: choose a font family that can render Hangul in plots.
path = "C:/Windows/Fonts/malgun.ttf"
current_os = platform.system()
if current_os == "Windows":
    # Look up the family name stored in the font file at `path`.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif current_os == "Darwin":
    rc('font', family='AppleGothic')
else:
    # No font is configured on any other platform; only a notice is printed.
    print("Unknown System")

# Turn off the unicode minus glyph for axis tick labels.
matplotlib.rcParams['axes.unicode_minus'] = False


from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# Load the digits dataset and separate features from labels.
digits = load_digits()
X, y = digits.data, digits.target

# Hold out a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Count how many test samples belong to each label and print one
# (label, count) pair per row.
unique, counts = np.unique(y_test, return_counts=True)
label_count_table = np.asarray((unique, counts)).transpose()
print(label_count_table)

[[ 0 37]
 [ 1 43]
 [ 2 44]
 [ 3 45]
 [ 4 38]
 [ 5 48]
 [ 6 52]
 [ 7 48]
 [ 8 48]
 [ 9 47]]


# Fit a logistic regression classifier on the training split.
# NOTE(review): `multi_class` may be deprecated in newer scikit-learn
# releases -- confirm against the installed version.
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)

# Predict the held-out samples, then report accuracy and the confusion matrix.
pred = lr.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
test_confusion = confusion_matrix(y_test, pred)

print("정확도 : {:.3f}".format(test_accuracy))
print("오차 행렬 :\n", test_confusion)

정확도 : 0.953
오차 행렬 :
 [[37  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  2  0  2  0]
 [ 0  0 41  3  0  0  0  0  0  0]
 [ 0  0  1 43  0  0  0  0  0  1]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  1  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  1  0  1  1  0  0 45  0  0]
 [ 0  3  1  0  0  0  0  0 43  1]
 [ 0  0  0  1  0  1  0  0  1 44]]


import mglearn
import matplotlib.pyplot as plt


# Draw the confusion matrix as a heatmap, using the dataset's target names
# as tick labels on both axes.
heatmap_values = confusion_matrix(y_test, pred)
tick_labels = digits.target_names
scores_image = mglearn.tools.heatmap(
    heatmap_values,
    xlabel='예측 레이블',
    ylabel='진짜 레이블',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap=plt.cm.gray_r,
    fmt="%d",
)

plt.title("오차 행렬")
# Flip the y axis of the current axes.
current_axes = plt.gca()
current_axes.invert_yaxis()


from sklearn.metrics import classification_report


# Build the per-class precision / recall / f1 report, then print it.
report_text = classification_report(y_test, pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.89      0.91      0.90        43
           2       0.95      0.93      0.94        44
           3       0.90      0.96      0.92        45
           4       0.97      1.00      0.99        38
           5       0.98      0.98      0.98        48
           6       0.96      1.00      0.98        52
           7       1.00      0.94      0.97        48
           8       0.93      0.90      0.91        48
           9       0.96      0.94      0.95        47

    accuracy                           0.95       450
   macro avg       0.95      0.95      0.95       450
weighted avg       0.95      0.95      0.95       450


## Confusion matrix
from IPython.display import display, Image
# Disabled alternative: show a saved image file instead of drawing the figure.
#display(Image(filename='img/model_validation01.png'))
# Call mglearn's binary confusion matrix plotting helper.
mglearn.plots.plot_binary_confusion_matrix()


from sklearn.metrics import f1_score


# Compute the f1 score with the two averaging modes, then print each
# to three decimals.
micro_f1 = f1_score(y_test, pred, average='micro')
macro_f1 = f1_score(y_test, pred, average='macro')

print("micro 평균 f1점수 : {:.3f}".format(micro_f1))
print("macro 평균 f1점수 : {:.3f}".format(macro_f1))

micro 평균 f1점수 : 0.953
macro 평균 f1점수 : 0.954

다중 분류의 평가지표¶

1.1.1 이진 분류의 평가지표¶

1.1.2 임계값과 평가지표¶

1.1.3 평가지표 - ROC 커브, AUC¶

1.1.4 다중 분류의 평가지표¶

학습 내용¶

목차

01. 데이터 준비 및 라이브러리 임포트

데이터 준비¶

02 모델 구축하기 - 로지스틱 회귀

모델의 정확도는 95.3%로 꽤 좋은 성능을 보인다.¶

각 행은 실제 정답 레이블에 해당하며, 각 열은 예측 레이블에 해당한다.¶

오차 행렬 그래프로 표시¶

03. 정밀도, 재현율, f1-score 확인

다중 클래스용 f1-score는 한 클래스를 양성 클래스로 두고, 나머지 클래스를 음성 클래스로 간주하여 클래스마다 f1-score를 계산한다.¶

다중 분류에서 불균형 데이터셋을 위해 가장 널리 사용하는 평가 지표는 f1-score의 다중 분류 버전이다.¶