### 한글 폰트 설정
import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform

# Configure a matplotlib font family per operating system so that the Korean
# labels used in the plots below can be rendered.
path = "C:/Windows/Fonts/malgun.ttf"
system_name = platform.system()

if system_name == "Darwin":
    # Select the AppleGothic family by name.
    rc('font', family='AppleGothic')
elif system_name == "Windows":
    # Look up the family name stored in the font file at `path`, then use it.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other platform: no font is configured, only a notice is printed.
    print("Unknown System")

# Use the ASCII hyphen for negative tick labels instead of the Unicode minus
# sign (presumably because the fonts chosen above lack that glyph).
matplotlib.rcParams['axes.unicode_minus'] = False

%matplotlib inline


# !pip install mglearn

Collecting mglearn
  Obtaining dependency information for mglearn from https://files.pythonhosted.org/packages/bb/8b/687d30a3df6b870af541dde6327423e35713e38243db135f57b4ebd054f3/mglearn-0.2.0-py2.py3-none-any.whl.metadata
  Downloading mglearn-0.2.0-py2.py3-none-any.whl.metadata (628 bytes)
Requirement already satisfied: numpy in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.24.3)
Requirement already satisfied: matplotlib in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (3.7.2)
Requirement already satisfied: scikit-learn in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.3.0)
Requirement already satisfied: pandas in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (2.0.3)
Requirement already satisfied: pillow in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (9.4.0)
Requirement already satisfied: cycler in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (0.11.0)
Requirement already satisfied: imageio in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (2.26.0)
Requirement already satisfied: joblib in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.2.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (1.0.5)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (23.1)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\colab\anaconda3\lib\site-packages (from pandas->mglearn) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\colab\anaconda3\lib\site-packages (from pandas->mglearn) (2023.3)
Requirement already satisfied: scipy>=1.5.0 in c:\users\colab\anaconda3\lib\site-packages (from scikit-learn->mglearn) (1.11.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\colab\anaconda3\lib\site-packages (from scikit-learn->mglearn) (2.2.0)
Requirement already satisfied: six>=1.5 in c:\users\colab\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->mglearn) (1.16.0)
Downloading mglearn-0.2.0-py2.py3-none-any.whl (581 kB)
   ---------------------------------------- 0.0/581.4 kB ? eta -:--:--
    --------------------------------------- 10.2/581.4 kB ? eta -:--:--
   -- ------------------------------------ 30.7/581.4 kB 330.3 kB/s eta 0:00:02
   -- ------------------------------------ 30.7/581.4 kB 330.3 kB/s eta 0:00:02
   ------ -------------------------------- 92.2/581.4 kB 476.3 kB/s eta 0:00:02
   ---------------------------------------- 581.4/581.4 kB 2.6 MB/s eta 0:00:00
Installing collected packages: mglearn
Successfully installed mglearn-0.2.0


import mglearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%matplotlib inline


# mglearn's illustration of k-NN classification using 3 neighbors.
mglearn.plots.plot_knn_classification(n_neighbors=3)


# Same classification illustration, now with 5 neighbors.
mglearn.plots.plot_knn_classification(n_neighbors=5)


# mglearn's illustration of k-NN regression using a single neighbor.
mglearn.plots.plot_knn_regression(n_neighbors=1)


# Same regression illustration, now with 3 neighbors.
mglearn.plots.plot_knn_regression(n_neighbors=3)


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


# Load the "forge" toy dataset from mglearn as features X and labels y.
X, y = mglearn.datasets.make_forge()

# Split into train/test parts; random_state=0 makes the split repeatable.
forge_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = forge_parts


# One panel per neighbor count, to compare the decision boundary for k = 1, 3, 9.
fig, axes = plt.subplots(1, 3, figsize=(10, 3))

for ax, n_neighbors in zip(axes, [1, 3, 9]):
    # Create the classifier, then train it on the full forge data (X, y).
    # fit() returns the estimator itself, so the two steps are equivalent
    # to the chained form KNeighborsClassifier(...).fit(X, y).
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X, y)

    # Shade the regions predicted by the model, then overlay the samples.
    mglearn.plots.plot_2d_separator(
        model, X, fill=True, eps=0.5, ax=ax, alpha=.4
    )
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)

    ax.set_xlabel("특성 0")
    ax.set_ylabel("특성 1")
    ax.set_title("{} 이웃".format(n_neighbors))

# Legend only on the first panel; loc=3 is matplotlib's code for 'lower left'.
axes[0].legend(loc=3)

<matplotlib.legend.Legend at 0x25047457a50>


from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt


# Load scikit-learn's breast-cancer dataset and show which fields it exposes
# together with the shape of the feature matrix.
cancer = load_breast_cancer()
cancer_keys = cancer.keys()
cancer_shape = cancer.data.shape

print("cancer.keys() : \n{}".format(cancer_keys))
print("유방암 데이터의 행열 : {}".format(cancer_shape))

cancer.keys() : 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
유방암 데이터의 행열 : (569, 30)


# Summarize the breast-cancer dataset: feature names, data shape, class names
# and the number of samples per class.
# The first label now spells the key correctly as `feature_names`.
print("특성이름(feature_names) : {}".format(cancer['feature_names']))
print("유방암 데이터의 형태 : ", cancer.data.shape)
print()
print("클래스 이름(target_names) : {}".format(cancer['target_names']))
# np.bincount counts how often each non-negative integer label occurs, so
# position i of the result is the number of samples with target == i.
print("클래스별 샘플 개수 : \n",np.bincount(cancer.target))

특성이름(featuer_names) : ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
유방암 데이터의 형태 :  (569, 30)

클래스 이름(target_names) : ['malignant' 'benign']
클래스별 샘플 개수 : 
 [212 357]


from sklearn.model_selection import train_test_split


# Feature matrix and class labels of the breast-cancer data.
X, y = cancer.data, cancer.target

# Stratify on the labels so train and test keep similar class proportions;
# random_state=77 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=77
)


# Size of the training split.
print(len(y_train))

# Number of samples per class over the whole (unsplit) label vector.
y_0_all = np.sum(y == 0)
y_1_all = np.sum(y == 1)

print(f"target y의 \n 0의 개수 : {y_0_all} \n 1의 개수 : {y_1_all}")

426
target y의 
 0의 개수 : 212 
 1의 개수 : 357


# Per-split counts of class 1; class 0 is whatever remains in each split.
y_train_1 = (y_train == 1).sum()
y_train_0 = len(y_train) - y_train_1

y_test_1 = (y_test == 1).sum()
y_test_0 = len(y_test) - y_test_1

# For each class, the fraction of its samples that landed in train vs. test.
print("데이터 셋의 target(학습:테스트)의 비율 - 1에 대해서")
print(f"train비율 : {y_train_1/y_1_all}, test비율 : {y_test_1/y_1_all}")

print("데이터 셋의 target(학습:테스트)의 비율 - 0에 대해서")
print(f"train비율 : {y_train_0/y_0_all}, test비율 : {y_test_0/y_0_all}")

데이터 셋의 target(학습:테스트)의 비율 - 1에 대해서
train비율 : 0.7478991596638656, test비율 : 0.25210084033613445
데이터 셋의 target(학습:테스트)의 비율 - 0에 대해서
train비율 : 0.75, test비율 : 0.25


from sklearn.neighbors import KNeighborsClassifier


# Train a 3-nearest-neighbors classifier on the training split (fit returns
# the estimator itself, so construction and training can be chained), then
# predict the class of every test sample.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
pred = model.predict(X_test)
pred

c:\Users\colab\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py:110: UserWarning: Could not find the number of physical cores for the following reason:
[WinError 2] 지정된 파일을 찾을 수 없습니다
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
  warnings.warn(
  File "c:\Users\colab\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1])


# Accuracy computed by hand: correctly predicted samples / all predictions.
n_correct = (pred == y_test).sum()
n_correct / len(pred)

0.9090909090909091


# Accuracy via the estimator's score(): first on the training split, then on
# the held-out test split.
acc_tr, acc_test = model.score(X_train, y_train), model.score(X_test, y_test)
acc_tr, acc_test

(0.9553990610328639, 0.9090909090909091)


# Report the accuracies computed above, rounded to two decimals.
# k is read from the fitted model instead of being hard-coded, so the printed
# value cannot drift from the estimator that produced acc_tr / acc_test.
print("k : {}".format(model.n_neighbors))
print("훈련 데이터셋 정확도 : {:.2f}".format(acc_tr))
print("테스트 데이터 셋 정확도 : {:.2f}".format(acc_test))

k : 3
훈련 데이터셋 정확도 : 0.96
테스트 데이터 셋 정확도 : 0.91


# Sweep odd neighbor counts 1, 3, ..., 21 and record train/test accuracy for
# each, so the effect of k can be tabulated and plotted afterwards.
k_nums = range(1, 22, 2)
k_list, tr_acc, test_acc = [], [], []

for n in k_nums:
    # Build and train the classifier for this k.
    model = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

    # Accuracy on the training split and on the held-out test split.
    acc_tr = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)

    print("k : ", n)
    print("학습용셋 정확도 {:.3f}".format(acc_tr) )
    print("테스트용셋 정확도 {:.3f}".format(acc_test) )

    # Keep the results for the summary table and plots.
    k_list.append(n)
    tr_acc.append(acc_tr)
    test_acc.append(acc_test)

k :  1
학습용셋 정확도 1.000
테스트용셋 정확도 0.888
k :  3
학습용셋 정확도 0.955
테스트용셋 정확도 0.909
k :  5
학습용셋 정확도 0.953
테스트용셋 정확도 0.916
k :  7
학습용셋 정확도 0.953
테스트용셋 정확도 0.909
k :  9
학습용셋 정확도 0.946
테스트용셋 정확도 0.909
k :  11
학습용셋 정확도 0.939
테스트용셋 정확도 0.909
k :  13
학습용셋 정확도 0.937
테스트용셋 정확도 0.916
k :  15
학습용셋 정확도 0.939
테스트용셋 정확도 0.916
k :  17
학습용셋 정확도 0.934
테스트용셋 정확도 0.923
k :  19
학습용셋 정확도 0.937
테스트용셋 정확도 0.923
k :  21
학습용셋 정확도 0.934
테스트용셋 정확도 0.923


import seaborn as sns
# Show which seaborn version is installed in this environment.
print(sns.__version__)

0.12.2


# Collect the per-k accuracies from the sweep into one table: a column per
# split, one row per k (k is the index).
dat = dict(tr_acc=tr_acc, test_acc=test_acc)
data_df = pd.DataFrame(dat, index=k_list)
data_df


# Plot the accuracy table with seaborn using the "tab10" palette; data_df is
# passed whole (wide form), so its columns are drawn against the k index.
sns.lineplot(data=data_df, palette="tab10")
plt.show()


import matplotlib.pyplot as plt

# Plot train and test accuracy against k with plain matplotlib.
# Each curve carries a label (same names as the data_df columns) and a legend
# is drawn, so the two lines can be told apart.
plt.plot(k_nums, tr_acc, '-', label="tr_acc")
plt.plot(k_nums, test_acc, '-', label="test_acc")
plt.xticks(k_nums)                # one tick per evaluated k
plt.legend()

# Title and axis labels.
plt.title("knn - model accuracy")
plt.xlabel("knn - k values")
plt.ylabel("knn - accuracy")

Text(0, 0.5, 'knn - accuracy')

	tr_acc	test_acc
1	1.000000	0.888112
3	0.955399	0.909091
5	0.953052	0.916084
7	0.953052	0.909091
9	0.946009	0.909091
11	0.938967	0.909091
13	0.936620	0.916084
15	0.938967	0.916084
17	0.934272	0.923077
19	0.936620	0.923077
21	0.934272	0.923077

ch2 지도학습 - knn

학습 내용

01 지도학습의 종류

분류(Classification)

회귀(Regression)

02 knn 알고리즘 시각화

knn의 k가 3인 경우의 알고리즘(분류 - 범주형 값의 예측)

knn의 k가 1인 경우의 알고리즘(회귀 - 수치형 값의 예측)

하이퍼 파라미터 k에 따른 결정경계

데이터 준비 및 나누기

일반화, 과대적합, 과소적합

03 유방암 데이터 셋 실습

feature 이름, class 이름

데이터 셋 나누기

(실습)

04 머신러닝 모델 만들고 예측하기

작업 단계

score를 이용한 결과 확인

05 k의 값에 따른 정확도 확인해 보기

데이터 시각화

matplotlib 이용 시각화

직접 해보기

실습해 보기

더 해보기