### 한글 폰트 설정
import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform
# Choose a font family able to render Hangul in matplotlib, based on the host OS.
path = "C:/Windows/Fonts/malgun.ttf"
system_name = platform.system()
if system_name == "Windows":
    # Look up the family name stored in the font file and register it.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif system_name == "Darwin":
    rc('font', family='AppleGothic')
else:
    # No font is configured on other systems; only a notice is printed.
    print("Unknown System")
# Turn off the Unicode minus glyph for axis tick labels.
matplotlib.rcParams['axes.unicode_minus'] = False
%matplotlib inline
# !pip install mglearn
Collecting mglearn Obtaining dependency information for mglearn from https://files.pythonhosted.org/packages/bb/8b/687d30a3df6b870af541dde6327423e35713e38243db135f57b4ebd054f3/mglearn-0.2.0-py2.py3-none-any.whl.metadata Downloading mglearn-0.2.0-py2.py3-none-any.whl.metadata (628 bytes) Requirement already satisfied: numpy in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.24.3) Requirement already satisfied: matplotlib in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (3.7.2) Requirement already satisfied: scikit-learn in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.3.0) Requirement already satisfied: pandas in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (2.0.3) Requirement already satisfied: pillow in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (9.4.0) Requirement already satisfied: cycler in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (0.11.0) Requirement already satisfied: imageio in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (2.26.0) Requirement already satisfied: joblib in c:\users\colab\anaconda3\lib\site-packages (from mglearn) (1.2.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (1.0.5) Requirement already satisfied: fonttools>=4.22.0 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (23.1) Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\colab\anaconda3\lib\site-packages (from matplotlib->mglearn) (2.8.2) Requirement already satisfied: pytz>=2020.1 in 
c:\users\colab\anaconda3\lib\site-packages (from pandas->mglearn) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\colab\anaconda3\lib\site-packages (from pandas->mglearn) (2023.3) Requirement already satisfied: scipy>=1.5.0 in c:\users\colab\anaconda3\lib\site-packages (from scikit-learn->mglearn) (1.11.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\colab\anaconda3\lib\site-packages (from scikit-learn->mglearn) (2.2.0) Requirement already satisfied: six>=1.5 in c:\users\colab\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->mglearn) (1.16.0) Downloading mglearn-0.2.0-py2.py3-none-any.whl (581 kB) ---------------------------------------- 0.0/581.4 kB ? eta -:--:-- --------------------------------------- 10.2/581.4 kB ? eta -:--:-- -- ------------------------------------ 30.7/581.4 kB 330.3 kB/s eta 0:00:02 -- ------------------------------------ 30.7/581.4 kB 330.3 kB/s eta 0:00:02 ------ -------------------------------- 92.2/581.4 kB 476.3 kB/s eta 0:00:02 ---------------------------------------- 581.4/581.4 kB 2.6 MB/s eta 0:00:00 Installing collected packages: mglearn Successfully installed mglearn-0.2.0
import mglearn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
# Draw mglearn's k-NN illustration plots for several neighbor counts.
# NOTE(review): presumably each call lived in its own notebook cell; confirm
# whether running them back-to-back draws onto the same figure.
# Classification illustration with 3 and then 5 neighbors.
mglearn.plots.plot_knn_classification(n_neighbors=3)
mglearn.plots.plot_knn_classification(n_neighbors=5)
# Regression illustration with 1 and then 3 neighbors.
mglearn.plots.plot_knn_regression(n_neighbors=1)
mglearn.plots.plot_knn_regression(n_neighbors=3)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Generate the forge dataset via mglearn, then split it into train and test
# parts with a fixed random_state.
X, y = mglearn.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# One panel per neighbor count (1, 3, 9), laid out in a single row.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 3))
for n_neighbors, ax in zip((1, 3, 9), axes):
    # fit() returns the estimator itself, so construction and fitting can be
    # chained in one expression. The model is fit on the full X, y here,
    # not on the train split.
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    # Shade the model's decision regions, then overlay the data points.
    mglearn.plots.plot_2d_separator(
        model, X, ax=ax, fill=True, eps=0.5, alpha=.4
    )
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} 이웃".format(n_neighbors))
    ax.set_xlabel("특성 0")
    ax.set_ylabel("특성 1")
# Only the first panel gets a legend.
axes[0].legend(loc=3)
<matplotlib.legend.Legend at 0x25047457a50>
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
# Load the breast cancer dataset, then print its available keys and the
# shape of the data matrix.
cancer = load_breast_cancer()
print(
    "cancer.keys() : \n{}".format(cancer.keys()),
    "유방암 데이터의 행열 : {}".format(cancer.data.shape),
    sep="\n",
)
cancer.keys() : dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']) 유방암 데이터의 행열 : (569, 30)
# Print feature names, data shape, class names and per-class sample counts.
# NOTE(review): "featuer_names" in the first label is a typo for
# "feature_names"; it is kept so the printed output stays unchanged.
print("특성이름(featuer_names) : {}".format(cancer.feature_names))
print("유방암 데이터의 형태 : ", cancer.data.shape)
print()
print("클래스 이름(target_names) : {}".format(cancer.target_names))
# bincount tallies how many samples carry each integer label.
print("클래스별 샘플 개수 : \n", np.bincount(cancer.target))
특성이름(featuer_names) : ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean smoothness' 'mean compactness' 'mean concavity' 'mean concave points' 'mean symmetry' 'mean fractal dimension' 'radius error' 'texture error' 'perimeter error' 'area error' 'smoothness error' 'compactness error' 'concavity error' 'concave points error' 'symmetry error' 'fractal dimension error' 'worst radius' 'worst texture' 'worst perimeter' 'worst area' 'worst smoothness' 'worst compactness' 'worst concavity' 'worst concave points' 'worst symmetry' 'worst fractal dimension'] 유방암 데이터의 형태 : (569, 30) 클래스 이름(target_names) : ['malignant' 'benign'] 클래스별 샘플 개수 : [212 357]
from sklearn.model_selection import train_test_split
# Split the breast cancer data into train/test sets, stratified on the target,
# then count how many samples of each label the full target holds.
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=77
)
print(len(y_train))
y_1_all = np.sum(y == 1)
y_0_all = np.sum(y == 0)
print(f"target y의 \n 0의 개수 : {y_0_all} \n 1의 개수 : {y_1_all}")
426 target y의 0의 개수 : 212 1의 개수 : 357
# For each label, report what share of its samples ended up in the train set
# and in the test set.
y_train_1 = np.sum(y_train == 1)
y_test_1 = np.sum(y_test == 1)
# Every sample that is not labelled 1 is counted as label 0.
y_train_0 = len(y_train) - y_train_1
y_test_0 = len(y_test) - y_test_1
print("데이터 셋의 target(학습:테스트)의 비율 - 1에 대해서")
print(f"train비율 : {y_train_1/y_1_all}, test비율 : {y_test_1/y_1_all}")
print("데이터 셋의 target(학습:테스트)의 비율 - 0에 대해서")
print(f"train비율 : {y_train_0/y_0_all}, test비율 : {y_test_0/y_0_all}")
데이터 셋의 target(학습:테스트)의 비율 - 1에 대해서 train비율 : 0.7478991596638656, test비율 : 0.25210084033613445 데이터 셋의 target(학습:테스트)의 비율 - 0에 대해서 train비율 : 0.75, test비율 : 0.25
(1) 모델 만들기 (2) 모델 학습 시키기(fit) (3) 모델을 이용한 값 예측(predict) (4) 훈련 데이터를 이용한 정확도 확인 (5) 테스트 데이터를 이용한 정확도 확인
from sklearn.neighbors import KNeighborsClassifier
# Build a 3-neighbor classifier, fit it on the train split, and predict
# labels for the test split.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
pred = model.predict(X_test)
pred
c:\Users\colab\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py:110: UserWarning: Could not find the number of physical cores for the following reason: [WinError 2] 지정된 파일을 찾을 수 없습니다 Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use. warnings.warn( File "c:\Users\colab\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores cpu_info = subprocess.run( ^^^^^^^^^^^^^^^ File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 548, in run with Popen(*popenargs, **kwargs) as process: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 1026, in __init__ self._execute_child(args, executable, preexec_fn, close_fds, File "c:\Users\colab\anaconda3\Lib\subprocess.py", line 1538, in _execute_child hp, ht, pid, tid = _winapi.CreateProcess(executable, args, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
array([1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1])
# Accuracy computed by hand: the fraction of predictions equal to the true label.
np.mean(pred == y_test)
0.9090909090909091
# Accuracy from the estimator's own score(): train split first, then test split.
acc_tr, acc_test = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)
acc_tr, acc_test
(0.9553990610328639, 0.9090909090909091)
# Report k together with the train/test accuracy rounded to two decimals.
print(
    "k : {}".format(3),
    "훈련 데이터셋 정확도 : {:.2f}".format(acc_tr),
    "테스트 데이터 셋 정확도 : {:.2f}".format(acc_test),
    sep="\n",
)
k : 3 훈련 데이터셋 정확도 : 0.96 테스트 데이터 셋 정확도 : 0.91
# Sweep odd neighbor counts and record train/test accuracy for each one.
k_list, tr_acc, test_acc = [], [], []
k_nums = range(1, 22, 2)  # 1, 3, 5, ..., 21
for n in k_nums:
    # Build and fit a fresh classifier for this k.
    model = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

    # Score it on both splits.
    acc_tr = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)

    # Keep the results for the table and plots built afterwards.
    k_list.append(n)
    tr_acc.append(acc_tr)
    test_acc.append(acc_test)

    print("k : ", n)
    print("학습용셋 정확도 {:.3f}".format(acc_tr))
    print("테스트용셋 정확도 {:.3f}".format(acc_test))
k : 1 학습용셋 정확도 1.000 테스트용셋 정확도 0.888 k : 3 학습용셋 정확도 0.955 테스트용셋 정확도 0.909 k : 5 학습용셋 정확도 0.953 테스트용셋 정확도 0.916 k : 7 학습용셋 정확도 0.953 테스트용셋 정확도 0.909 k : 9 학습용셋 정확도 0.946 테스트용셋 정확도 0.909 k : 11 학습용셋 정확도 0.939 테스트용셋 정확도 0.909 k : 13 학습용셋 정확도 0.937 테스트용셋 정확도 0.916 k : 15 학습용셋 정확도 0.939 테스트용셋 정확도 0.916 k : 17 학습용셋 정확도 0.934 테스트용셋 정확도 0.923 k : 19 학습용셋 정확도 0.937 테스트용셋 정확도 0.923 k : 21 학습용셋 정확도 0.934 테스트용셋 정확도 0.923
# Import seaborn and print the installed version.
import seaborn as sns
print(sns.__version__)
0.12.2
# Collect the recorded accuracies into a table with one row per k value.
dat = dict(tr_acc=tr_acc, test_acc=test_acc)
data_df = pd.DataFrame(dat, index=k_list)
data_df
tr_acc | test_acc | |
---|---|---|
1 | 1.000000 | 0.888112 |
3 | 0.955399 | 0.909091 |
5 | 0.953052 | 0.916084 |
7 | 0.953052 | 0.909091 |
9 | 0.946009 | 0.909091 |
11 | 0.938967 | 0.909091 |
13 | 0.936620 | 0.916084 |
15 | 0.938967 | 0.916084 |
17 | 0.934272 | 0.923077 |
19 | 0.936620 | 0.923077 |
21 | 0.934272 | 0.923077 |
# Line plot of the accuracy table using the "tab10" palette.
# NOTE(review): presumably seaborn draws one line per column against the
# DataFrame index (k) for wide-form input; confirm against the seaborn docs.
sns.lineplot(data=data_df, palette="tab10")
plt.show()
import matplotlib.pyplot as plt
# Plot train and test accuracy against k. Each curve carries a label and a
# legend is drawn, so the two lines can be told apart.
plt.plot(k_nums, tr_acc, '-', label="train accuracy")
plt.plot(k_nums, test_acc, '-', label="test accuracy")
plt.xticks(k_nums)  # one tick per evaluated k
plt.legend()
# Title and axis labels; ylabel stays last so the cell's displayed value
# is unchanged.
plt.title("knn - model accuracy")
plt.xlabel("knn - k values")
plt.ylabel("knn - accuracy")
Text(0, 0.5, 'knn - accuracy')