import mglearn
import warnings

# Silence every warning category for the rest of the session so that
# library notices do not clutter the output.
warnings.filterwarnings(action='ignore')


# Draw the figure produced by mglearn's scaling plot helper.
mglearn.plots.plot_scaling()


### Korean font setup
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform
import matplotlib

# Choose a font family that contains Hangul glyphs, so the Korean titles,
# labels and legends used by the plots in this file render as text.
path = "C:/Windows/Fonts/malgun.ttf"
system_name = platform.system()  # query the platform once

font_name = None
if system_name == "Windows":
    # Read the family name out of the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
elif system_name == "Darwin":
    font_name = 'AppleGothic'

if font_name is None:
    # No known Hangul font for this platform; matplotlib defaults stay as-is.
    print("Unknown System")
else:
    rc('font', family=font_name)
    # After switching fonts, draw negative tick labels with the ASCII hyphen
    # instead of the Unicode minus sign (U+2212). Hangul fonts commonly lack
    # that glyph, which would show up as an empty box on axes with negative
    # values.
    rc('axes', unicode_minus=False)


import mglearn


# Draw mglearn's scaling figure a second time; unlike the first call near the
# top of the file, this one runs after the font configuration above.
mglearn.plots.plot_scaling()


from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and print a short summary of it:
# the available keys, the shape of the feature matrix, and the feature names.
cancer = load_breast_cancer()
dataset_keys = cancer.keys()
data_shape = cancer.data.shape
print("[cancer.keys()]  \n{}".format(dataset_keys))
print("유방암 데이터의 형태 : {}".format(data_shape))
print("유방암 데이터의 피처명 : ", cancer.feature_names)

[cancer.keys()]  
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
유방암 데이터의 형태 : (569, 30)
유방암 데이터의 피처명 :  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


# Split the dataset into training and test parts, stratifying on the target
# and fixing the random seed so the split is repeatable.
split_options = dict(stratify=cancer.target, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, **split_options
)

# Print the shapes of the feature splits first, then of the target splits.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(426, 30) (143, 30)
(426,) (143,)


from sklearn.preprocessing import MinMaxScaler
# Create the min-max scaler and fit it on the training features only;
# fit() returns the scaler itself, so the two steps can be chained.
scaler = MinMaxScaler().fit(X_train)

MinMaxScaler()


import numpy as np
# Print every float inside a NumPy array in fixed-point notation with
# exactly three digits after the decimal point.
three_decimals = "{0:0.3f}".format
np.set_printoptions(formatter={'float_kind': three_decimals})


# Apply the already-fitted scaler to the training features.
X_train_s = scaler.transform(X_train)

# Column-wise (axis=0) minimum and maximum before and after scaling.
raw_min, raw_max = X_train.min(axis=0), X_train.max(axis=0)
scaled_min, scaled_max = X_train_s.min(axis=0), X_train_s.max(axis=0)

print("변환전 크기 : {}".format(X_train.shape))
print("변환전 값의 최소, 최대 : \n {}, {}".format(raw_min, raw_max))
print()
print("변환 후 크기 : {}".format(X_train_s.shape))
print("변환후 값의 최소, 최대 : \n {}, {}".format(scaled_min, scaled_max))

변환전 크기 : (426, 30)
변환전 값의 최소, 최대 : 
 [7.691 9.710 47.920 170.400 0.053 0.019 0.000 0.000 0.106 0.050 0.112
 0.360 0.757 6.802 0.003 0.002 0.000 0.000 0.010 0.001 8.678 12.020 54.490
 223.600 0.081 0.034 0.000 0.000 0.157 0.055], [28.110 39.280 188.500 2501.000 0.142 0.345 0.375 0.191 0.304 0.097 2.873
 3.896 21.980 542.200 0.031 0.135 0.304 0.041 0.079 0.022 36.040 49.540
 251.200 4254.000 0.223 0.938 1.252 0.290 0.664 0.173]

변환 후 크기 : (426, 30)
변환후 값의 최소, 최대 : 
 [0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000], [1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
 1.000 1.000 1.000 1.000 1.000 1.000]


import seaborn as sns
import matplotlib.pyplot as plt


# 2x2 grid of box plots. Top row: the full feature matrix; bottom row: only
# the first column. Left column: original values; right column: values after
# the MinMaxScaler transform.
#
# The arrays are passed by keyword on purpose: seaborn binds a lone positional
# argument to `x` in 0.11 but to `data` from 0.12 on, so a positional call
# draws different plots depending on the installed version. `data=` requests
# one box per column of the 2-D array; `x=` requests a single box for the
# 1-D column.
plt.figure(figsize=(10,6))
plt.subplot(2,2,1)
sns.boxplot(data=X_train)

plt.subplot(2,2,2)
sns.boxplot(data=X_train_s)

# Inspect the first column (mean radius) on its own.
plt.subplot(2,2,3)
sns.boxplot(x=X_train[:, 0])

plt.subplot(2,2,4)
sns.boxplot(x=X_train_s[:, 0])

<AxesSubplot:>


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Two ways of writing the same thing: fit() followed by transform(),
# and the fit_transform() shortcut.
scaler.fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_train_scaler_d = scaler.fit_transform(X_train)

# Print the overall max and min of the original data and of both results,
# to show that the two spellings give the same values.
reports = (
    ("변경전 :", X_train),
    ("변경후 :", X_train_scaler),
    ("변경후 :", X_train_scaler_d),
)
for caption, values in reports:
    print(caption, values.max(), values.min())

변경전 : 4254.0 0.0
변경후 : 10.57748988678996 -3.131829324965022
변경후 : 10.57748988678996 -3.131829324965022


# Three side-by-side box plots: the original features, then the two
# StandardScaler results (fit().transform() and fit_transform()).
# The arrays are passed as `data=` so that each column gets its own box on
# every seaborn version; a lone positional argument is bound to `x` in
# seaborn 0.11 but to `data` from 0.12 on.
fig, axes = plt.subplots(1, 3, figsize=(10,6))
for ax, features in zip(axes, (X_train, X_train_scaler, X_train_scaler_d)):
    sns.boxplot(data=features, ax=ax)

<AxesSubplot:>


from sklearn.datasets import make_blobs

# Generate a synthetic dataset.
X, _ = make_blobs(n_samples=50, centers=5, 
                  random_state=4, cluster_std=2)

# Split it into a training set and a test set.
X_train, X_test = train_test_split(X, random_state=5, 
                                   test_size=.1)

# One fixed colour per split. These are single colour values, so they are
# passed to scatter() through `color=` rather than `c=`: matplotlib warns that
# a lone RGB/RGBA sequence given as `c` may be mistaken for per-point values
# to be colour-mapped.
train_color = mglearn.cm2(0)
test_color = mglearn.cm2(1)

# Panel 0: scatter plot of the original training and test sets.
fig, axes = plt.subplots(1, 3, figsize=(13, 4))
axes[0].scatter(X_train[:, 0], X_train[:, 1],
                color=train_color, label="학습용 세트", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^',
                color=test_color, label="테스트 세트", s=60)
axes[0].legend(loc='upper left')
axes[0].set_title("원본 데이터")

# Scale with MinMaxScaler: fit on the training set only, then apply the same
# fitted transform to both sets.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Panel 1: scatter plot of the correctly scaled data.
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                color=train_color, label="학습용 세트", s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                color=test_color, label="테스트 세트", s=60)
axes[1].set_title("스케일 조정된 데이터")

# Fit a separate scaler on the test set, so the test set is rescaled on its
# own and its minimum becomes 0 and its maximum 1.
# This is for illustration only -- never scale a test set this way.
test_scaler = MinMaxScaler()
test_scaler.fit(X_test)
X_test_scaled_badly = test_scaler.transform(X_test)

# Panel 2: scatter plot of the badly scaled data.
axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                color=train_color, label="training set", s=60)
axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1],
                marker='^', color=test_color, label="test set", s=60)
axes[2].set_title("잘못 조정된 데이터")

# Same axis labels on every panel.
for ax in axes:
    ax.set_xlabel("특성 0")
    ax.set_ylabel("특성 1")
fig.tight_layout()

*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.

CH03 비지도 학습과 데이터 전처리

학습 내용

목차

01. 비지도 학습이란?

02. 비지도 학습의 종류

비지도 변환(unsupervised transformation)

사용 분야

현재 어려운 부분

소셜 미디어에서 선거, 총기, 팝스타 같은 주제로 일어나는 토론을 추적하거나, 텍스트 문서에서 주제를 추출

군집 알고리즘(Clustering)

03. 데이터 전처리와 스케일 조정

(가) StandardScaler - 표준화

(나) RobustScaler

(다) MinMaxScaler - 정규화

(라) Normalizer

04. 데이터 변환 실습 (cancer 데이터)

MinMaxScaler(정규화)를 이용

실제로 학습 데이터의 스케일을 조정하려면, 스케일러 객체의 transform 메서드를 사용한다.

변환 전후 시각화

메서드 단축해서 사용

표준화 수행

시각화한 결과, 2번째와 3번째 결과는 같다.

주의해야 할 것