import mglearn


mglearn.plots.plot_pca_illustration()


from sklearn.datasets import fetch_lfw_people
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split


people = fetch_lfw_people(min_faces_per_person=20, 
                          resize=0.7) # 컬러 이미지는 color=True로 옵션을 준다.

print(people.keys())
print("이미지 행렬 ", people.images.shape)
print("사람 " , people.target_names)      # 총 62명
image_shape = people.images[0].shape      # 3023장의 이미지

print(image_shape)

dict_keys(['data', 'images', 'target', 'target_names', 'DESCR'])
이미지 행렬  (3023, 87, 65)
사람  ['Alejandro Toledo' 'Alvaro Uribe' 'Amelie Mauresmo' 'Andre Agassi'
 'Angelina Jolie' 'Ariel Sharon' 'Arnold Schwarzenegger'
 'Atal Bihari Vajpayee' 'Bill Clinton' 'Carlos Menem' 'Colin Powell'
 'David Beckham' 'Donald Rumsfeld' 'George Robertson' 'George W Bush'
 'Gerhard Schroeder' 'Gloria Macapagal Arroyo' 'Gray Davis'
 'Guillermo Coria' 'Hamid Karzai' 'Hans Blix' 'Hugo Chavez' 'Igor Ivanov'
 'Jack Straw' 'Jacques Chirac' 'Jean Chretien' 'Jennifer Aniston'
 'Jennifer Capriati' 'Jennifer Lopez' 'Jeremy Greenstock' 'Jiang Zemin'
 'John Ashcroft' 'John Negroponte' 'Jose Maria Aznar'
 'Juan Carlos Ferrero' 'Junichiro Koizumi' 'Kofi Annan' 'Laura Bush'
 'Lindsay Davenport' 'Lleyton Hewitt' 'Luiz Inacio Lula da Silva'
 'Mahmoud Abbas' 'Megawati Sukarnoputri' 'Michael Bloomberg' 'Naomi Watts'
 'Nestor Kirchner' 'Paul Bremer' 'Pete Sampras' 'Recep Tayyip Erdogan'
 'Ricardo Lagos' 'Roh Moo-hyun' 'Rudolph Giuliani' 'Saddam Hussein'
 'Serena Williams' 'Silvio Berlusconi' 'Tiger Woods' 'Tom Daschle'
 'Tom Ridge' 'Tony Blair' 'Vicente Fox' 'Vladimir Putin' 'Winona Ryder']
(87, 65)


fig, axes = plt.subplots(2, 5, figsize=(15,8),
                        subplot_kw= {'xticks':(), 'yticks':() })

for target, image, ax in zip(people.target, people.images, axes.ravel() ):
    ax.imshow(image) # 이미지 표시 
    ax.set_title(people.target_names[target]) # 이미지별 이름


print("이미지 크기:", people.images.shape)
print("클래스 개수:", len(people.target_names))

이미지 크기: (3023, 87, 65)
클래스 개수: 62


# 타깃 값의 각각의 데이터 개수
counts = np.bincount(people.target)
print( counts )
print("최소 : {}, 최대 : {}".format(np.min(counts), np.max(counts)))

[ 39  35  21  36  20  77  42  24  29  21 236  31 121  22 530 109  44  26
  30  22  39  71  20  28  52  55  21  42  21  24  20  53  31  23  28  60
  32  41  22  41  48  29  33  20  22  37  20  22  30  27  32  26  23  52
  33  23  25  33 144  32  49  24]
최소 : 20, 최대 : 530


people.target_names

array(['Alejandro Toledo', 'Alvaro Uribe', 'Amelie Mauresmo',
       'Andre Agassi', 'Angelina Jolie', 'Ariel Sharon',
       'Arnold Schwarzenegger', 'Atal Bihari Vajpayee', 'Bill Clinton',
       'Carlos Menem', 'Colin Powell', 'David Beckham', 'Donald Rumsfeld',
       'George Robertson', 'George W Bush', 'Gerhard Schroeder',
       'Gloria Macapagal Arroyo', 'Gray Davis', 'Guillermo Coria',
       'Hamid Karzai', 'Hans Blix', 'Hugo Chavez', 'Igor Ivanov',
       'Jack Straw', 'Jacques Chirac', 'Jean Chretien',
       'Jennifer Aniston', 'Jennifer Capriati', 'Jennifer Lopez',
       'Jeremy Greenstock', 'Jiang Zemin', 'John Ashcroft',
       'John Negroponte', 'Jose Maria Aznar', 'Juan Carlos Ferrero',
       'Junichiro Koizumi', 'Kofi Annan', 'Laura Bush',
       'Lindsay Davenport', 'Lleyton Hewitt', 'Luiz Inacio Lula da Silva',
       'Mahmoud Abbas', 'Megawati Sukarnoputri', 'Michael Bloomberg',
       'Naomi Watts', 'Nestor Kirchner', 'Paul Bremer', 'Pete Sampras',
       'Recep Tayyip Erdogan', 'Ricardo Lagos', 'Roh Moo-hyun',
       'Rudolph Giuliani', 'Saddam Hussein', 'Serena Williams',
       'Silvio Berlusconi', 'Tiger Woods', 'Tom Daschle', 'Tom Ridge',
       'Tony Blair', 'Vicente Fox', 'Vladimir Putin', 'Winona Ryder'],
      dtype='<U25')


### 타깃(사람)의 이름과 각 사람별 데이터 개수 출력.
for i, (count, name) in enumerate( zip(counts, people.target_names) ):
    print("{0:25} {1:3}".format(name, count), end= '   ')
    if (i + 1) % 3 == 0:
        print()

Alejandro Toledo           39   Alvaro Uribe               35   Amelie Mauresmo            21   
Andre Agassi               36   Angelina Jolie             20   Ariel Sharon               77   
Arnold Schwarzenegger      42   Atal Bihari Vajpayee       24   Bill Clinton               29   
Carlos Menem               21   Colin Powell              236   David Beckham              31   
Donald Rumsfeld           121   George Robertson           22   George W Bush             530   
Gerhard Schroeder         109   Gloria Macapagal Arroyo    44   Gray Davis                 26   
Guillermo Coria            30   Hamid Karzai               22   Hans Blix                  39   
Hugo Chavez                71   Igor Ivanov                20   Jack Straw                 28   
Jacques Chirac             52   Jean Chretien              55   Jennifer Aniston           21   
Jennifer Capriati          42   Jennifer Lopez             21   Jeremy Greenstock          24   
Jiang Zemin                20   John Ashcroft              53   John Negroponte            31   
Jose Maria Aznar           23   Juan Carlos Ferrero        28   Junichiro Koizumi          60   
Kofi Annan                 32   Laura Bush                 41   Lindsay Davenport          22   
Lleyton Hewitt             41   Luiz Inacio Lula da Silva  48   Mahmoud Abbas              29   
Megawati Sukarnoputri      33   Michael Bloomberg          20   Naomi Watts                22   
Nestor Kirchner            37   Paul Bremer                20   Pete Sampras               22   
Recep Tayyip Erdogan       30   Ricardo Lagos              27   Roh Moo-hyun               32   
Rudolph Giuliani           26   Saddam Hussein             23   Serena Williams            52   
Silvio Berlusconi          33   Tiger Woods                23   Tom Daschle                25   
Tom Ridge                  33   Tony Blair                144   Vicente Fox                32   
Vladimir Putin             49   Winona Ryder               24


### 타깃(사람)의 이름과 각 사람별 데이터 개수 출력.(2사람, 1줄에)
for i, (count, name) in enumerate(zip(counts, people.target_names)):
    print("{0:5}, {1:25}".format(count, name), end="   ")
    if (i + 1) % 2 == 0:
        print()

   39, Alejandro Toledo               35, Alvaro Uribe                
   21, Amelie Mauresmo                36, Andre Agassi                
   20, Angelina Jolie                 77, Ariel Sharon                
   42, Arnold Schwarzenegger          24, Atal Bihari Vajpayee        
   29, Bill Clinton                   21, Carlos Menem                
  236, Colin Powell                   31, David Beckham               
  121, Donald Rumsfeld                22, George Robertson            
  530, George W Bush                 109, Gerhard Schroeder           
   44, Gloria Macapagal Arroyo        26, Gray Davis                  
   30, Guillermo Coria                22, Hamid Karzai                
   39, Hans Blix                      71, Hugo Chavez                 
   20, Igor Ivanov                    28, Jack Straw                  
   52, Jacques Chirac                 55, Jean Chretien               
   21, Jennifer Aniston               42, Jennifer Capriati           
   21, Jennifer Lopez                 24, Jeremy Greenstock           
   20, Jiang Zemin                    53, John Ashcroft               
   31, John Negroponte                23, Jose Maria Aznar            
   28, Juan Carlos Ferrero            60, Junichiro Koizumi           
   32, Kofi Annan                     41, Laura Bush                  
   22, Lindsay Davenport              41, Lleyton Hewitt              
   48, Luiz Inacio Lula da Silva      29, Mahmoud Abbas               
   33, Megawati Sukarnoputri          20, Michael Bloomberg           
   22, Naomi Watts                    37, Nestor Kirchner             
   20, Paul Bremer                    22, Pete Sampras                
   30, Recep Tayyip Erdogan           27, Ricardo Lagos               
   32, Roh Moo-hyun                   26, Rudolph Giuliani            
   23, Saddam Hussein                 52, Serena Williams             
   33, Silvio Berlusconi              23, Tiger Woods                 
   25, Tom Daschle                    33, Tom Ridge                   
  144, Tony Blair                     32, Vicente Fox                 
   49, Vladimir Putin                 24, Winona Ryder


people.target.shape

(3023,)


nums = people.target.shape  # 이미지의 수
mask = np.zeros(nums, dtype=np.bool)
mask

<ipython-input-14-01ce2b9256eb>:2: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask = np.zeros(nums, dtype=np.bool)

array([False, False, False, ..., False, False, False])


# 중복값을 제외한 값을 표시
np.unique(people.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61], dtype=int64)


# 각 사람별 50명씩 선택
nums = np.unique(people.target)

for target in nums:
    mask[np.where(people.target==target)[0][:50]] = 1 # 각 사람별 50장 선택

# 전체 데이터의 1로 되어 있는 것을 가져오기
X_people = people.data[mask]
y_people = people.target[mask]

# 0 ~ 255 사이의 흑백 이미지의 픽셀 값을 0~1 스케일로 조정.
# (옮긴이) MinMaxScaler를 적용하는 것과 거의 같습니다.
X_people = X_people / 255.
print(X_people.shape)

(2063, 5655)


from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(
        X_people, y_people, stratify=y_people, random_state=0)

# 이웃 개수를 한 개로 하여 KNeightborsClassifier 모델을 만든다.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print("1-최근접 이웃의 테스트 세트 점수:{:.2f}".format(knn.score(X_test, y_test)))

1-최근접 이웃의 테스트 세트 점수:0.23


from sklearn.decomposition import PCA


# 화이트닝 옵션은 PCA변환을 할 때 이 표준편차를 나누어 적용. 
# PCA 변환은 데이터의 평균을 0으로 만들어주므로 
# 화이트닝을 적용하는 것은 PCA변환한 뒤에 StandardScaler를 적용하는 것과 같다.
pca = PCA(n_components=100, whiten=True, random_state=0).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("X_train_pca.shape:", X_train_pca.shape)
print("X_test_pca.shape:", X_test_pca.shape)

X_train_pca.shape: (1547, 100)
X_test_pca.shape: (516, 100)


knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_pca, y_train)

print("테스트 정확도 : {:.2f}".format(knn.score(X_test_pca, y_test)))

테스트 정확도 : 0.31


# 한글
import matplotlib
from matplotlib import font_manager, rc
import platform

path = "C:/Windows/Fonts/malgun.ttf"
if platform.system() == "Windows":
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system()=="Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")


print("pca.components_.shape ", pca.components_.shape)

fig, axes = plt.subplots(3, 5, figsize=(15,12),
                        subplot_kw={'xticks':(), 'yticks':() })

for i, (component, ax) in enumerate(zip(pca.components_, axes.ravel())):
    ax.imshow(component.reshape(image_shape), cmap="viridis")
    ax.set_title("주성분 {}".format((i+1)))

pca.components_.shape  (100, 5655)


mglearn.plots.plot_pca_faces(X_train, X_test, image_shape)

WARNING:root:[MemorizedFunc(func=<function pca_faces at 0x000002891C1CAE50>, location=cache\joblib)]: Clearing function cache identified by mglearn\plot_pca\pca_faces

________________________________________________________________________________
[Memory] Calling mglearn.plot_pca.pca_faces...
pca_faces(array([[0.535948, ..., 0.243137],
       ...,
       [0.043137, ..., 0.596078]], dtype=float32), 
array([[0.237908, ..., 0.269281],
       ...,
       [0.4     , ..., 0.254902]], dtype=float32))
________________________________________________________pca_faces - 2.5s, 0.0min


from matplotlib.offsetbox import OffsetImage, AnnotationBbox

image_shape = people.images[0].shape
plt.figure(figsize=(20, 3))
ax = plt.gca()

imagebox = OffsetImage(people.images[0], zoom=2, cmap="gray")
ab = AnnotationBbox(imagebox, (.05, 0.4), pad=0.0, xycoords='data')
ax.add_artist(ab)

for i in range(4):
    imagebox = OffsetImage(pca.components_[i].reshape(image_shape), zoom=2,
                           cmap="viridis")

    ab = AnnotationBbox(imagebox, (.285 + .2 * i, 0.4),
                        pad=0.0, xycoords='data')
    ax.add_artist(ab)
    if i == 0:
        plt.text(.155, .3, 'x_{} *'.format(i), fontdict={'fontsize': 30})
    else:
        plt.text(.145 + .2 * i, .3, '+ x_{} *'.format(i),
                 fontdict={'fontsize': 30})

plt.text(.95, .3, '+ ...', fontdict={'fontsize': 30})

plt.rc('text')
plt.text(.12, .3, '=', fontdict={'fontsize': 30})
plt.axis("off")
plt.show()
plt.close()
plt.rc('text')


from sklearn.decomposition import NMF
nmf = NMF(n_components=15, random_state=0)
nmf.fit(X_train)
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)

fig, axes = plt.subplots(3, 5, figsize=(15, 12),
                         subplot_kw={'xticks': (), 'yticks': ()})
for i, (component, ax) in enumerate(zip(nmf.components_, axes.ravel())):
    ax.imshow(component.reshape(image_shape))
    ax.set_title("성분 {}".format(i))

C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"


compn = 3
# 4번째 성분으로 정렬하여 처음 10개 이미지를 출력합니다
inds = np.argsort(X_train_nmf[:, compn])[::-1]
fig, axes = plt.subplots(2, 5, figsize=(15, 8),
                         subplot_kw={'xticks': (), 'yticks': ()})
for i, (ind, ax) in enumerate(zip(inds, axes.ravel())):
    ax.imshow(X_train[ind].reshape(image_shape))
    
compn = 7
# 8번째 성분으로 정렬하여 처음 10개 이미지를 출력합니다
inds = np.argsort(X_train_nmf[:, compn])[::-1]
fig, axes = plt.subplots(2, 5, figsize=(15, 8),
                         subplot_kw={'xticks': (), 'yticks': ()})
for i, (ind, ax) in enumerate(zip(inds, axes.ravel())):
    ax.imshow(X_train[ind].reshape(image_shape))


mglearn.plots.plot_nmf_faces(X_train, X_test, image_shape)

WARNING:root:[MemorizedFunc(func=<function nmf_faces at 0x000002891C1D7280>, location=cache\joblib)]: Clearing function cache identified by mglearn\plot_nmf\nmf_faces
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "

________________________________________________________________________________
[Memory] Calling mglearn.plot_nmf.nmf_faces...
nmf_faces(array([[0.535948, ..., 0.243137],
       ...,
       [0.043137, ..., 0.596078]], dtype=float32), 
array([[0.237908, ..., 0.269281],
       ...,
       [0.4     , ..., 0.254902]], dtype=float32))

C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
C:\Users\toto\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"

CH03 비지도학습 - PCA 얼굴 예제¶

학습 내용¶

01 왜 비지도학습을 사용하는가?¶

03. 사람 얼굴 이미지 데이터 셋을 이용한 PCA 예제 실습¶

LFW(Labeled Faces in the Wild)데이터 셋에서 얼굴 이미지에서 특성 추출¶

얼굴 이미지 10개 표시하기¶

이미지 크기¶

각 사람의 이미지는 몇장씩 있는가?¶

50개씩 데이터를 추출하기 위해..¶

주어진 데이터를 이용해서 지도학습 모델에 적용시켜보자.¶

PCA를 이용해서 87 x 65(=5655)의 특징을 갖는 성분을 100개의 주성분으로 추출.¶

주성분 이미지 확인¶

한글 보이도록 설정¶

각각의 주성분 그래프로 확인해 보기¶

주성분의 일부를 사용하여 이미지 재구성¶

그외.¶