import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
from IPython.display import display, Image


# Render a local image file inline. Judging by its file name it presumably
# previews the layout of the u.data ratings file -- TODO confirm against the
# image itself.
display(Image(filename='data_u_data.png'))


# Ratings file: tab-separated. Because `names` is passed explicitly, pandas
# uses these labels and does not take a header from the first line.
col_name = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
df = pd.read_csv(
    '../data/ml-100k/u.data',
    names=col_name,
    sep='\t',
)
print(df.shape)
df.head()  # notebook preview of the first rows

(100000, 4)


# Render a local image file inline. Judging by its file name it presumably
# previews the layout of the u.item movie-metadata file -- TODO confirm
# against the image itself.
display(Image(filename='data_u_item.png'))


# Column labels for u.item: five descriptive fields followed by the genre
# columns (presumably one indicator column per genre -- verify against the
# dataset README).
col_name = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is read as a pipe-delimited file decoded with latin-1.
movies = pd.read_csv(
    '../data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=col_name,
)

# Only the id -> title mapping is needed to label the ratings.
movie_names = movies.loc[:, ['item_id', 'movie title']]

# Attach the title to every rating row; merge defaults to an inner join on
# the shared 'item_id' key.
c_movies_data = df.merge(movie_names, on='item_id')
print(c_movies_data.shape)
c_movies_data.head()  # notebook preview of the first rows

(100000, 5)


# Build the user x movie-title rating table. Rows are users, columns are
# titles. Ratings that share the same (user, title) pair are combined with
# pivot_table's default aggregation (mean), and pairs without a rating are
# filled with 0.
rating_crosstab = pd.pivot_table(
    c_movies_data,
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
print(rating_crosstab.shape)
rating_crosstab.head()  # notebook preview of the first rows

(943, 1664)


# Flip the table so that each row is a movie title and each column a user;
# the decomposition below then produces one feature vector per movie.
X = rating_crosstab.transpose()
print(X.shape)

(1664, 943)


# Compress every movie's per-user rating vector down to 12 components.
# random_state is pinned so repeated runs use the same seed.
SVD = TruncatedSVD(random_state=5, n_components=12)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape  # notebook preview: (number of movies, 12)

(1664, 12)


### correlation matrix
# np.corrcoef treats each row as one variable (rowvar=True is its default,
# spelled out here), so entry [i, j] is the Pearson correlation between the
# reduced vectors of movie i and movie j.
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
print(corr_mat.shape)
corr_mat  # notebook preview of the matrix

(1664, 1664)

array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])


# Look up the integer position of the "Star Wars (1977)" column. The value is
# not stored here; in a notebook it is simply echoed as the cell result.
rating_crosstab.columns.get_loc("Star Wars (1977)")

1398


# Column position of the title in the crosstab; the correlation matrix is
# indexed by the same positions.
col_idx = rating_crosstab.columns.get_loc("Star Wars (1977)")

# Take the row at Star Wars (1977)'s position: its correlation with every
# movie.
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


# Pair each correlation value with its movie title; both sequences follow the
# column order of rating_crosstab.
result = pd.DataFrame(
    {
        'corr_specific': corr_specific,
        'Movies': rating_crosstab.columns,
    }
)
print(result.shape)
result.head()  # notebook preview of the first rows

(1664, 2)


# Ten highest correlations, best first. The query movie itself is included
# in the ranking because it is one of the rows of `result`.
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .head(n=10)
)


# Column position of the title in the crosstab; the correlation matrix is
# indexed by the same positions.
col_idx = rating_crosstab.columns.get_loc("Godfather, The (1972)")

# Take the row at Godfather, The (1972)'s position: its correlation with
# every movie.
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


# Pair each correlation value with its movie title, then list the ten
# highest correlations, best first.
result = pd.DataFrame(
    {
        'corr_specific': corr_specific,
        'Movies': rating_crosstab.columns,
    }
)
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .head(n=10)
)

	user_id	item_id	rating	timestamp
0	196	242	3	881250949
1	186	302	3	891717742
2	22	377	1	878887116
3	244	51	2	880606923
4	166	346	1	886397596

	user_id	item_id	rating	timestamp	movie title
0	196	242	3	881250949	Kolya (1996)
1	63	242	3	875747190	Kolya (1996)
2	226	242	5	883888671	Kolya (1996)
3	154	242	3	879138235	Kolya (1996)
4	306	242	5	876503793	Kolya (1996)

movie title	'Til There Was You (1997)	1-900 (1994)	101 Dalmatians (1996)	12 Angry Men (1957)	187 (1997)	2 Days in the Valley (1996)	20,000 Leagues Under the Sea (1954)	2001: A Space Odyssey (1968)	3 Ninjas: High Noon At Mega Mountain (1998)	39 Steps, The (1935)	...	Yankee Zulu (1994)	Year of the Horse (1997)	You So Crazy (1994)	Young Frankenstein (1974)	Young Guns (1988)	Young Guns II (1990)	Young Poisoner's Handbook, The (1995)	Zeus and Roxanne (1997)	unknown	Á köldum klaka (Cold Fever) (1994)
user_id
1	0	0	2	5	0	0	3	4	0	0	...	0	0	0	5	3	0	0	0	4	0
2	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	2	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
5	0	0	2	0	0	0	0	4	0	0	...	0	0	0	4	0	0	0	0	4	0

	corr_specific	Movies
0	0.357238	'Til There Was You (1997)
1	0.421507	1-900 (1994)
2	0.593815	101 Dalmatians (1996)
3	0.722361	12 Angry Men (1957)
4	0.325221	187 (1997)

	corr_specific	Movies
1398	1.000000	Star Wars (1977)
1234	0.988052	Return of the Jedi (1983)
1460	0.942655	Terminator 2: Judgment Day (1991)
1523	0.933978	Toy Story (1995)
1461	0.931701	Terminator, The (1984)
1205	0.925185	Raiders of the Lost Ark (1981)
456	0.923562	Empire Strikes Back, The (1980)
570	0.915965	Fugitive, The (1993)
414	0.914299	Die Hard (1988)
44	0.892894	Aliens (1986)

협업 필터링 영화 추천

학습 내용

목차

01 데이터 불러오기

데이터 셋

데이터 불러오기

데이터 불러오기

데이터 정보

02 데이터 준비 및 특이값분해(SVD), 상관계수 구하기

사용자-아이템 표 만들기

아이템-사용자 형태를 위해 행열 바꾸기

SVD(특잇값 분해)

Correlation Pearson

03 유사영화를 찾아보기

Similar Movies to Star Wars (1977)

10개의 영화 추천

(실습) Godfather, The (1972)에 대한 10개의 영화 추천해 보기

실습해 보기

History

	corr_specific	Movies
612	1.000000	Godfather, The (1972)
613	0.921444	Godfather: Part II, The (1974)
498	0.921420	Fargo (1996)
623	0.900758	GoodFellas (1990)
237	0.865385	Bronx Tale, A (1993)
1398	0.865148	Star Wars (1977)
209	0.864269	Boot, Das (1981)
389	0.857308	Dead Man Walking (1995)
622	0.845558	Good, The Bad and The Ugly, The (1966)
1190	0.842705	Pulp Fiction (1994)