import seaborn as sns


# Load seaborn's bundled "tips" example dataset into a DataFrame.
tips = sns.load_dataset("tips")
# Print a summary of the frame: columns, non-null counts, dtypes, memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


# Preview the first rows of the dataset.
tips.head()


# (rows, columns) of the full dataset.
tips.shape

(244, 7)


# Rows 0-219 play the role of the customer data we already have (target known).
tips_have = tips.iloc[:220]

# Rows 220 onward play the role of future customers whose party size is
# unknown, so the target column is removed. drop() without inplace returns a
# new DataFrame instead of mutating a slice of `tips`, which avoids the
# SettingWithCopyWarning raised by an in-place drop on an iloc slice.
tips_new = tips.iloc[220:].drop(columns=["size"])

# Show the shapes of both splits.
tips_have.shape, tips_new.shape

C:\Users\totofriend\AppData\Local\Temp\ipykernel_2576\841681958.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_new.drop(["size"], axis=1, inplace=True)

((220, 7), (24, 6))


# Compare the column sets of the two splits; `size` was dropped from tips_new.
tips_have.columns, tips_new.columns

(Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object'),
 Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time'], dtype='object'))


# Preview the first rows of the labeled split.
tips_have.head()


# Feature columns used as model input.
sel = ['total_bill', 'tip']


# Inputs and target are taken from the labeled split, using the columns in `sel`.

X = tips_have[sel]
y = tips_have['size']  # the column (variable) we want to predict

test_X = tips_new[sel]       # the rows to predict come from the other data set


from sklearn.ensemble import RandomForestClassifier


# Create the random forest and train it in one expression; fit(inputs, targets)
# returns the fitted estimator itself, so `model` is the trained classifier.
model = RandomForestClassifier().fit(X, y)

# Use the trained model to predict the party size for the unlabeled rows.
pred = model.predict(test_X)
pred

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 5, 4, 4, 4, 4,
       2, 4], dtype=int64)


from sklearn.model_selection import train_test_split


# Split X (inputs) and y (targets) into a training part and a test part.
# random_state fixes the random number generator so the same rows land in the
# same part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default hold-out fraction, spelled out
    random_state=0,
)


# Create the model. random_state pins the forest's internal randomness
# (bootstrap sampling and per-split feature selection) so that the evaluation
# of this model is reproducible from run to run, just like the seeded split.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)     # train: model.fit(inputs, targets)
pred = model.predict(X_test)    # predict on the held-out test inputs
pred

array([2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 4, 2, 2, 3, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 4, 3, 2, 2, 2, 2, 2, 2], dtype=int64)


# Element-wise comparison of each prediction with the true label.
pred == y_test

152    False
74      True
71     False
161     True
162     True
143    False
63     False
153    False
219    False
135     True
149     True
5      False
90     False
168     True
202     True
191     True
201     True
96     False
106     True
75      True
55     False
12      True
157     True
64     False
37     False
130     True
101     True
61      True
8       True
18     False
179    False
15     False
139     True
7      False
124     True
159    False
136     True
144     True
199     True
155    False
66      True
33     False
89      True
158     True
196     True
173     True
185    False
207     True
16      True
145     True
200    False
146    False
22      True
183    False
45      True
Name: size, dtype: bool


# Count how many predictions match the true labels
# (this is the number of correct predictions, not the error).
(pred == y_test).sum()

32


### Compute accuracy with model.score()
# First line: accuracy on the training split; second line: accuracy on the test split.
print(
    model.score(X_train, y_train),
    model.score(X_test, y_test),
    sep="\n",
)

1.0
0.5818181818181818


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


# Create a k-nearest-neighbours classifier with default settings and train it
# in one expression; fit(inputs, targets) returns the fitted estimator itself.
model = KNeighborsClassifier().fit(X_train, y_train)

# Predict on the held-out test inputs with the trained model.
pred = model.predict(X_test)
pred

array([2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, 2, 2, 2,
       3, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2,
       2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2], dtype=int64)


# Accuracy as a percentage: correct predictions / total predictions * 100
(pred == y_test).sum() / len(pred)  * 100

52.72727272727272


# Create the model. random_state fixes the tree's internal randomness so the
# fitted tree, and therefore the measured accuracy, is reproducible across runs.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)     # train: model.fit(inputs, targets)
pred = model.predict(X_test)    # predict on the held-out test inputs
pred

array([2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2,
       2, 2, 2, 4, 2, 2, 2, 2, 3, 4, 2, 2, 2, 3, 3, 2, 2, 4, 3, 2, 2, 2,
       3, 2, 2, 4, 3, 2, 2, 4, 2, 2, 2], dtype=int64)


import numpy as np


# Accuracy as a fraction: the mean of the boolean match vector equals
# correct predictions / total predictions.
np.mean(pred == y_test)

0.509090909090909

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Ch2 앙상블 기법에 대해 알아보기¶

대표적인 알고리즘 RandomForest¶

학습 내용¶

01 앙상블(ensemble)이란 무엇일까?¶

02 랜덤 포레스트(RandomForest)는 무엇인가?¶

03 실습¶

04 머신러닝 과제¶

주어진 데이터를 토대로 이용 고객을 예측해 보자.¶

우선 데이터 만들어보기¶

05 머신러닝 과제 수행¶

머신러닝은 다음과 같은 과정을 거친다.¶

우리의 과제¶

랜덤 포레스트 이용¶

06 우리가 만든 모델이 좋은지 아닌지 어떻게 평가할 수 있을까?¶

07 다시 모델을 만들고, 이제는 평가가 가능하다. 살펴보자.¶

여기서 예측한 pred와 y_test를 비교하여 얼마나 오차가 있는지 확인 가능하다.¶

정확도가 58.2%이다.¶

08 다른 모델의 정확도는 어떨까? 확인해 보자.¶

결과 확인¶