import seaborn as sns
tips = sns.load_dataset("tips")
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
tips.head()
|   | total_bill | tip  | sex    | smoker | day | time   | size |
|---|------------|------|--------|--------|-----|--------|------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    |
| 3 | 23.68      | 3.31 | Male   | No     | Sun | Dinner | 2    |
| 4 | 24.59      | 3.61 | Female | No     | Sun | Dinner | 4    |
tips.shape
(244, 7)
tips_have = tips.iloc[0:220, :].copy()  # customer data we already have
tips_new = tips.iloc[220:, :].copy()    # future customer data
tips_new.drop(["size"], axis=1, inplace=True)  # future customers have no known target column
tips_have.shape, tips_new.shape
((220, 7), (24, 6))
tips_have.columns, tips_new.columns
(Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object'), Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time'], dtype='object'))
tips_have.head()
|   | total_bill | tip  | sex    | smoker | day | time   | size |
|---|------------|------|--------|--------|-----|--------|------|
| 0 | 16.99      | 1.01 | Female | No     | Sun | Dinner | 2    |
| 1 | 10.34      | 1.66 | Male   | No     | Sun | Dinner | 3    |
| 2 | 21.01      | 3.50 | Male   | No     | Sun | Dinner | 3    |
| 3 | 23.68      | 3.31 | Male   | No     | Sun | Dinner | 2    |
| 4 | 24.59      | 3.61 | Female | No     | Sun | Dinner | 4    |
sel = ['total_bill', 'tip']
X = tips_have[sel]
y = tips_have['size']  # the column (variable) we want to predict
test_X = tips_new[sel]  # the data we will predict on comes from a separate dataset
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()  # create the model
model.fit(X, y)  # train the model: model.fit(inputs, outputs)
pred = model.predict(test_X)  # predict with the trained model
pred
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 5, 4, 4, 4, 4, 2, 4], dtype=int64)
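Since tips_new no longer has a `size` column, there is no ground truth to compare these predictions against. One way to inspect them is to attach the predicted values back onto the future-customer rows; a minimal sketch (the `size_pred` column name is just an illustrative choice):

```python
# Attach the predicted party sizes to the future-customer rows for inspection.
# "size_pred" is an arbitrary column name used only for this example.
tips_new_pred = tips_new.assign(size_pred=pred)
print(tips_new_pred.head())
```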
from sklearn.model_selection import train_test_split
# random_state fixes the seed of the random number generator, so the split is reproducible.
# train_test_split divides X (inputs) and y (outputs) into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
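By default train_test_split holds out 25% of the rows for testing. Writing the defaults out, and adding stratify=y to keep the distribution of `size` similar in both parts, makes the behavior explicit; a sketch with separate variable names so it does not overwrite the split used below (stratification assumes every class appears at least twice in y):

```python
# Equivalent split with the defaults written out explicitly.
# test_size=0.25 is the library default; stratify=y keeps the class
# proportions of 'size' roughly equal in the train and test parts.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)
print(Xtr.shape, Xte.shape)
```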
model = RandomForestClassifier()  # create the model
model.fit(X_train, y_train)  # train the model: model.fit(inputs, outputs)
pred = model.predict(X_test)  # predict with the trained model
pred
array([2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 4, 2, 2, 3, 2, 4, 2, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 2, 2, 2, 2, 2], dtype=int64)
pred == y_test
152    False
74      True
71     False
161     True
162     True
143    False
63     False
153    False
219    False
135     True
149     True
5      False
90     False
168     True
202     True
191     True
201     True
96     False
106     True
75      True
55     False
12      True
157     True
64     False
37     False
130     True
101     True
61      True
8       True
18     False
179    False
15     False
139     True
7      False
124     True
159    False
136     True
144     True
199     True
155    False
66      True
33     False
89      True
158     True
196     True
173     True
185    False
207     True
16      True
145     True
200    False
146    False
22      True
183    False
45      True
Name: size, dtype: bool
# Count how many test predictions are correct.
(pred == y_test).sum()
32
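So 32 of the 55 test rows are predicted correctly. The same ratio can be computed directly with sklearn.metrics.accuracy_score, which is simply the mean of the element-wise comparison above; a quick cross-check:

```python
from sklearn.metrics import accuracy_score

# accuracy = correct predictions / total predictions
print((pred == y_test).sum() / len(pred))
print(accuracy_score(y_test, pred))  # should print the same value
```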
### Computing accuracy with model.score()
print( model.score(X_train, y_train) )
print( model.score(X_test, y_test) )
1.0
0.5818181818181818
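A training accuracy of 1.0 against a test accuracy of about 0.58 is a sign of overfitting: the forest has essentially memorized the training rows. Constraining the trees is one common response; a sketch, where max_depth=3 is only an illustrative value, not a tuned one:

```python
# A shallower forest is less able to memorize the training data.
# max_depth=3 is an arbitrary example value, not a tuned hyperparameter.
model_shallow = RandomForestClassifier(max_depth=3, random_state=0)
model_shallow.fit(X_train, y_train)
print(model_shallow.score(X_train, y_train))
print(model_shallow.score(X_test, y_test))
```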
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
model = KNeighborsClassifier()  # create the model
model.fit(X_train, y_train)  # train the model: model.fit(inputs, outputs)
pred = model.predict(X_test)  # predict with the trained model
pred
array([2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2], dtype=int64)
# Accuracy (%)
(pred == y_test).sum() / len(pred) * 100
52.72727272727272
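KNeighborsClassifier defaults to n_neighbors=5, and its accuracy is sensitive to that choice. A quick sweep over a few values of k shows how much it matters; the candidate values here are arbitrary, not tuned:

```python
# Try a few neighborhood sizes and report the test accuracy for each.
for k in [3, 5, 7, 11, 15]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(k, knn.score(X_test, y_test))
```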
model = DecisionTreeClassifier()  # create the model
model.fit(X_train, y_train)  # train the model: model.fit(inputs, outputs)
pred = model.predict(X_test)  # predict with the trained model
pred
array([2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 2, 2, 2, 2, 4, 2, 2, 2, 2, 3, 4, 2, 2, 2, 3, 3, 2, 2, 4, 3, 2, 2, 2, 3, 2, 2, 4, 3, 2, 2, 4, 2, 2, 2], dtype=int64)
import numpy as np
# Accuracy as a fraction (np.mean of the element-wise comparison)
# (pred == y_test).sum() / len(pred) * 100   # same idea, as a percentage
np.mean(pred == y_test)
0.509090909090909
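With the same train/test split in hand, the three classifiers can be compared side by side; a small summary sketch, with every model left at its default settings:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Fit each classifier on the same split and report its test accuracy.
for clf in [RandomForestClassifier(), KNeighborsClassifier(), DecisionTreeClassifier()]:
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, clf.score(X_test, y_test))
```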
Created for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2022 LIM Co. All rights reserved.