import urllib.request as req
# Fetch the UCI mushroom dataset and store it as mushroom.csv in the
# current working directory.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
local = "mushroom.csv"
req.urlretrieve(url, filename=local)
# The printed message is Korean for "download complete".
print("다운로드 완료")
다운로드 완료
### 모델 만들기
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Read the data.
# header=None: the first line of the file is treated as data, not as column
# names, so the columns are labelled with integers.
mush = pd.read_csv("mushroom.csv", header=None)
mush
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
1 | e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
2 | e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
3 | p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
4 | e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8119 | e | k | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | b | c | l |
8120 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | n | o | p | b | v | l |
8121 | e | f | s | n | f | n | a | c | b | n | e | ? | s | s | o | o | p | o | o | p | b | c | l |
8122 | p | k | y | n | f | y | f | c | n | b | t | ? | s | k | w | w | p | w | o | e | w | v | l |
8123 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | o | c | l |
8124 rows × 23 columns
from sklearn import preprocessing
# Turn the class column (column 0) into an integer target column named
# 'label'. The encoder is fitted first and then applied in a separate step.
encoder_le = preprocessing.LabelEncoder().fit(mush[0])
mush['label'] = encoder_le.transform(mush[0])
mush
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u | 1 |
1 | e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g | 0 |
2 | e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m | 0 |
3 | p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u | 1 |
4 | e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8119 | e | k | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | b | c | l | 0 |
8120 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | n | o | p | b | v | l | 0 |
8121 | e | f | s | n | f | n | a | c | b | n | e | ? | s | s | o | o | p | o | o | p | b | c | l | 0 |
8122 | p | k | y | n | f | y | f | c | n | b | t | ? | s | k | w | w | p | w | o | e | w | v | l | 1 |
8123 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | o | c | l | 0 |
8124 rows × 24 columns
# Integer-encode each of the 22 feature columns (positions 1..22) into new
# columns named col1..col22.
# The loop body must be indented; without it the cell fails with an
# IndentationError.
# A fresh LabelEncoder is created per column so that `encoder_le` keeps the
# mapping it learned from the class column instead of being refit 22 times.
for i in range(1, 23):
    mush['col' + str(i)] = preprocessing.LabelEncoder().fit_transform(mush.iloc[:, i])
mush
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | label | col1 | col2 | col3 | col4 | col5 | col6 | col7 | col8 | col9 | col10 | col11 | col12 | col13 | col14 | col15 | col16 | col17 | col18 | col19 | col20 | col21 | col22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u | 1 | 5 | 2 | 4 | 1 | 6 | 1 | 0 | 1 | 4 | 0 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
1 | e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g | 0 | 5 | 2 | 9 | 1 | 0 | 1 | 0 | 0 | 4 | 0 | 2 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 1 |
2 | e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m | 0 | 0 | 2 | 8 | 1 | 3 | 1 | 0 | 0 | 5 | 0 | 2 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 3 |
3 | p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u | 1 | 5 | 3 | 8 | 1 | 6 | 1 | 0 | 1 | 5 | 0 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
4 | e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g | 0 | 5 | 2 | 3 | 0 | 5 | 1 | 1 | 0 | 4 | 1 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 0 | 3 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8119 | e | k | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | b | c | l | 0 | 3 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 0 | 1 | 2 |
8120 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | n | o | p | b | v | l | 0 | 5 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 0 | 1 | 4 | 0 | 4 | 2 |
8121 | e | f | s | n | f | n | a | c | b | n | e | ? | s | s | o | o | p | o | o | p | b | c | l | 0 | 2 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 5 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 0 | 1 | 2 |
8122 | p | k | y | n | f | y | f | c | n | b | t | ? | s | k | w | w | p | w | o | e | w | v | l | 1 | 3 | 3 | 4 | 0 | 8 | 1 | 0 | 1 | 0 | 1 | 0 | 2 | 1 | 7 | 7 | 0 | 2 | 1 | 0 | 7 | 4 | 2 |
8123 | e | x | s | n | f | n | a | c | b | y | e | ? | s | s | o | o | p | o | o | p | o | c | l | 0 | 5 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 4 | 1 | 2 |
8124 rows × 46 columns
# Labels of the 23 original (un-encoded) columns: 0 through 22.
col_all = [*range(23)]
col_all
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
# Preview the frame without the raw character columns. The result is not
# assigned back, so `mush` itself is left as it was.
mush.drop(columns=col_all)
label | col1 | col2 | col3 | col4 | col5 | col6 | col7 | col8 | col9 | col10 | col11 | col12 | col13 | col14 | col15 | col16 | col17 | col18 | col19 | col20 | col21 | col22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2 | 4 | 1 | 6 | 1 | 0 | 1 | 4 | 0 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
1 | 0 | 5 | 2 | 9 | 1 | 0 | 1 | 0 | 0 | 4 | 0 | 2 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 1 |
2 | 0 | 0 | 2 | 8 | 1 | 3 | 1 | 0 | 0 | 5 | 0 | 2 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 3 |
3 | 1 | 5 | 3 | 8 | 1 | 6 | 1 | 0 | 1 | 5 | 0 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
4 | 0 | 5 | 2 | 3 | 0 | 5 | 1 | 1 | 0 | 4 | 1 | 3 | 2 | 2 | 7 | 7 | 0 | 2 | 1 | 0 | 3 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8119 | 0 | 3 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 0 | 1 | 2 |
8120 | 0 | 5 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 0 | 1 | 4 | 0 | 4 | 2 |
8121 | 0 | 2 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 5 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 0 | 1 | 2 |
8122 | 1 | 3 | 3 | 4 | 0 | 8 | 1 | 0 | 1 | 0 | 1 | 0 | 2 | 1 | 7 | 7 | 0 | 2 | 1 | 0 | 7 | 4 | 2 |
8123 | 0 | 5 | 2 | 4 | 0 | 5 | 0 | 0 | 0 | 11 | 0 | 0 | 2 | 2 | 5 | 5 | 0 | 1 | 1 | 4 | 4 | 1 | 2 |
8124 rows × 23 columns
# Features: every row of the encoded columns col1 through col7.
X = mush[[f"col{n}" for n in range(1, 8)]]
# Target: the column we want to predict.
y = mush.loc[:, 'label']
print(X.shape, y.shape)
(8124, 7) (8124,)
### Split into training data and test data
# random_state=0 fixes the shuffle so the same split is produced on every run.
# No test_size is passed, so scikit-learn's default split ratio is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Print the shapes of both partitions as a sanity check.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(6093, 7) (6093,) (2031, 7) (2031,)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
### Choose a model and train it
# NOTE(review): the forest is created with all-default settings and no
# random_state, so the fitted model (and the accuracy below) may differ
# slightly between runs — confirm whether a fixed seed is wanted.
model = RandomForestClassifier()
model.fit(X_train, y_train)
RandomForestClassifier()
### Try predicting on new data (the held-out test features)
predict = model.predict(X_test)
predict
array([1, 0, 0, ..., 1, 0, 0])
# Sanity check: the number of predictions should equal the number of test labels.
print( len(predict), len(y_test) )
2031 2031
# Show the true test labels as an array, for eyeballing against `predict`.
y_test.values
array([1, 0, 0, ..., 1, 0, 0])
# How many predictions were right? Element-wise equality gives a boolean
# array; its mean is the fraction of test rows predicted correctly.
import numpy as np
np.equal(predict, y_test.values).mean()
0.9945839487936977