from urllib.request import urlopen
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# UCI sonar dataset: 60 numeric sonar-return features plus a class label
# ("R" = rock, "M" = mine) in the last column, no header row in the file.
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
# `prefix=` was deprecated in pandas 1.4 and removed in pandas 2.0; name
# the 61 columns V0..V60 explicitly instead (identical result: features
# V0-V59, label V60).
dat = pd.read_csv(target_url, header=None, names=[f"V{i}" for i in range(61)])
dat.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V51 | V52 | V53 | V54 | V55 | V56 | V57 | V58 | V59 | V60 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0027 | 0.0065 | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0084 | 0.0089 | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0232 | 0.0166 | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0121 | 0.0036 | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0031 | 0.0054 | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R |
5 rows × 61 columns
# Bar chart of the class counts ("R" vs "M") on a whitegrid background.
sns.set_style("whitegrid")
sns.countplot(data=dat, x="V60")
<AxesSubplot:xlabel='V60', ylabel='count'>
# NOTE(review): the original copied the raw string label into 'target'
# here, but that copy is never read before the numeric R/M encoding
# recreates the column in the next cell — the dead store is dropped.
dat.columns
Index(['V0', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'label', 'target'], dtype='object')
### Encode the class label numerically (R -> 0, M -> 1)
label_codes = {'R': 0, 'M': 1}
dat['target'] = dat['V60'].map(label_codes).astype(int)
dat['target'].value_counts()
1    111
0     97
Name: target, dtype: int64
# Preview the frame with the numeric 'target' column appended.
dat.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V53 | V54 | V55 | V56 | V57 | V58 | V59 | V60 | label | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R | R | 0 |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R | R | 0 |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R | R | 0 |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R | R | 0 |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R | R | 0 |
5 rows × 63 columns
from sklearn.model_selection import train_test_split

# Drop the raw string label; the numeric 'target' column is kept as y.
new_df = dat.drop(columns='V60')
X_all = new_df.loc[:, 'V0':'V59']   # the 60 numeric feature columns
y = new_df['target']                # encoded class label (0 = R, 1 = M)

# Hold out 10% of the rows for testing (90% train), seeded for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.1, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((187, 60), (21, 60), (187,), (21,))
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Model selection and training: decision-tree baseline
# Fix random_state so the fitted tree (and therefore the reported test
# score) is reproducible — feature/threshold tie-breaking in
# DecisionTreeClassifier is randomized, unlike the seeded split above.
model1 = DecisionTreeClassifier(random_state=0)
model1.fit(X_train, y_train)
model1.score(X_test, y_test)
0.7142857142857143
### Model selection and training: k-nearest neighbours (default k=5)
model2 = KNeighborsClassifier().fit(X_train, y_train)
model2.score(X_test, y_test)
0.8095238095238095
### Final model: retrain KNN and predict on the held-out set
last_model = KNeighborsClassifier().fit(X_train, y_train)
pred = last_model.predict(X_test)
pred
array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1])
### Final evaluation: accuracy on the test set, as a percentage
100 * (pred == y_test).mean()
80.95238095238095