from urllib.request import urlopen
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# UCI sonar dataset: 60 numeric sonar-return features plus a class label
# ("R" = rock, "M" = mine) in the last column, no header row in the file.
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
# `prefix=` was deprecated in pandas 1.4 and removed in pandas 2.0; name
# the 61 columns V0..V60 explicitly instead (identical result: features
# V0-V59, label V60).
dat = pd.read_csv(target_url, header=None, names=[f"V{i}" for i in range(61)])
dat.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V51 | V52 | V53 | V54 | V55 | V56 | V57 | V58 | V59 | V60 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0027 | 0.0065 | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0084 | 0.0089 | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0232 | 0.0166 | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0121 | 0.0036 | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0031 | 0.0054 | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R |
5 rows × 61 columns
# Bar chart of the class counts ("R" vs "M") on a whitegrid background.
sns.set_style("whitegrid")
sns.countplot(data=dat, x="V60")
<AxesSubplot:xlabel='V60', ylabel='count'>
# NOTE(review): the original copied the raw string label into 'target'
# here, but that copy is never read before the numeric R/M encoding
# recreates the column in the next cell — the dead store is dropped.
dat.columns
Index(['V0', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'label', 'target'], dtype='object')
### Encode the class label numerically (R -> 0, M -> 1)
label_codes = {'R': 0, 'M': 1}
dat['target'] = dat['V60'].map(label_codes).astype(int)
dat['target'].value_counts()
1    111
0     97
Name: target, dtype: int64
# Preview the frame with the numeric 'target' column appended.
dat.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V53 | V54 | V55 | V56 | V57 | V58 | V59 | V60 | label | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R | R | 0 |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R | R | 0 |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R | R | 0 |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R | R | 0 |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R | R | 0 |
5 rows × 63 columns
from sklearn.model_selection import train_test_split

# Drop the raw string label; the numeric 'target' column is kept as y.
new_df = dat.drop(columns='V60')
X_all = new_df.loc[:, 'V0':'V59']   # the 60 numeric feature columns
y = new_df['target']                # encoded class label (0 = R, 1 = M)

# Hold out 10% of the rows for testing (90% train), seeded for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.1, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((187, 60), (21, 60), (187,), (21,))
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Model selection and training: decision-tree baseline
# Fix random_state so the fitted tree (and therefore the reported test
# score) is reproducible — feature/threshold tie-breaking in
# DecisionTreeClassifier is randomized, unlike the seeded split above.
model1 = DecisionTreeClassifier(random_state=0)
model1.fit(X_train, y_train)
model1.score(X_test, y_test)
0.7142857142857143
### Model selection and training: k-nearest neighbours (default k=5)
model2 = KNeighborsClassifier().fit(X_train, y_train)
model2.score(X_test, y_test)
0.8095238095238095
### Final model: retrain KNN and predict on the held-out set
last_model = KNeighborsClassifier().fit(X_train, y_train)
pred = last_model.predict(X_test)
pred
array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1])
### Final evaluation: accuracy on the test set, as a percentage
100 * (pred == y_test).mean()
80.95238095238095