import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
# NOTE(review): blanket suppression hides *all* warnings, including sklearn
# ConvergenceWarning from the models fitted below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Load the Kaggle adult-income data: labeled train, unlabeled test, and the
# sample-submission template.
train = pd.read_csv('data/4th_kaggle/train.csv')
test = pd.read_csv('data/4th_kaggle/test.csv')
sub = pd.read_csv('data/4th_kaggle/sample_submission.csv')
데이터 정보
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준 (최종 학력)
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 업종
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 수익 (예측해야 하는 값, target variable)
# Report the raw shapes of the two splits (train has the extra income column).
print(f"학습용 데이터 :  {train.shape}")
print(f"테스트용 데이터 :  {test.shape}")
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
# Stack train and test so the categorical encodings below share one label space.
y = train['income']     # NOTE(review): never used — reassigned later before any use
test['income'] = "blank"  # sentinel marking test rows, which have no true label
# ignore_index=True: both frames carry 0-based indexes, so a plain concat would
# leave duplicate row labels (0..6511 appear twice) — a hazard for .loc lookups.
all_dat = pd.concat([train, test], axis=0, ignore_index=True)
print(all_dat.shape)
(32561, 16)
# Class balance check: <=50K vs >50K vs the "blank" test sentinel.
all_dat['income'].value_counts()
<=50K 19744 blank 6512 >50K 6305 Name: income, dtype: int64
# Map the income strings to numeric targets; 999 flags unlabeled test rows.
income_to_target = {'>50K': 1, '<=50K': 0, 'blank': 999}
all_dat['target'] = all_dat['income'].map(income_to_target).astype("int")
from sklearn.preprocessing import LabelEncoder

# One shared encoder instance; every fit_transform call refits it on the
# given column, so no state carries over between columns.
en_x = LabelEncoder()
all_dat['workclass_lbl'] = en_x.fit_transform(all_dat['workclass'])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target | workclass_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 | 4 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 | 4 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 | 4 |
# Label-encode the remaining string columns ('workclass' was handled above);
# the loop preserves the original column-creation order.
for col in ['education', 'marital_status', 'occupation',
            'relationship', 'race', 'native_country']:
    all_dat[col + '_lbl'] = en_x.fit_transform(all_dat[col])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
# Encode sex with fixed codes (Male=1, Female=2) rather than a fitted encoder.
# Note this overwrites the original column in place.
sex_codes = {"Male": 1, "Female": 2}
all_dat['sex'] = all_dat['sex'].map(sex_codes)
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
# Drop the raw string columns (and the income source column) now that the
# numeric *_lbl / target versions exist.
all_dat_n = all_dat.drop(columns=['workclass', 'education', 'marital_status',
                                  'occupation', 'relationship', 'race',
                                  'native_country', 'income'])
all_dat_n
id | age | fnlwgt | education_num | sex | capital_gain | capital_loss | hours_per_week | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | 168538 | 9 | 1 | 0 | 0 | 60 | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | 101626 | 5 | 1 | 0 | 0 | 20 | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | 353358 | 10 | 1 | 0 | 0 | 16 | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 | 3 | 21 | 151158 | 10 | 2 | 0 | 0 | 25 | 0 | 4 | 15 | 4 | 10 | 3 | 4 | 39 |
4 | 4 | 24 | 122234 | 10 | 2 | 0 | 0 | 20 | 0 | 4 | 15 | 4 | 1 | 1 | 2 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 6507 | 35 | 61343 | 13 | 1 | 0 | 0 | 40 | 999 | 4 | 9 | 2 | 12 | 0 | 4 | 39 |
6508 | 6508 | 41 | 32185 | 13 | 1 | 0 | 0 | 40 | 999 | 5 | 9 | 2 | 13 | 0 | 4 | 39 |
6509 | 6509 | 39 | 409189 | 3 | 1 | 0 | 0 | 40 | 999 | 4 | 4 | 2 | 8 | 0 | 4 | 26 |
6510 | 6510 | 35 | 180342 | 9 | 1 | 0 | 0 | 40 | 999 | 4 | 11 | 2 | 3 | 0 | 4 | 39 |
6511 | 6511 | 28 | 156819 | 9 | 2 | 0 | 0 | 36 | 999 | 4 | 11 | 0 | 6 | 4 | 4 | 39 |
32561 rows × 16 columns
# NOTE(review): X_cat and this y are never used again below (the models read
# train_n / test_n directly) — kept only to preserve the notebook's state.
X_cat = all_dat_n.drop(['target'], axis=1)
y = all_dat_n['target']
# Split the stacked frame back apart via the target sentinel (999 = test row).
train_n = all_dat_n.loc[all_dat_n['target'].isin([0, 1]), :]
test_n = all_dat_n.loc[all_dat_n['target'] == 999, :]
print(train_n.shape, test_n.shape)
(26049, 16) (6512, 16)
from sklearn.model_selection import train_test_split

# Baseline feature set: three numeric columns only.
sel = ['age', 'education_num', 'sex']
X_tr_all = train_n[sel]
y_tr_all = train_n['target']
X_test_all = test_n[sel]
# Hold out 30% for validation. The target is imbalanced (~24% positive, see
# the value_counts above), so stratify to keep the class ratio identical in
# both splits — otherwise the validation metrics are noisier.
X_train, X_test, y_train, y_test = train_test_split(X_tr_all,
                                                    y_tr_all,
                                                    test_size=0.3,
                                                    random_state=77,
                                                    stratify=y_tr_all)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Baseline classifier: logistic regression on the three selected features.
model = LogisticRegression()
model.fit(X_train, y_train)
pred_log = model.predict(X_test)
# Train vs. validation accuracy, displayed as a tuple.
train_acc = model.score(X_train, y_train)
valid_acc = model.score(X_test, y_test)
train_acc, valid_acc
(0.7960403641548756, 0.7901471529110684)
# Confusion matrix on the validation split: rows = true class, cols = predicted.
confusion = confusion_matrix(y_test, pred_log)
print(f"오차 행렬:\n{confusion}")
오차 행렬: [[5509 380] [1260 666]]
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the logistic baseline. target_names maps
# onto the sorted labels [0, 1]: class 0 is income <=50K, class 1 is >50K.
# The original names ("50K>=", "50K<") read backwards, so use the dataset's
# own label strings instead.
print(classification_report(y_test, pred_log,
                            target_names=["<=50K", ">50K"]))
precision recall f1-score support 50K>= 0.81 0.94 0.87 5889 50K< 0.64 0.35 0.45 1926 accuracy 0.79 7815 macro avg 0.73 0.64 0.66 7815 weighted avg 0.77 0.79 0.77 7815
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)

# Fit three ensemble baselines on the same split; `fit` returns the estimator,
# so fit/predict can be chained. `model` is overwritten each time, matching
# the original cell-by-cell flow.
model = RandomForestClassifier()
pred_rf = model.fit(X_train, y_train).predict(X_test)

model = AdaBoostClassifier()
pred_ada = model.fit(X_train, y_train).predict(X_test)

model = GradientBoostingClassifier()
pred_gr = model.fit(X_train, y_train).predict(X_test)
from sklearn.metrics import classification_report

# Random-forest report. Class 0 is income <=50K, class 1 is >50K; the original
# display names ("50K>=", "50K<") read backwards, so use the dataset's labels.
print(classification_report(y_test, pred_rf,
                            target_names=["<=50K", ">50K"]))
precision recall f1-score support 50K>= 0.82 0.91 0.86 5889 50K< 0.59 0.40 0.48 1926 accuracy 0.78 7815 macro avg 0.71 0.65 0.67 7815 weighted avg 0.77 0.78 0.77 7815
from sklearn.metrics import classification_report

# AdaBoost report. Class 0 is income <=50K, class 1 is >50K; the original
# display names ("50K>=", "50K<") read backwards, so use the dataset's labels.
print(classification_report(y_test, pred_ada,
                            target_names=["<=50K", ">50K"]))
precision recall f1-score support 50K>= 0.82 0.94 0.88 5889 50K< 0.67 0.36 0.47 1926 accuracy 0.80 7815 macro avg 0.74 0.65 0.67 7815 weighted avg 0.78 0.80 0.78 7815
from sklearn.metrics import classification_report

# Gradient-boosting report. Class 0 is income <=50K, class 1 is >50K; the
# original display names ("50K>=", "50K<") read backwards, so use the
# dataset's labels.
print(classification_report(y_test, pred_gr,
                            target_names=["<=50K", ">50K"]))
precision recall f1-score support 50K>= 0.83 0.93 0.87 5889 50K< 0.64 0.40 0.50 1926 accuracy 0.80 7815 macro avg 0.74 0.67 0.68 7815 weighted avg 0.78 0.80 0.78 7815
교육용으로 작성된 것으로 배포 및 복제시에 사전 허가가 필요합니다.
Copyright 2022 LIM Co. all rights reserved.