import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')


# Load the three competition files in one pass: the training rows, the test
# rows, and the sample submission frame.
train, test, sub = map(
    pd.read_csv,
    (
        'data/4th_kaggle/train.csv',
        'data/4th_kaggle/test.csv',
        'data/4th_kaggle/sample_submission.csv',
    ),
)


# Report (rows, columns) for the training and test frames.
print("학습용 데이터 : ", train.shape)
print("테스트용 데이터 : ", test.shape)

학습용 데이터 :  (26049, 16)
테스트용 데이터 :  (6512, 15)


# Keep a handle on the raw income labels of the training rows.
y = train.income
# Give every test row a placeholder income value so that both frames carry
# the same columns before they are stacked.
test['income'] = "blank"


# Stack train on top of test. The index is not reset here, so each frame
# keeps its own original row labels inside the combined frame.
all_dat = pd.concat(objs=[train, test], axis='index')
print(all_dat.shape)

(32561, 16)


# Count rows per income label, including the placeholder used for test rows.
all_dat['income'].value_counts()

<=50K    19744
blank     6512
>50K      6305
Name: income, dtype: int64


# Bar chart of the number of rows per income label.
sns.countplot(data=all_dat, x="income")

<AxesSubplot:xlabel='income', ylabel='count'>


# Integer target column: 1 for '>50K', 0 for '<=50K', and 999 as a sentinel
# for the 'blank' placeholder. Any other label becomes NaN, which makes the
# integer cast fail instead of passing through silently.
all_dat['target'] = (
    all_dat['income']
    .map({'>50K': 1, '<=50K': 0, 'blank': 999})
    .astype("int")
)


# Preview the first rows of the combined frame.
all_dat.head()


# List the column names of the combined frame.
all_dat.columns

Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income', 'target'],
      dtype='object')


from sklearn.preprocessing import LabelEncoder


# A single encoder instance is re-fitted for every column in turn, so after
# this block it only remembers the classes of the last column it was fitted on.
en_x = LabelEncoder()
all_dat['workclass_lbl'] = en_x.fit_transform(all_dat['workclass'])
all_dat.head(3)


# Integer-encode the remaining categorical columns. Each result goes into a
# new '*_lbl' column; the source column itself is left untouched.
for _src, _dst in (
    ('education', 'education_lbl'),
    ('marital_status', 'marital_status_lbl'),
    ('occupation', 'occupation_lbl'),
    ('relationship', 'relationship_lbl'),
    ('race', 'race_lbl'),
    ('native_country', 'native_country_lbl'),
):
    all_dat[_dst] = en_x.fit_transform(all_dat[_src])
all_dat.head(3)


# Show the distinct values stored in the sex column.
all_dat['sex'].unique()

array(['Male', 'Female'], dtype=object)


# Overwrite the sex column with integer codes (Male -> 1, Female -> 2).
# Values missing from the mapping would become NaN.
mf_mapping = {
    "Male": 1,
    "Female": 2,
}
all_dat['sex'] = all_dat.sex.map(mf_mapping)
all_dat.head(3)


# Raw string columns to discard, plus the original income label column.
sel_cat = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'native_country',
    'income',
]
# Build a new frame without those columns; all_dat itself is not modified.
all_dat_n = all_dat.drop(columns=sel_cat)
all_dat_n


# Every column except the target, for all rows of the combined frame.
X_cat = all_dat_n.drop(columns=['target'])
# Rebinds y to the integer target of the combined frame, which still
# contains the 999 sentinel rows.
y = all_dat_n.target


# Split the combined frame back apart using the target value: rows with a
# real label (0 or 1) form the training set, sentinel rows (999) the test set.
train_n = all_dat_n.loc[all_dat_n['target'].isin([0, 1]), :]
test_n = all_dat_n.loc[all_dat_n['target'].eq(999), :]


print(train_n.shape, test_n.shape)

(26049, 16) (6512, 16)


# Preview the first rows of the training part.
train_n.head(3)


# Preview the first rows of the test part.
test_n.head(3)


# The sentinel target carries no information for the test rows, so remove it.
test_n = test_n.drop(columns=['target'])

print(train_n.shape, test_n.shape)

(26049, 16) (6512, 15)


# List the columns available in the training frame.
train_n.columns

Index(['id', 'age', 'fnlwgt', 'education_num', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'target', 'workclass_lbl',
       'education_lbl', 'marital_status_lbl', 'occupation_lbl',
       'relationship_lbl', 'race_lbl', 'native_country_lbl'],
      dtype='object')


from sklearn.model_selection import train_test_split


# Features used by every model below.
sel = ['age', 'education_num', 'sex']

# Feature matrix and target for all labelled rows, plus the feature matrix
# of the rows that predictions will be made for.
X_tr_all = train_n.loc[:, sel]
y_tr_all = train_n.target
X_test_all = test_n.loc[:, sel]

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_tr_all,
    y_tr_all,
    test_size=0.3,
    random_state=77,
)


from sklearn.linear_model import LogisticRegression


# Baseline: logistic regression with default settings, fitted on the
# training split.
model = LogisticRegression().fit(X_train, y_train)

# (training accuracy, hold-out accuracy)
tuple(
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.7960403641548756, 0.7901471529110684)


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Candidate ensemble classifiers, each constructed with default arguments.
model_list = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Fit every candidate on the training split and print its accuracy on the
# training and hold-out splits. Each line is prefixed with the estimator's
# class name so the scores can be matched to the model that produced them.
for model in model_list:
    model.fit(X_train, y_train)

    ac_tr = model.score(X_train, y_train)
    ac_test = model.score(X_test, y_test)

    print(type(model).__name__, ac_tr, ac_test)

0.8174838214324888 0.7850287907869482
0.8044861248217615 0.7982085732565579
0.8076669957222771 0.7980806142034549


# Final model: gradient boosting with default settings.
# NOTE(review): this is fitted on X_train (the 70% split) rather than on all
# labelled rows, and no random_state is passed, so repeated runs may produce
# slightly different predictions. Confirm whether that is intended.
model = GradientBoostingClassifier().fit(X_train, y_train)

# Predict the target for the rows that have no label.
pred = model.predict(X_test_all)


# Store the predictions in the submission frame and write it to disk without
# the index column.
sub['prediction'] = pred
sub.to_csv("secondSub4th_gb.csv", index=False)


### score : 0.80939

	id	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	hours_per_week	native_country	income	target
0	0	40	Private	168538	HS-grad	9	Married-civ-spouse	Sales	Husband	White	Male	60	United-States	>50K	1
1	1	17	Private	101626	9th	5	Never-married	Machine-op-inspct	Own-child	White	Male	20	United-States	<=50K	0
2	2	18	Private	353358	Some-college	10	Never-married	Other-service	Own-child	White	Male	16	United-States	<=50K	0
3	3	21	Private	151158	Some-college	10	Never-married	Prof-specialty	Own-child	White	Female	25	United-States	<=50K	0
4	4	24	Private	122234	Some-college	10	Never-married	Adm-clerical	Not-in-family	Black	Female	20	?	<=50K	0

캐글 코리아 4차 대회

학습 내용

목차

01. 라이브러리 임포트 및 데이터 준비

데이터 탐색

02. 데이터 전처리

라벨 인코딩

03. 모델 구축하기

로지스틱 모델 만들기

다른 모델 확인해 보기

최종 모델

	id	age	fnlwgt	education_num	sex	hours_per_week	target	workclass_lbl	education_lbl	marital_status_lbl	occupation_lbl	relationship_lbl	race_lbl	native_country_lbl
0	0	28	67661	10	2	40	999	4	15	4	1	2	4	39
1	1	40	37869	9	1	50	999	5	11	2	4	0	4	39
2	2	20	109952	10	1	25	999	4	15	4	6	3	4	39

	id	age	fnlwgt	education_num	sex	capital_gain	capital_loss	hours_per_week	target	workclass_lbl	education_lbl	marital_status_lbl	occupation_lbl	relationship_lbl	race_lbl	native_country_lbl
0	0	40	168538	9	1	0	0	60	1	4	11	2	12	0	4	39
1	1	17	101626	5	1	0	0	20	0	4	6	4	7	3	4	39
2	2	18	353358	10	1	0	0	16	0	4	15	4	8	3	4	39
3	3	21	151158	10	2	0	0	25	0	4	15	4	10	3	4	39
4	4	24	122234	10	2	0	0	20	0	4	15	4	1	1	2	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6507	6507	35	61343	13	1	0	0	40	999	4	9	2	12	0	4	39
6508	6508	41	32185	13	1	0	0	40	999	5	9	2	13	0	4	39
6509	6509	39	409189	3	1	0	0	40	999	4	4	2	8	0	4	26
6510	6510	35	180342	9	1	0	0	40	999	4	11	2	3	0	4	39
6511	6511	28	156819	9	2	0	0	36	999	4	11	0	6	4	4	39