import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('data/4th_kaggle/train.csv')
test = pd.read_csv('data/4th_kaggle/test.csv')
sub = pd.read_csv('data/4th_kaggle/sample_submission.csv')
Data description
age : age
workclass : type of employment
fnlwgt : sampling weight indicating how many people the record represents (short for "final weight")
education : education level (highest attained)
education_num : education level encoded as a number
marital_status : marital status
occupation : occupation
relationship : relationship within the household
race : race
sex : sex
capital_gain : capital gains
capital_loss : capital losses
hours_per_week : hours worked per week
native_country : country of origin
income : income bracket (the value to predict, i.e. the target variable)
print("학습용 데이터 : ", train.shape)
print("테스트용 데이터 : ", test.shape)
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
y = train['income']
test['income'] = "blank"
all_dat = pd.concat([train, test], axis=0)
print(all_dat.shape)
(32561, 16)
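Because train and test both keep their own 0-based row labels, the stacked frame contains duplicate index labels. A minimal optional sketch (not applied in this notebook) avoids that by resetting the index right after concatenation; the name all_dat_reset is hypothetical:

# Optional (sketch): rebuild the combined frame with a fresh, unique 0..32560 index
all_dat_reset = pd.concat([train, test], axis=0).reset_index(drop=True)
print(all_dat_reset.index.is_unique)  # True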
all_dat.income.value_counts()
<=50K    19744
blank     6512
>50K      6305
Name: income, dtype: int64
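These counts can also be read as proportions of the labelled (non-blank) rows; a quick optional sketch:

# Proportion of each label among the non-blank (training) rows (sketch)
print(all_dat.loc[all_dat['income'] != "blank", 'income'].value_counts(normalize=True))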
sns.countplot(x="income", data=all_dat)
<AxesSubplot:xlabel='income', ylabel='count'>
all_dat.loc[ all_dat['income']=='>50K' , 'target'] = 1
all_dat.loc[ all_dat['income']=='<=50K' , 'target'] = 0
all_dat.loc[ all_dat['income']=='blank' , 'target'] = 999
all_dat['target'] = all_dat.target.astype("int")
all_dat.head()
| | id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | Female | 0 | 0 | 25 | United-States | <=50K | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | Female | 0 | 0 | 20 | ? | <=50K | 0 |
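The three .loc assignments above can also be written as a single mapping; a minimal, equivalent sketch using Series.map:

# Equivalent one-liner (sketch): map income labels straight to the target codes
all_dat['target'] = all_dat['income'].map({'>50K': 1, '<=50K': 0, 'blank': 999}).astype('int')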
all_dat.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income', 'target'], dtype='object')
sel_cat = ['workclass', 'education', 'marital_status',
'occupation', 'relationship', 'race',
'sex', 'native_country' ]
X_cat = all_dat[sel_cat]
y = all_dat['target']
X_dummy = pd.get_dummies(X_cat)
X_dummy
| | workclass_? | workclass_Federal-gov | workclass_Local-gov | workclass_Never-worked | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | workclass_Without-pay | education_10th | ... | native_country_Portugal | native_country_Puerto-Rico | native_country_Scotland | native_country_South | native_country_Taiwan | native_country_Thailand | native_country_Trinadad&Tobago | native_country_United-States | native_country_Vietnam | native_country_Yugoslavia |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6508 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6509 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6510 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6511 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
32561 rows × 102 columns
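pd.get_dummies keeps every level of every category (102 columns here). If a linear model such as LogisticRegression were used later, dropping one level per category avoids perfect collinearity; a hedged sketch of that variant (not what is used below):

# Variant (sketch): drop the first level of each category to remove redundant columns
X_dummy_reduced = pd.get_dummies(X_cat, drop_first=True)
print(X_dummy_reduced.shape)  # fewer columns than the full dummy matrix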
all_dat_n = pd.concat([all_dat, X_dummy], axis=1)
all_dat_n
| | id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country_Portugal | native_country_Puerto-Rico | native_country_Scotland | native_country_South | native_country_Taiwan | native_country_Thailand | native_country_Trinadad&Tobago | native_country_United-States | native_country_Vietnam | native_country_Yugoslavia |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 6507 | 35 | Private | 61343 | Bachelors | 13 | Married-civ-spouse | Sales | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6508 | 6508 | 41 | Self-emp-inc | 32185 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6509 | 6509 | 39 | Private | 409189 | 5th-6th | 3 | Married-civ-spouse | Other-service | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6510 | 6510 | 35 | Private | 180342 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6511 | 6511 | 28 | Private | 156819 | HS-grad | 9 | Divorced | Handlers-cleaners | Unmarried | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
32561 rows × 119 columns
sel_cat = ['workclass', 'education', 'marital_status',
'occupation', 'relationship', 'race',
'sex', 'native_country', 'income']
all_dat_n = all_dat_n.drop(sel_cat, axis=1)
train_n = all_dat_n.loc[ (all_dat_n['target']==0) |
(all_dat_n['target']==1) , : ]
test_n = all_dat_n.loc[ all_dat_n['target']==999 , : ]
print(train_n.shape, test_n.shape)
(26049, 110) (6512, 110)
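A quick sanity check (a sketch, using the frames defined above) confirms that the re-split reproduces the original row counts:

# The labelled rows should match train, the 999-coded rows should match test
assert train_n.shape[0] == train.shape[0]
assert test_n.shape[0] == test.shape[0]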
X = train_n.drop(['target'], axis=1)
y = train_n['target']
test_X = test_n.drop(['target'], axis=1)
print(X.shape, y.shape, test_X.shape)
(26049, 109) (26049,) (6512, 109)
X.columns
Index(['id', 'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov', ... 'native_country_Portugal', 'native_country_Puerto-Rico', 'native_country_Scotland', 'native_country_South', 'native_country_Taiwan', 'native_country_Thailand', 'native_country_Trinadad&Tobago', 'native_country_United-States', 'native_country_Vietnam', 'native_country_Yugoslavia'], dtype='object', length=109)
type(X)
pandas.core.frame.DataFrame
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
sel = ['age', 'fnlwgt', 'capital_gain']
X_tr_all = X[sel]
y_tr_all = y
X_test_all = test_X[sel]
X_train, X_test, y_train, y_test = train_test_split(X_tr_all,
y_tr_all,
test_size=0.3,
random_state=77)
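About 24% of the labelled rows are >50K (6305 of 26049), so a stratified split keeps that ratio in both folds; a minimal sketch of that variant (random_state kept the same):

# Variant (sketch): stratify on y so both splits keep the >50K / <=50K ratio
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_tr_all, y_tr_all, test_size=0.3, random_state=77, stratify=y_tr_all)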
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import time
model_list = ["RandomForestRegressor", "xgb_basic", "lightgbm-model",
"GradientBoostingClassifier", "LogisticRegression"]
model_score = []
model_time = []
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
now_time = time.time()
model = RandomForestRegressor(random_state=30)
model.fit(X_train, y_train)  # fit once; cross_val_score below refits a clone on each fold
score = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("Elapsed time :", pro_time) # elapsed time
print("RandomForestRegressor Score : {}".format(np.mean( score ) )) # mean CV score
[0.7204471  0.73955972 0.73611203 0.72815306 0.71192845]
Elapsed time : 13.336974143981934
RandomForestRegressor Score : 0.7272400712073421
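Because the target is binary 0/1, a classification counterpart is the more conventional choice; a minimal sketch (not the run above) that lets the same roc_auc scorer work from predicted probabilities:

from sklearn.ensemble import RandomForestClassifier

# Sketch: classifier counterpart; cross_val_score uses predict_proba for roc_auc
clf = RandomForestClassifier(random_state=30)
clf_score = cross_val_score(clf, X_train, y_train, cv=5, scoring="roc_auc")
print("RandomForestClassifier Score : {}".format(np.mean(clf_score)))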
now_time = time.time()
xg_reg = xgb.XGBRegressor(objective ='reg:logistic',
colsample_bytree = 0.3, # fraction of features sampled per tree
learning_rate = 0.1,
max_depth = 3,
alpha = 0.1,
n_estimators = 100) # n_estimators=100
xg_reg.fit(X_train, y_train)
score = cross_val_score(xg_reg, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("걸린 시간 :", pro_time) # 걸린 시간
print("xgboosting Score : {}".format(np.mean( score ) )) # 점수
[0.79294036 0.79022102 0.77922751 0.7732279 0.76699142] 걸린 시간 : 1.699568271636963 xgboosting Score : 0.7805216400578529
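xgboost also ships a scikit-learn classifier wrapper; a hedged sketch with the same hyperparameters (alpha passed via reg_alpha):

# Sketch: classifier interface with the same settings as the regressor above
xg_clf = xgb.XGBClassifier(colsample_bytree=0.3, learning_rate=0.1,
                           max_depth=3, reg_alpha=0.1, n_estimators=100)
clf_score = cross_val_score(xg_clf, X_train, y_train, cv=5, scoring="roc_auc")
print("XGBClassifier Score : {}".format(np.mean(clf_score)))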
now_time = time.time()
m_lgbm1 = lgb.LGBMRegressor()
m_lgbm1.fit(X_train, y_train)
score = cross_val_score(m_lgbm1, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("걸린 시간 :", pro_time) # 걸린 시간
print("LightGBM 모델 Score : {}".format(np.mean( score ) )) # 점수
[0.77682267 0.78360496 0.77297815 0.76017757 0.75535851] 걸린 시간 : 0.670039176940918 LightGBM 모델 Score : 0.7697883701739261
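LightGBM likewise provides a classifier wrapper; a minimal sketch with default settings, scored the same way:

# Sketch: LGBMClassifier counterpart
m_lgbm_clf = lgb.LGBMClassifier()
clf_score = cross_val_score(m_lgbm_clf, X_train, y_train, cv=5, scoring="roc_auc")
print("LGBMClassifier Score : {}".format(np.mean(clf_score)))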
model = xgb.XGBRegressor(objective ='reg:logistic',
colsample_bytree = 0.3, # fraction of features sampled per tree
learning_rate = 0.1,
max_depth = 3,
alpha = 0.1,
n_estimators = 100) # n_estimators=100
model.fit(X_train, y_train)
pred = model.predict(X_test_all)
pred
array([0.10635718, 0.2864344 , 0.00805269, ..., 0.31812307, 0.23884018, 0.12535065], dtype=float32)
pred = np.where(pred > 0.32, 1, 0)
np.sum(pred==1)
1564
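The 0.32 cut-off is hand-picked. One way to choose it less arbitrarily is to scan thresholds on the held-out 30% split; a sketch assuming F1 as the selection metric:

from sklearn.metrics import f1_score

# Sketch: score the held-out split and keep the threshold with the best F1
val_pred = model.predict(X_test)          # continuous scores from the fitted regressor
thresholds = np.arange(0.1, 0.9, 0.01)
best_t = max(thresholds, key=lambda t: f1_score(y_test, (val_pred > t).astype(int)))
print("best threshold :", best_t)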
sub['prediction'] = pred
sub.to_csv("thirdSub4th_xgb2.csv", index=False)