import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('data/4th_kaggle/train.csv')
test = pd.read_csv('data/4th_kaggle/test.csv')
sub = pd.read_csv('data/4th_kaggle/sample_submission.csv')
Data description
age : age
workclass : type of employment
fnlwgt : sampling weight indicating how many people the record represents (short for "final weight")
education : education level (highest attained)
education_num : education level encoded as a number
marital_status : marital status
occupation : occupation
relationship : relationship within the household
race : race
sex : sex
capital_gain : capital gains
capital_loss : capital losses
hours_per_week : hours worked per week
native_country : country of origin
income : income bracket (the value to predict, i.e. the target variable)
print("학습용 데이터 : ", train.shape)
print("테스트용 데이터 : ", test.shape)
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
y = train['income']
test['income'] = "blank"
all_dat = pd.concat([train, test], axis=0)
print(all_dat.shape)
(32561, 16)
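Because train and test both keep their own 0-based row labels, the stacked frame contains duplicate index labels. A minimal optional sketch (not applied in this notebook) avoids that by resetting the index right after concatenation; the name all_dat_reset is hypothetical:

# Optional (sketch): rebuild the combined frame with a fresh, unique 0..32560 index
all_dat_reset = pd.concat([train, test], axis=0).reset_index(drop=True)
print(all_dat_reset.index.is_unique)  # True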
all_dat.income.value_counts()
<=50K    19744
blank     6512
>50K      6305
Name: income, dtype: int64
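These counts can also be read as proportions of the labelled (non-blank) rows; a quick optional sketch:

# Proportion of each label among the non-blank (training) rows (sketch)
print(all_dat.loc[all_dat['income'] != "blank", 'income'].value_counts(normalize=True))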
sns.countplot(x="income", data=all_dat)
<AxesSubplot:xlabel='income', ylabel='count'>
all_dat.loc[ all_dat['income']=='>50K' , 'target'] = 1
all_dat.loc[ all_dat['income']=='<=50K' , 'target'] = 0
all_dat.loc[ all_dat['income']=='blank' , 'target'] = 999
all_dat['target'] = all_dat.target.astype("int")
all_dat.head()
| | id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | Female | 0 | 0 | 25 | United-States | <=50K | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | Female | 0 | 0 | 20 | ? | <=50K | 0 |
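The three .loc assignments above can also be written as a single mapping; a minimal, equivalent sketch using Series.map:

# Equivalent one-liner (sketch): map income labels straight to the target codes
all_dat['target'] = all_dat['income'].map({'>50K': 1, '<=50K': 0, 'blank': 999}).astype('int')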
all_dat.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income', 'target'], dtype='object')
sel_cat = ['workclass', 'education', 'marital_status',
'occupation', 'relationship', 'race',
'sex', 'native_country' ]
X_cat = all_dat[sel_cat]
y = all_dat['target']
X_dummy = pd.get_dummies(X_cat)
X_dummy
| | workclass_? | workclass_Federal-gov | workclass_Local-gov | workclass_Never-worked | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | workclass_Without-pay | education_10th | ... | native_country_Portugal | native_country_Puerto-Rico | native_country_Scotland | native_country_South | native_country_Taiwan | native_country_Thailand | native_country_Trinadad&Tobago | native_country_United-States | native_country_Vietnam | native_country_Yugoslavia |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6508 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6509 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6510 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6511 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
32561 rows × 102 columns
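pd.get_dummies keeps every level of every category (102 columns here). If a linear model such as LogisticRegression were used later, dropping one level per category avoids perfect collinearity; a hedged sketch of that variant (not what is used below):

# Variant (sketch): drop the first level of each category to remove redundant columns
X_dummy_reduced = pd.get_dummies(X_cat, drop_first=True)
print(X_dummy_reduced.shape)  # fewer columns than the full dummy matrix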
all_dat_n = pd.concat([all_dat, X_dummy], axis=1)
all_dat_n
| | id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country_Portugal | native_country_Puerto-Rico | native_country_Scotland | native_country_South | native_country_Taiwan | native_country_Thailand | native_country_Trinadad&Tobago | native_country_United-States | native_country_Vietnam | native_country_Yugoslavia |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 6507 | 35 | Private | 61343 | Bachelors | 13 | Married-civ-spouse | Sales | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6508 | 6508 | 41 | Self-emp-inc | 32185 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6509 | 6509 | 39 | Private | 409189 | 5th-6th | 3 | Married-civ-spouse | Other-service | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6510 | 6510 | 35 | Private | 180342 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6511 | 6511 | 28 | Private | 156819 | HS-grad | 9 | Divorced | Handlers-cleaners | Unmarried | White | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
32561 rows × 119 columns
sel_cat = ['workclass', 'education', 'marital_status',
'occupation', 'relationship', 'race',
'sex', 'native_country', 'income']
all_dat_n = all_dat_n.drop(sel_cat, axis=1)
train_n = all_dat_n.loc[ (all_dat_n['target']==0) |
(all_dat_n['target']==1) , : ]
test_n = all_dat_n.loc[ all_dat_n['target']==999 , : ]
print(train_n.shape, test_n.shape)
(26049, 110) (6512, 110)
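A quick sanity check (a sketch, using the frames defined above) confirms that the re-split reproduces the original row counts:

# The labelled rows should match train, the 999-coded rows should match test
assert train_n.shape[0] == train.shape[0]
assert test_n.shape[0] == test.shape[0]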
X = train_n.drop(['target'], axis=1)
y = train_n['target']
test_X = test_n.drop(['target'], axis=1)
print(X.shape, y.shape, test_X.shape)
(26049, 109) (26049,) (6512, 109)
X.columns
Index(['id', 'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week', 'workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov', ... 'native_country_Portugal', 'native_country_Puerto-Rico', 'native_country_Scotland', 'native_country_South', 'native_country_Taiwan', 'native_country_Thailand', 'native_country_Trinadad&Tobago', 'native_country_United-States', 'native_country_Vietnam', 'native_country_Yugoslavia'], dtype='object', length=109)
type(X)
pandas.core.frame.DataFrame
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
sel = ['age', 'fnlwgt', 'capital_gain']
X_tr_all = X[sel]
y_tr_all = y
X_test_all = test_X[sel]
X_train, X_test, y_train, y_test = train_test_split(X_tr_all,
y_tr_all,
test_size=0.3,
random_state=77)
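About 24% of the labelled rows are >50K (6305 of 26049), so a stratified split keeps that ratio in both folds; a minimal sketch of that variant (random_state kept the same):

# Variant (sketch): stratify on y so both splits keep the >50K / <=50K ratio
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_tr_all, y_tr_all, test_size=0.3, random_state=77, stratify=y_tr_all)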
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import time
model_list = ["RandomForestRegressor", "xgb_basic", "lightgbm-model",
"GradientBoostingClassifier", "LogisticRegression"]
model_score = []
model_time = []
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
now_time = time.time()
model = RandomForestRegressor(random_state=30)
model.fit(X_train, y_train)  # fit once; cross_val_score below refits a clone on each fold
score = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("Elapsed time :", pro_time) # elapsed time
print("RandomForestRegressor Score : {}".format(np.mean( score ) )) # mean CV score
[0.7204471  0.73955972 0.73611203 0.72815306 0.71192845]
Elapsed time : 13.336974143981934
RandomForestRegressor Score : 0.7272400712073421
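Because the target is binary 0/1, a classification counterpart is the more conventional choice; a minimal sketch (not the run above) that lets the same roc_auc scorer work from predicted probabilities:

from sklearn.ensemble import RandomForestClassifier

# Sketch: classifier counterpart; cross_val_score uses predict_proba for roc_auc
clf = RandomForestClassifier(random_state=30)
clf_score = cross_val_score(clf, X_train, y_train, cv=5, scoring="roc_auc")
print("RandomForestClassifier Score : {}".format(np.mean(clf_score)))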
now_time = time.time()
xg_reg = xgb.XGBRegressor(objective ='reg:logistic',
colsample_bytree = 0.3, # fraction of features sampled per tree
learning_rate = 0.1,
max_depth = 3,
alpha = 0.1,
n_estimators = 100) # n_estimators=100
xg_reg.fit(X_train, y_train)
score = cross_val_score(xg_reg, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("걸린 시간 :", pro_time) # 걸린 시간
print("xgboosting Score : {}".format(np.mean( score ) )) # 점수
[0.79294036 0.79022102 0.77922751 0.7732279 0.76699142] 걸린 시간 : 1.699568271636963 xgboosting Score : 0.7805216400578529
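xgboost also ships a scikit-learn classifier wrapper; a hedged sketch with the same hyperparameters (alpha passed via reg_alpha):

# Sketch: classifier interface with the same settings as the regressor above
xg_clf = xgb.XGBClassifier(colsample_bytree=0.3, learning_rate=0.1,
                           max_depth=3, reg_alpha=0.1, n_estimators=100)
clf_score = cross_val_score(xg_clf, X_train, y_train, cv=5, scoring="roc_auc")
print("XGBClassifier Score : {}".format(np.mean(clf_score)))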
now_time = time.time()
m_lgbm1 = lgb.LGBMRegressor()
m_lgbm1.fit(X_train, y_train)
score = cross_val_score(m_lgbm1, X_train, y_train, cv=5, scoring="roc_auc")
print(score)
pro_time = time.time() - now_time
print("걸린 시간 :", pro_time) # 걸린 시간
print("LightGBM 모델 Score : {}".format(np.mean( score ) )) # 점수
[0.77682267 0.78360496 0.77297815 0.76017757 0.75535851] 걸린 시간 : 0.670039176940918 LightGBM 모델 Score : 0.7697883701739261
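LightGBM likewise provides a classifier wrapper; a minimal sketch with default settings, scored the same way:

# Sketch: LGBMClassifier counterpart
m_lgbm_clf = lgb.LGBMClassifier()
clf_score = cross_val_score(m_lgbm_clf, X_train, y_train, cv=5, scoring="roc_auc")
print("LGBMClassifier Score : {}".format(np.mean(clf_score)))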
model = xgb.XGBRegressor(objective ='reg:logistic',
colsample_bytree = 0.3, # fraction of features sampled per tree
learning_rate = 0.1,
max_depth = 3,
alpha = 0.1,
n_estimators = 100) # n_estimators=100
model.fit(X_train, y_train)
pred = model.predict(X_test_all)
pred
array([0.10635718, 0.2864344 , 0.00805269, ..., 0.31812307, 0.23884018, 0.12535065], dtype=float32)
pred = np.where(pred > 0.32, 1, 0)
np.sum(pred==1)
1564
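The 0.32 cut-off is hand-picked. One way to choose it less arbitrarily is to scan thresholds on the held-out 30% split; a sketch assuming F1 as the selection metric:

from sklearn.metrics import f1_score

# Sketch: score the held-out split and keep the threshold with the best F1
val_pred = model.predict(X_test)          # continuous scores from the fitted regressor
thresholds = np.arange(0.1, 0.9, 0.01)
best_t = max(thresholds, key=lambda t: f1_score(y_test, (val_pred > t).astype(int)))
print("best threshold :", best_t)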
sub['prediction'] = pred
sub.to_csv("thirdSub4th_xgb2.csv", index=False)