import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the competition CSVs in one pass: the training rows, the test rows,
# and the sample submission that is filled in at the end of the notebook.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/4th_kaggle/train.csv',
        'data/4th_kaggle/test.csv',
        'data/4th_kaggle/sample_submission.csv',
    )
)
데이터 정보
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준 (최종 학력)
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 직업
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 수익 (예측해야 하는 값, target variable)
# Report (rows, columns) for both raw frames.
for label, frame in (("학습용 데이터 : ", train), ("테스트용 데이터 : ", test)):
    print(label, frame.shape)
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
# Keep a handle on the raw income labels of the training rows.
y = train['income']
# Give the test rows a placeholder label so they stay identifiable once the
# two frames are stacked.
test['income'] = "blank"
# Row-wise stacking is pd.concat's default axis.
all_dat = pd.concat((train, test))
print(all_dat.shape)
(32561, 16)
# Row count per income label, including the 'blank' placeholder rows.
all_dat['income'].value_counts()
<=50K 19744 blank 6512 >50K 6305 Name: income, dtype: int64
# Bar chart of the same per-label row counts.
sns.countplot(data=all_dat, x="income")
<AxesSubplot:xlabel='income', ylabel='count'>
# Integer-encode the label: 1 for '>50K', 0 for '<=50K', and the sentinel 999
# for the 'blank' placeholder rows. Any other label would map to NaN and make
# the integer cast fail.
all_dat['target'] = (
    all_dat['income']
    .map({'>50K': 1, '<=50K': 0, 'blank': 999})
    .astype("int")
)
all_dat.head()
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | Female | 0 | 0 | 25 | United-States | <=50K | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | Female | 0 | 0 | 20 | ? | <=50K | 0 |
# List the column names of the combined frame (now including 'target').
all_dat.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income', 'target'], dtype='object')
from sklearn.preprocessing import LabelEncoder
# A single encoder instance; each fit_transform call re-fits it on the column
# it is given.
en_x = LabelEncoder()
# Store the integer codes in a new '_lbl' column, keeping the text column.
all_dat['workclass_lbl'] = en_x.fit_transform(all_dat['workclass'])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target | workclass_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 | 4 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 | 4 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 | 4 |
# Integer-encode the remaining text columns, re-fitting the shared encoder on
# each one and writing the codes to '<column>_lbl'.
for col in ('education', 'marital_status', 'occupation',
            'relationship', 'race', 'native_country'):
    all_dat[f'{col}_lbl'] = en_x.fit_transform(all_dat[col])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
# Inspect the distinct values of 'sex' before mapping them to integers.
all_dat['sex'].unique()
array(['Male', 'Female'], dtype=object)
# Replace the two text labels with integer codes, overwriting the column.
# NOTE: Series.map turns values missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would blank it out.
mf_mapping = dict(Male=1, Female=2)
all_dat['sex'] = all_dat['sex'].map(mf_mapping)
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
# Text-valued columns to discard; their integer encodings ('*_lbl' columns and
# 'target') remain in the frame.
sel_cat = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'native_country', 'income',
]
all_dat_n = all_dat.drop(columns=sel_cat)
all_dat_n
id | age | fnlwgt | education_num | sex | capital_gain | capital_loss | hours_per_week | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | 168538 | 9 | 1 | 0 | 0 | 60 | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | 101626 | 5 | 1 | 0 | 0 | 20 | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | 353358 | 10 | 1 | 0 | 0 | 16 | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 | 3 | 21 | 151158 | 10 | 2 | 0 | 0 | 25 | 0 | 4 | 15 | 4 | 10 | 3 | 4 | 39 |
4 | 4 | 24 | 122234 | 10 | 2 | 0 | 0 | 20 | 0 | 4 | 15 | 4 | 1 | 1 | 2 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 6507 | 35 | 61343 | 13 | 1 | 0 | 0 | 40 | 999 | 4 | 9 | 2 | 12 | 0 | 4 | 39 |
6508 | 6508 | 41 | 32185 | 13 | 1 | 0 | 0 | 40 | 999 | 5 | 9 | 2 | 13 | 0 | 4 | 39 |
6509 | 6509 | 39 | 409189 | 3 | 1 | 0 | 0 | 40 | 999 | 4 | 4 | 2 | 8 | 0 | 4 | 26 |
6510 | 6510 | 35 | 180342 | 9 | 1 | 0 | 0 | 40 | 999 | 4 | 11 | 2 | 3 | 0 | 4 | 39 |
6511 | 6511 | 28 | 156819 | 9 | 2 | 0 | 0 | 36 | 999 | 4 | 11 | 0 | 6 | 4 | 4 | 39 |
32561 rows × 16 columns
# Feature columns and label column of the combined numeric frame.
X_cat = all_dat_n.drop(columns='target')
y = all_dat_n['target']  # rebinds the `y` set earlier from train['income']

# Undo the stacking: rows with a real label (0/1) are training rows, rows
# carrying the 999 sentinel are the test rows.
train_n = all_dat_n[all_dat_n['target'].isin([0, 1])]
test_n = all_dat_n[all_dat_n['target'] == 999]
print(train_n.shape, test_n.shape)
(26049, 16) (6512, 16)
# Preview the first three training rows after the split.
train_n.head(3)
id | age | fnlwgt | education_num | sex | capital_gain | capital_loss | hours_per_week | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | 168538 | 9 | 1 | 0 | 0 | 60 | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | 101626 | 5 | 1 | 0 | 0 | 20 | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | 353358 | 10 | 1 | 0 | 0 | 16 | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
# Preview the first three test rows; 'target' still holds the 999 sentinel.
test_n.head(3)
id | age | fnlwgt | education_num | sex | capital_gain | capital_loss | hours_per_week | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 28 | 67661 | 10 | 2 | 0 | 0 | 40 | 999 | 4 | 15 | 4 | 1 | 2 | 4 | 39 |
1 | 1 | 40 | 37869 | 9 | 1 | 0 | 0 | 50 | 999 | 5 | 11 | 2 | 4 | 0 | 4 | 39 |
2 | 2 | 20 | 109952 | 10 | 1 | 0 | 0 | 25 | 999 | 4 | 15 | 4 | 6 | 3 | 4 | 39 |
# The sentinel-only 'target' column carries no information for test rows.
test_n = test_n.drop(columns=['target'])
print(train_n.shape, test_n.shape)
(26049, 16) (6512, 15)
# Column names available for feature selection.
train_n.columns
Index(['id', 'age', 'fnlwgt', 'education_num', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'target', 'workclass_lbl', 'education_lbl', 'marital_status_lbl', 'occupation_lbl', 'relationship_lbl', 'race_lbl', 'native_country_lbl'], dtype='object')
from sklearn.model_selection import train_test_split

# Only these three columns are used as features here.
sel = ['age', 'education_num', 'sex']
X_tr_all, y_tr_all = train_n[sel], train_n['target']
X_test_all = test_n[sel]

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tr_all, y_tr_all, random_state=77, test_size=0.3
)
from sklearn.linear_model import LogisticRegression
# Baseline classifier with default hyperparameters.
model = LogisticRegression()
model.fit(X_train, y_train)
# Score on the fitting split and on the held-out split, shown as a pair.
model.score(X_train, y_train), model.score(X_test, y_test),
(0.7960403641548756, 0.7901471529110684)
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

# Compare three ensemble classifiers, all with default hyperparameters, on the
# same train / held-out split.
model_list = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]
for model in model_list:
    # The loop body must be indented; at column 0 the `for` header is a
    # syntax error and nothing would run per model.
    model.fit(X_train, y_train)
    ac_tr = model.score(X_train, y_train)
    ac_test = model.score(X_test, y_test)
    # One line per model: score on the fitting split, then on the held-out split.
    print(ac_tr, ac_test)
0.8174838214324888 0.7850287907869482 0.8044861248217615 0.7982085732565579 0.8076669957222771 0.7980806142034549
# Fit a fresh gradient-boosting model and predict the competition test rows.
# NOTE(review): this fits on X_train (the 70% split) rather than X_tr_all —
# confirm whether refitting on all labelled rows was intended.
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test_all)
# Put the predictions into the sample-submission frame and save it to disk.
sub['prediction'] = pred
sub.to_csv("secondSub4th_gb.csv", index=False)
### score : 0.80939