import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('data/4th_kaggle/train.csv')
test = pd.read_csv('data/4th_kaggle/test.csv')
sub = pd.read_csv('data/4th_kaggle/sample_submission.csv')
데이터 정보
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준 (최종 학력)
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 업종
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 수익 (예측해야 하는 값, target variable)
print("학습용 데이터 : ", train.shape)
print("테스트용 데이터 : ", test.shape)
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
y = train['income']
test['income'] = "blank"
all_dat = pd.concat([train, test], axis=0)
print(all_dat.shape)
(32561, 16)
all_dat.income.value_counts()
<=50K 19744 blank 6512 >50K 6305 Name: income, dtype: int64
all_dat.loc[ all_dat['income']=='>50K' , 'target'] = 1
all_dat.loc[ all_dat['income']=='<=50K' , 'target'] = 0
all_dat.loc[ all_dat['income']=='blank' , 'target'] = 999
all_dat['target'] = all_dat.target.astype("int")
from sklearn.preprocessing import LabelEncoder
en_x = LabelEncoder()
all_dat['workclass_lbl'] = en_x.fit_transform(all_dat['workclass'])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target | workclass_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 | 4 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 | 4 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 | 4 |
all_dat['education_lbl'] = en_x.fit_transform(all_dat['education'])
all_dat['marital_status_lbl'] = en_x.fit_transform(all_dat['marital_status'])
all_dat['occupation_lbl'] = en_x.fit_transform(all_dat['occupation'])
all_dat['relationship_lbl'] = en_x.fit_transform(all_dat['relationship'])
all_dat['race_lbl'] = en_x.fit_transform(all_dat['race'])
all_dat['native_country_lbl'] = en_x.fit_transform(all_dat['native_country'])
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
mf_mapping = {"Male": 1, "Female": 2}
all_dat['sex'] = all_dat['sex'].map(mf_mapping)
all_dat.head(3)
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | ... | native_country | income | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | ... | United-States | >50K | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | ... | United-States | <=50K | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | ... | United-States | <=50K | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 rows × 24 columns
sel_cat = ['workclass', 'education', 'marital_status',
'occupation', 'relationship', 'race', 'native_country', 'income' ]
all_dat_n = all_dat.drop(sel_cat, axis=1)
all_dat_n
id | age | fnlwgt | education_num | sex | capital_gain | capital_loss | hours_per_week | target | workclass_lbl | education_lbl | marital_status_lbl | occupation_lbl | relationship_lbl | race_lbl | native_country_lbl | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | 168538 | 9 | 1 | 0 | 0 | 60 | 1 | 4 | 11 | 2 | 12 | 0 | 4 | 39 |
1 | 1 | 17 | 101626 | 5 | 1 | 0 | 0 | 20 | 0 | 4 | 6 | 4 | 7 | 3 | 4 | 39 |
2 | 2 | 18 | 353358 | 10 | 1 | 0 | 0 | 16 | 0 | 4 | 15 | 4 | 8 | 3 | 4 | 39 |
3 | 3 | 21 | 151158 | 10 | 2 | 0 | 0 | 25 | 0 | 4 | 15 | 4 | 10 | 3 | 4 | 39 |
4 | 4 | 24 | 122234 | 10 | 2 | 0 | 0 | 20 | 0 | 4 | 15 | 4 | 1 | 1 | 2 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6507 | 6507 | 35 | 61343 | 13 | 1 | 0 | 0 | 40 | 999 | 4 | 9 | 2 | 12 | 0 | 4 | 39 |
6508 | 6508 | 41 | 32185 | 13 | 1 | 0 | 0 | 40 | 999 | 5 | 9 | 2 | 13 | 0 | 4 | 39 |
6509 | 6509 | 39 | 409189 | 3 | 1 | 0 | 0 | 40 | 999 | 4 | 4 | 2 | 8 | 0 | 4 | 26 |
6510 | 6510 | 35 | 180342 | 9 | 1 | 0 | 0 | 40 | 999 | 4 | 11 | 2 | 3 | 0 | 4 | 39 |
6511 | 6511 | 28 | 156819 | 9 | 2 | 0 | 0 | 36 | 999 | 4 | 11 | 0 | 6 | 4 | 4 | 39 |
32561 rows × 16 columns
X_cat = all_dat_n.drop(['target'],axis=1)
y = all_dat_n['target']
train_n = all_dat_n.loc[ (all_dat_n['target']==0) | (all_dat_n['target']==1) , : ]
test_n = all_dat_n.loc[ all_dat_n['target']==999 , : ]
print(train_n.shape, test_n.shape)
(26049, 16) (6512, 16)
from sklearn.model_selection import train_test_split
sel = ['age', 'education_num', 'sex']
X_tr_all = train_n[sel]
y_tr_all = train_n['target']
X_test_all = test_n[sel]
X_train, X_test, y_train, y_test = train_test_split(X_tr_all,
y_tr_all,
test_size=0.3,
random_state=77)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
model = LogisticRegression()
model.fit(X_train, y_train)
pred_log = model.predict(X_test)
model.score(X_train, y_train), model.score(X_test, y_test),
(0.7960403641548756, 0.7901471529110684)
confusion = confusion_matrix(y_test, pred_log)
print("오차 행렬:\n{}".format(confusion))
오차 행렬: [[5509 380] [1260 666]]
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_log,
target_names=["50K>=", "50K<"]))
precision recall f1-score support 50K>= 0.81 0.94 0.87 5889 50K< 0.64 0.35 0.45 1926 accuracy 0.79 7815 macro avg 0.73 0.64 0.66 7815 weighted avg 0.77 0.79 0.77 7815
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred_rf = model.predict(X_test)
model = AdaBoostClassifier()
model.fit(X_train, y_train)
pred_ada = model.predict(X_test)
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
pred_gr = model.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_rf,
target_names=["50K>=", "50K<"]))
precision recall f1-score support 50K>= 0.82 0.91 0.86 5889 50K< 0.59 0.40 0.48 1926 accuracy 0.78 7815 macro avg 0.71 0.65 0.67 7815 weighted avg 0.77 0.78 0.77 7815
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_ada,
target_names=["50K>=", "50K<"]))
precision recall f1-score support 50K>= 0.82 0.94 0.88 5889 50K< 0.67 0.36 0.47 1926 accuracy 0.80 7815 macro avg 0.74 0.65 0.67 7815 weighted avg 0.78 0.80 0.78 7815
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_gr,
target_names=["50K>=", "50K<"]))
precision recall f1-score support 50K>= 0.83 0.93 0.87 5889 50K< 0.64 0.40 0.50 1926 accuracy 0.80 7815 macro avg 0.74 0.67 0.68 7815 weighted avg 0.78 0.80 0.78 7815
교육용으로 작성된 것으로 배포 및 복제시에 사전 허가가 필요합니다.
Copyright 2022 LIM Co. all rights reserved.