import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/4th_kaggle/train.csv',
        'data/4th_kaggle/test.csv',
        'data/4th_kaggle/sample_submission.csv',
    )
)
데이터 정보
age : 나이
workclass : 고용 형태
fnlwgt : 해당 행(응답자)이 대표하는 인구 수를 나타내는 표본 가중치 (final weight의 약자)
education : 교육 수준 (최종 학력)
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 직업 (직종)
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 연소득 (예측해야 하는 값, target variable). '>50K'(5만 달러 초과) 또는 '<=50K'(5만 달러 이하) 두 값을 가지며, 50K는 $50,000를 뜻함
# Show the column labels of the training frame.
train.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'], dtype='object')
# Show the column labels of the test frame.
test.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country'], dtype='object')
# Show the column labels of the sample-submission frame.
sub.columns
Index(['id', 'prediction'], dtype='object')
# Report the (rows, columns) shape of the training and test frames.
for caption, frame in (("학습용 데이터 : ", train), ("테스트용 데이터 : ", test)):
    print(caption, frame.shape)
학습용 데이터 : (26049, 16) 테스트용 데이터 : (6512, 15)
# Count missing values per column in the training frame.
train.isna().sum()
id 0 age 0 workclass 0 fnlwgt 0 education 0 education_num 0 marital_status 0 occupation 0 relationship 0 race 0 sex 0 capital_gain 0 capital_loss 0 hours_per_week 0 native_country 0 income 0 dtype: int64
# Count missing values per column in the test frame.
test.isna().sum()
id 0 age 0 workclass 0 fnlwgt 0 education 0 education_num 0 marital_status 0 occupation 0 relationship 0 race 0 sex 0 capital_gain 0 capital_loss 0 hours_per_week 0 native_country 0 dtype: int64
# Print the training frame's summary: column dtypes, non-null counts and memory usage.
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 26049 entries, 0 to 26048 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 26049 non-null int64 1 age 26049 non-null int64 2 workclass 26049 non-null object 3 fnlwgt 26049 non-null int64 4 education 26049 non-null object 5 education_num 26049 non-null int64 6 marital_status 26049 non-null object 7 occupation 26049 non-null object 8 relationship 26049 non-null object 9 race 26049 non-null object 10 sex 26049 non-null object 11 capital_gain 26049 non-null int64 12 capital_loss 26049 non-null int64 13 hours_per_week 26049 non-null int64 14 native_country 26049 non-null object 15 income 26049 non-null object dtypes: int64(7), object(9) memory usage: 3.2+ MB
# Print the test frame's summary: column dtypes, non-null counts and memory usage.
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6512 entries, 0 to 6511 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 6512 non-null int64 1 age 6512 non-null int64 2 workclass 6512 non-null object 3 fnlwgt 6512 non-null int64 4 education 6512 non-null object 5 education_num 6512 non-null int64 6 marital_status 6512 non-null object 7 occupation 6512 non-null object 8 relationship 6512 non-null object 9 race 6512 non-null object 10 sex 6512 non-null object 11 capital_gain 6512 non-null int64 12 capital_loss 6512 non-null int64 13 hours_per_week 6512 non-null int64 14 native_country 6512 non-null object dtypes: int64(7), object(8) memory usage: 763.2+ KB
# List the distinct labels found in the income column of the training frame.
train.income.unique()
array(['>50K', '<=50K'], dtype=object)
# Bar chart of how many training rows fall under each income label.
sns.countplot(x="income", data=train)
<AxesSubplot:xlabel='income', ylabel='count'>
# Encode the income label as an integer target column: '>50K' -> 1, '<=50K' -> 0.
# Any other label becomes NaN, so the integer cast below fails instead of mislabelling the row.
income_to_target = {'>50K': 1, '<=50K': 0}
train['target'] = train['income'].map(income_to_target).astype("int")
# Preview the first rows of the training frame, including the new target column.
train.head()
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40 | Private | 168538 | HS-grad | 9 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
1 | 1 | 17 | Private | 101626 | 9th | 5 | Never-married | Machine-op-inspct | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 |
2 | 2 | 18 | Private | 353358 | Some-college | 10 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 16 | United-States | <=50K | 0 |
3 | 3 | 21 | Private | 151158 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | Female | 0 | 0 | 25 | United-States | <=50K | 0 |
4 | 4 | 24 | Private | 122234 | Some-college | 10 | Never-married | Adm-clerical | Not-in-family | Black | Female | 0 | 0 | 20 | ? | <=50K | 0 |
# Preview the first rows of the test frame.
test.head()
id | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 28 | Private | 67661 | Some-college | 10 | Never-married | Adm-clerical | Other-relative | White | Female | 0 | 0 | 40 | United-States |
1 | 1 | 40 | Self-emp-inc | 37869 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 50 | United-States |
2 | 2 | 20 | Private | 109952 | Some-college | 10 | Never-married | Handlers-cleaners | Own-child | White | Male | 0 | 0 | 25 | United-States |
3 | 3 | 40 | Private | 114537 | Assoc-voc | 11 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 50 | United-States |
4 | 4 | 37 | Private | 51264 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 99 | France |
# Show the training frame's column labels again, now that 'target' has been added.
train.columns
Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income', 'target'], dtype='object')
# Numeric columns used as model inputs.
# 'id' is deliberately excluded: it is only a running row number (0, 1, 2, ... in both the
# training and the test file), so it carries no information about income and the same id
# refers to different people in the two files. Feeding it to the model only adds noise.
sel = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Feature matrix and binary target for training, plus the matching feature matrix for the test file.
X = train[sel]
y = train['target']
test_X = test[sel]
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Print the shapes of the four resulting pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
(19536, 7) (6513, 7) (19536,) (6513,)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The raw features live on very different scales (fnlwgt is in the hundreds of thousands,
# education_num is below 20). On such data the default solver can hit its 100-iteration cap
# before converging, and the resulting warning is hidden by the file-level
# warnings.filterwarnings('ignore'). Standardising the inputs and raising max_iter avoids that.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Score the model on the held-out split so its quality is known before submitting.
print("validation accuracy : ", model.score(X_test, y_test))

# Predict labels for the competition test set; the pipeline applies the same scaling first.
pred = model.predict(test_X)
# Re-check the submission frame's column labels before filling in predictions.
sub.columns
Index(['id', 'prediction'], dtype='object')
# Print the shape of the submission frame and of the prediction array so their
# row counts can be compared before assignment.
for table in (sub, pred):
    print(table.shape)
(6512, 2) (6512,)
# Put the predicted labels into the 'prediction' column and write the submission
# file without the index column.
sub = sub.assign(prediction=pred)
sub.to_csv("firstSub4th.csv", index=False)