import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

import xgboost as xgb
from xgboost import XGBClassifier

import warnings, gc
warnings.filterwarnings("ignore")


%%time

# Load the training rows and the label table, then attach each customer's
# label to all of that customer's rows (inner join on customer_ID).
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train.parquet")
label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train = pd.merge(train, label, how="inner", on="customer_ID")

CPU times: user 2min, sys: 16.8 s, total: 2min 17s
Wall time: 2min 24s


# Report the merged frame's (rows, columns) and preview its first three rows.
print(train.shape)
train.head(3)

(5531451, 191)


# Replace the customer_ID values with compact integer codes.
# The fitted encoder is kept in `lab` so the codes can be mapped back later.
lab = LabelEncoder().fit(train['customer_ID'])
train['customer_ID'] = lab.transform(train['customer_ID'])
train['customer_ID'].head()

0    0
1    0
2    0
3    0
4    0
Name: customer_ID, dtype: int64


%%time

# Keep only the final row of every customer and index the result by customer.
# NOTE(review): from here on `train` holds exactly one row per customer_ID, so
# the later per-customer aggregations (mean/min/last/count/nunique) each see a
# single row -- confirm that this is intended.
last_rows = train.groupby(['customer_ID']).tail(1)
train = last_rows.set_index('customer_ID')
del last_rows

CPU times: user 1.08 s, sys: 625 ms, total: 1.7 s
Wall time: 1.7 s


# Shape after reducing the training data to one row per customer.
print( train.shape )
train.head()  # not the cell's last expression, so this preview is not displayed
gc.collect()  # free memory; the number shown under the cell is collect()'s return value

(458913, 190)

42


%%time

# Load the test rows (several rows per customer_ID until they are reduced below).
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test.parquet")

CPU times: user 12.5 s, sys: 11.6 s, total: 24.1 s
Wall time: 35.8 s


# Shape of the raw test data.
print(test.shape)
test.head()  # not the cell's last expression, so this preview is not displayed
gc.collect()  # free memory; the number shown under the cell is collect()'s return value

(11363762, 190)

42


# Re-fit the encoder on the test IDs (these codes are unrelated to the training
# codes; after this line `lab` maps test customers only), then keep each
# customer's final row and index the frame by the encoded customer_ID.
test['customer_ID'] = lab.fit_transform(test['customer_ID'])
test_last_rows = test.groupby(['customer_ID']).tail(1)
test = test_last_rows.set_index('customer_ID')
del test_last_rows


# Separate the label from the features and drop the S_2 date column, which is
# not used as a model input.
y = train["target"]
X = train.drop(columns=["target", "S_2"])

test = test.drop(columns=["S_2"])

# Mark missing values with the sentinel -123.
# NOTE(review): the sentinel also takes part in the mean/min aggregations
# computed later, so it leaks into those features.
X = X.fillna(-123)
test = test.fillna(-123)


# Class balance of the target.
y.value_counts()

0    340085
1    118828
Name: target, dtype: int64


# Sanity check: feature/label row counts must agree, and train/test must have
# the same number of feature columns.
print(X.shape, y.shape, test.shape)
gc.collect()

(458913, 188) (458913,) (924621, 188)

42


# 범주형 컬럼
# Columns treated as categorical.
cat_cols = [
    'B_30', 'B_38',
    'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

# Every remaining feature column is treated as numeric (order of X preserved).
num_cols = [name for name in X.columns if name not in cat_cols]

# Two-element list: [categorical column names, numeric column names].
all_cols = [cat_cols, num_cols]


# Show both column groups: categorical names first, numeric names second.
print(all_cols)

[['B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126'], ['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108', 'D_109', 'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145']]


# Split the numeric columns by the first letter of their name (D, S, P, B, R).
D_n_cols, S_n_cols, P_n_cols, B_n_cols, R_n_cols = (
    [name for name in num_cols if name.startswith(prefix)]
    for prefix in ("D", "S", "P", "B", "R")
)

# The categorical columns only occur with the prefixes D and B.
D_c_cols, B_c_cols = (
    [name for name in cat_cols if name.startswith(prefix)]
    for prefix in ("D", "B")
)


# Number of numeric columns per prefix, in the order D, S, P, B, R ...
print( len(D_n_cols),  len(S_n_cols), len(P_n_cols), len(B_n_cols),len(R_n_cols)  )
# ... and number of categorical columns per prefix, in the order D, B.
print( len(D_c_cols),len(B_c_cols) )

87 21 3 38 28
9 2


# Trial run of the per-customer aggregation, restricted to the numeric D_* columns.
print(X.shape)
X_num_agg_D = X.groupby("customer_ID")[D_n_cols].agg(['mean','min', 'last'])

(458913, 188)


# Inspect the trial result: its columns form a (column, aggregation) MultiIndex.
print(X_num_agg_D.shape)
print(X_num_agg_D.columns)

(458913, 261)
MultiIndex([( 'D_39', 'mean'),
            ( 'D_39',  'min'),
            ( 'D_39', 'last'),
            ( 'D_41', 'mean'),
            ( 'D_41',  'min'),
            ( 'D_41', 'last'),
            ( 'D_42', 'mean'),
            ( 'D_42',  'min'),
            ( 'D_42', 'last'),
            ( 'D_43', 'mean'),
            ...
            ('D_142', 'last'),
            ('D_143', 'mean'),
            ('D_143',  'min'),
            ('D_143', 'last'),
            ('D_144', 'mean'),
            ('D_144',  'min'),
            ('D_144', 'last'),
            ('D_145', 'mean'),
            ('D_145',  'min'),
            ('D_145', 'last')],
           length=261)


# Flatten the (column, aggregation) pairs into names such as 'D_39_mean'.
X_num_agg_D.columns = X_num_agg_D.columns.map('_'.join)
print( X_num_agg_D.columns)

# The trial frame was only needed for inspection; drop it and reclaim memory.
del X_num_agg_D
gc.collect()

Index(['D_39_mean', 'D_39_min', 'D_39_last', 'D_41_mean', 'D_41_min',
       'D_41_last', 'D_42_mean', 'D_42_min', 'D_42_last', 'D_43_mean',
       ...
       'D_142_last', 'D_143_mean', 'D_143_min', 'D_143_last', 'D_144_mean',
       'D_144_min', 'D_144_last', 'D_145_mean', 'D_145_min', 'D_145_last'],
      dtype='object', length=261)

42


%%time 

print('피처 엔지니어링 전', X.shape)

# One entry per feature family: (source columns, aggregations to apply).
# The order of the entries and of the aggregations fixes the column order of
# the resulting feature matrix.
# NOTE(review): X currently holds one row per customer, so every aggregation
# below is computed over a single row -- confirm that this is intended.
agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

by_customer = X.groupby("customer_ID")
agg_frames = []
for family_cols, family_funcs in agg_plan:
    family_frame = by_customer[family_cols].agg(family_funcs)
    # Flatten ('D_39', 'mean') into 'D_39_mean'.
    family_frame.columns = ['_'.join(pair) for pair in family_frame.columns]
    agg_frames.append(family_frame)

X = pd.concat(agg_frames, axis=1)

# Drop the intermediates so that only the final feature matrix stays in memory.
del agg_plan, by_customer, agg_frames, family_cols, family_funcs, family_frame
_ = gc.collect()

print('피처 엔지니어링 후', X.shape)

피처 엔지니어링 전 (458913, 188)
피처 엔지니어링 후 (458913, 576)
CPU times: user 6.41 s, sys: 397 ms, total: 6.81 s
Wall time: 6.81 s


# Preview the engineered training features.
X.head()


%%time 

print('피처 엔지니어링 전', test.shape)

# One entry per feature family: (source columns, aggregations to apply).
# The order of the entries and of the aggregations fixes the column order of
# the resulting matrix, so it must mirror the training cell exactly. In
# particular the D_* categoricals use 'last' before 'first', as in training;
# the previous 'first'-before-'last' order produced test columns in a different
# order from the matrix the model is fitted on.
test_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

test_by_customer = test.groupby("customer_ID")
test_agg_frames = []
for test_family_cols, test_family_funcs in test_agg_plan:
    test_family_frame = test_by_customer[test_family_cols].agg(test_family_funcs)
    # Flatten ('D_39', 'mean') into 'D_39_mean'.
    test_family_frame.columns = ['_'.join(pair) for pair in test_family_frame.columns]
    test_agg_frames.append(test_family_frame)

test = pd.concat(test_agg_frames, axis=1)

# Drop the intermediates so that only the final feature matrix stays in memory.
del (test_agg_plan, test_by_customer, test_agg_frames,
     test_family_cols, test_family_funcs, test_family_frame)
_ = gc.collect()

print('피처 엔지니어링 후', test.shape)

피처 엔지니어링 전 (924621, 188)
피처 엔지니어링 후 (924621, 576)
CPU times: user 12.7 s, sys: 704 ms, total: 13.5 s
Wall time: 13.5 s


# Keyword arguments forwarded to XGBClassifier.
xgb_parms = {
    # Booster type and overall size of the ensemble.
    'booster': 'dart',
    'n_jobs': 4,
    'n_estimators': 1000,

    # L2 / L1 regularisation strength.
    'lambda': 4.091409953463271e-08,
    'alpha': 3.6353429991712695e-08,

    # Row and column subsampling.
    'subsample': 0.6423675532438815,
    'colsample_bytree': 0.7830450413657872,

    # Tree shape and learning rate.
    'max_depth': 9,
    'min_child_weight': 5,
    'eta': 0.3749337530972536,
    'gamma': 0.0745370910451703,
    'grow_policy': 'depthwise',

    # DART dropout settings.
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.0723975209176045,
    'skip_drop': 0.9026367296518939,
}


# Hold out 25% of the customers for validation, stratified on the target.
# - X comes out of a groupby (sorted by customer_ID) while y still follows the
#   row order of `train`; selecting y by X's index pairs each feature row with
#   its own label instead of relying on both happening to share an order.
# - A fixed random_state makes the split, and every score derived from it,
#   reproducible between runs.
y_for_split = y.loc[X.index]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y_for_split, test_size=0.25, stratify=y_for_split, random_state=42
)
del y_for_split


%%time

# Train the classifier, stopping once the validation metric has not improved
# for 10 consecutive rounds.
# early_stopping_rounds is configured on the estimator rather than passed to
# fit(): the fit() argument is deprecated in the xgboost release this notebook
# ran on (the printed estimator repr lists early_stopping_rounds as an
# estimator parameter) and has been removed from fit() in later releases.
# NOTE(review): the same hold-out set drives early stopping and the evaluation
# below, so the reported validation scores are somewhat optimistic.
my_model = XGBClassifier(**xgb_parms, early_stopping_rounds=10)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=1)

[0]	validation_0-logloss:0.48592
[1]	validation_0-logloss:0.38724
[2]	validation_0-logloss:0.33091
[3]	validation_0-logloss:0.29680
[4]	validation_0-logloss:0.27552
[5]	validation_0-logloss:0.26194
[6]	validation_0-logloss:0.25311
[7]	validation_0-logloss:0.24713
[8]	validation_0-logloss:0.24322
[9]	validation_0-logloss:0.24069
[10]	validation_0-logloss:0.23882
[11]	validation_0-logloss:0.23753
[12]	validation_0-logloss:0.23674
[13]	validation_0-logloss:0.23617
[14]	validation_0-logloss:0.23580
[15]	validation_0-logloss:0.23595
[16]	validation_0-logloss:0.23591
[17]	validation_0-logloss:0.23584
[18]	validation_0-logloss:0.23579
[19]	validation_0-logloss:0.23586
[20]	validation_0-logloss:0.23593
[21]	validation_0-logloss:0.23604
[22]	validation_0-logloss:0.23632
[23]	validation_0-logloss:0.23646
[24]	validation_0-logloss:0.23649
[25]	validation_0-logloss:0.23686
[26]	validation_0-logloss:0.23718
[27]	validation_0-logloss:0.23733
CPU times: user 30min 52s, sys: 2.95 s, total: 30min 55s
Wall time: 7min 56s

XGBClassifier(alpha=3.6353429991712695e-08, base_score=0.5, booster='dart',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.7830450413657872, early_stopping_rounds=None,
              enable_categorical=False, eta=0.3749337530972536,
              eval_metric=None, gamma=0.0745370910451703, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', lambda=4.091409953463271e-08,
              learning_rate=0.374933749, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=5,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=4, normalize_type='tree', num_parallel_tree=1, ...)


# Hard class predictions for the hold-out customers.
pred_val = my_model.predict(X_valid)


# Per-class precision / recall / F1 on the hold-out set.
cf = classification_report(y_true=y_valid, y_pred=pred_val)
print(cf)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     85022
           1       0.80      0.79      0.80     29707

    accuracy                           0.90    114729
   macro avg       0.86      0.86      0.86    114729
weighted avg       0.89      0.90      0.90    114729


# Confusion matrix of the hold-out predictions (rows: truth, columns: predicted).
cm = confusion_matrix(y_valid, pred_val)

# Draw the counts as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

Text(69.0, 0.5, 'Truth')


# Class probabilities for every test customer; one column per class.
pred_test = my_model.predict_proba(test)


# Keep the probability of class 1 as a plain NumPy array.
preds = pd.DataFrame(pred_test)
pred_final = preds.loc[:, 1].to_numpy()
pred_final

array([0.03312937, 0.00237539, 0.0565865 , ..., 0.6953294 , 0.31301746,
       0.10683217], dtype=float32)


submission = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")


# Attach the predictions by customer_ID instead of by row position.
# `test` is indexed by the label-encoded customer_ID and `lab` was last fitted
# on the test IDs, so inverse_transform recovers the original ID behind every
# entry of `pred_final`; mapping on that ID stays correct even if the rows of
# the sample file and of `test` are ordered differently.
pred_by_customer = pd.Series(pred_final, index=lab.inverse_transform(test.index))
submission['prediction'] = submission['customer_ID'].map(pred_by_customer)
if submission['prediction'].isna().any():
    # Fail loudly rather than write a file with empty predictions.
    raise ValueError("sample_submission contains customer_IDs without a prediction")
submission


submission.to_csv("submission.csv",index=False)

파라미터 이름	상세 설명	기타
booster	사용할 Booster (gbtree, gblinear, dart)	default = gbtree. gbtree, dart: 트리 기반 모델, gblinear: 선형 모델
n_estimators	사용할 트리의 개수(부스팅 라운드 수)	default=100
subsample	학습 인스턴스의 하위 샘플 비율.0.5로 설정시, XGBoost가 나무를 성장시키기 전에 학습 데이터의 절반을 무작위로 샘플링.	default=1
max_depth	나무의 최대 깊이. 트리가 깊을수록 메모리 소비가 크다.	default=6
min_child_weight	자식에게 필요한 인스턴스 가중치의 최소 합계. min_child_weight가 클수록 알고리즘이 더 보수적	default=1
eta	과적합을 방지하기 위해 업데이트에 사용되는 단계 크기 축소	default=0.3, alias: learning_rate
lambda	가중치에 대한 L2 정규화 항. 커지면 보수적.	default=1 (reg_lambda)
alpha	가중치에 대한 L1 정규화 항. 커지면 보수적.	default=0 (reg_alpha)
gamma	트리의 리프 노드에서 추가 파티션을 만드는데 필요한 최소 손실 감소. 클수록 더 보수적.	default=0, alias: min_split_loss
grow_policy	depthwise : 루트에 가장 가까운 노드에서 분할. lossguide : 손실 변화가 가장 큰 노드에서 분할.	default=depthwise
sample_type	샘플링 알고리즘 타입	default='uniform'
normalize_type	정규화 알고리즘의 유형	default='tree'
rate_drop	드롭아웃 비율(드롭아웃 동안 드롭할 이전 트리의 일부)	default=0.0
skip_drop	드롭아웃을 건너뛸 확률.	range[0.0, 1.0]

	customer_ID	prediction
0	00000469ba478561f23a92a868bd366de6f6527a684c9a...	0.033129
1	00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...	0.002375
2	0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...	0.056587
3	00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...	0.318178
4	00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...	0.908346
...	...	...
924616	ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...	0.025777
924617	ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...	0.715930
924618	ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...	0.695329
924619	ffffddef1fc3643ea179c93245b68dca0f36941cd83977...	0.313017
924620	fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...	0.106832

American Express - Default Prediction¶

참조 URL 링크¶

학습 목표¶

목차

대회 이해하기¶

01. 라이브러리 불러오기

02. 데이터 불러오기

03. customer_ID 컬럼의 라벨 인코딩 수행 및 인덱스 지정

04. 테스트 데이터 불러오기

05. test 데이터 셋도 customer_ID로 라벨 인코딩

06. 데이터 나누기 및 결측치 처리

07. 범주형과 수치형 컬럼을 나누기

08. 변수 구분

종류별 피처의 개수¶

09. 변수별 컬럼명 확인 및 새로운 변수 생성

각 구분별 피처 생성¶

10. xgboost 모델 파라미터 설정

11. 데이터 나누기 및 학습, 평가

12. 혼동 행렬을 이용한 시각화

13. 예측을 수행(1을 예측할 확률), 제출

	customer_ID	S_2	P_2	D_39	B_1	B_2	R_1	S_3	B_3	...	D_137	D_138	D_142	D_144
0	0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...	2017-03-09	0.938469	0	0.008724	1.006838	0.009228	0.124035	0.004709	...	-1	-1	NaN	0.000610
1	0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...	2017-04-07	0.936665	0	0.004923	1.000653	0.006151	0.126750	0.002714	...	-1	-1	NaN	0.005492
2	0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...	2017-05-28	0.954180	3	0.021655	1.009672	0.006815	0.123977	0.009423	...	-1	-1	NaN	0.006986

	D_39_mean	D_39_min	D_39_last	D_41_mean	D_41_min	D_41_last	D_42_mean	D_42_min	D_42_last	D_43_mean	...	D_126_count	D_126_last	D_126_first	D_126_nunique	B_30_count	B_30_last	B_30_nunique	B_38_count	B_38_last	B_38_nunique
customer_ID
0	0.0	0	0	0.0	0.0	0.0	-123.0	-123.0	-123.0	-123.000000	...	1	2	2	1	1	0	1	1	2	1
1	6.0	6	6	0.0	0.0	0.0	-123.0	-123.0	-123.0	0.060646	...	1	2	2	1	1	0	1	1	2	1
2	0.0	0	0	0.0	0.0	0.0	-123.0	-123.0	-123.0	-123.000000	...	1	2	2	1	1	0	1	1	1	1
3	0.0	0	0	0.0	0.0	0.0	-123.0	-123.0	-123.0	0.046104	...	1	2	2	1	1	0	1	1	2	1
4	0.0	0	0	0.0	0.0	0.0	-123.0	-123.0	-123.0	0.044671	...	1	2	2	1	1	0	1	1	1	1