대회 평가 지표 : M = 0.5 * (G + D)  (G: 정규화된 지니 계수, D: 상위 4% 구간에서의 부도 포착률)
평가 파일
주어진 데이터 셋 - 총 50.31GB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import warnings, gc
warnings.filterwarnings("ignore")
%%time
# Read the training feature table and the label table, then attach the labels
# to the feature rows through an inner join on customer_ID.
train = pd.read_parquet(
    "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
)
label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train = pd.merge(train, label, on="customer_ID", how="inner")
CPU times: user 2min, sys: 16.8 s, total: 2min 17s Wall time: 2min 24s
# Sanity check of the merged frame: its dimensions, then its first three rows.
print(train.shape)
train.iloc[:3]
(5531451, 191)
customer_ID | S_2 | P_2 | D_39 | B_1 | B_2 | R_1 | S_3 | D_41 | B_3 | ... | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-03-09 | 0.938469 | 0 | 0.008724 | 1.006838 | 0.009228 | 0.124035 | 0.0 | 0.004709 | ... | -1 | -1 | 0 | 0 | 0.0 | NaN | 0 | 0.000610 | 0 | 0 |
1 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-04-07 | 0.936665 | 0 | 0.004923 | 1.000653 | 0.006151 | 0.126750 | 0.0 | 0.002714 | ... | -1 | -1 | 0 | 0 | 0.0 | NaN | 0 | 0.005492 | 0 | 0 |
2 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-05-28 | 0.954180 | 3 | 0.021655 | 1.009672 | 0.006815 | 0.123977 | 0.0 | 0.009423 | ... | -1 | -1 | 0 | 0 | 0.0 | NaN | 0 | 0.006986 | 0 | 0 |
3 rows × 191 columns
D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables
# Replace the customer_ID strings with integer codes produced by a
# LabelEncoder. The encoder object is kept in `lab` and reused for the test
# frame later in the notebook.
lab = LabelEncoder()
train['customer_ID']= lab.fit_transform(train['customer_ID'])
# Preview the encoded column.
train['customer_ID'].head()
0 0 1 0 2 0 3 0 4 0 Name: customer_ID, dtype: int64
%%time
# Keep only the final row of each customer and index the frame by customer_ID.
# NOTE(review): after this step every customer has exactly one row, so the
# per-customer aggregations computed later in this notebook (mean/min/last,
# count/first/nunique) run on one-row groups — confirm that is intended.
train = (
    train.drop_duplicates(subset='customer_ID', keep='last')
    .set_index('customer_ID')
)
CPU times: user 1.08 s, sys: 625 ms, total: 1.7 s Wall time: 1.7 s
# Show the reduced frame's dimensions, preview its first five rows, and run a
# garbage-collection pass.
print(train.shape)
train.iloc[:5]
gc.collect()
(458913, 190)
42
%%time
# Read the test feature table.
test = pd.read_parquet(
    path="../input/amex-data-integer-dtypes-parquet-format/test.parquet"
)
CPU times: user 12.5 s, sys: 11.6 s, total: 24.1 s Wall time: 35.8 s
# Show the test frame's dimensions, preview its first five rows, and run a
# garbage-collection pass.
print(test.shape)
test.iloc[:5]
gc.collect()
(11363762, 190)
42
# Give the test frame the same treatment as the training frame: integer-encode
# customer_ID, keep each customer's final row, index by customer_ID.
# NOTE(review): this re-fits `lab`, so from here on the encoder holds the
# mapping for the test IDs rather than the training IDs.
test['customer_ID'] = lab.fit_transform(test['customer_ID'])
test = test.groupby(['customer_ID']).tail(1).set_index('customer_ID')

# Target vector and feature matrices; the S_2 column is removed from both
# feature frames.
y = train['target']
X = train.drop(columns=["target", "S_2"])
test = test.drop(columns=['S_2'])

# Replace missing values with the sentinel -123 in both feature frames.
X = X.fillna(-123)
test = test.fillna(-123)

# Class counts of the target.
y.value_counts()
0 340085 1 118828 Name: target, dtype: int64
# Print the shapes of the features, the target and the test set on one line,
# then run a garbage-collection pass.
print(*(frame.shape for frame in (X, y, test)))
gc.collect()
(458913, 188) (458913,) (924621, 188)
42
# Categorical columns
cat_cols = [
    'B_30', 'B_38', 'D_63', 'D_64', 'D_66',
    'D_68', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]
# Numeric columns: every other column of X, in frame order
num_cols = list(filter(lambda name: name not in cat_cols, X.columns))
all_cols = [cat_cols, num_cols]
print(all_cols)
[['B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126'], ['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108', 'D_109', 'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145']]
def _starting_with(columns, prefix):
    """Return the names in *columns* that start with *prefix*, in order."""
    return [name for name in columns if name.startswith(prefix)]


# Numeric columns split by the leading letter of the column name
# (D, S, P, B, R).
D_n_cols = _starting_with(num_cols, "D")
S_n_cols = _starting_with(num_cols, "S")
P_n_cols = _starting_with(num_cols, "P")
B_n_cols = _starting_with(num_cols, "B")
R_n_cols = _starting_with(num_cols, "R")

# Categorical columns split the same way; only D and B prefixes are collected.
D_c_cols = _starting_with(cat_cols, "D")
B_c_cols = _starting_with(cat_cols, "B")

# Group sizes: numeric families first, categorical families second.
print(len(D_n_cols), len(S_n_cols), len(P_n_cols), len(B_n_cols), len(R_n_cols))
print(len(D_c_cols), len(B_c_cols))
87 21 3 38 28 9 2
# Trial run of the aggregation on the numeric D_* columns only.
print(X.shape)
X_num_agg_D = (
    X.groupby("customer_ID")[D_n_cols]
    .agg(['mean', 'min', 'last'])
)
(458913, 188)
# Show the aggregated frame's dimensions and its (column, statistic) labels.
print(X_num_agg_D.shape, X_num_agg_D.columns, sep="\n")
(458913, 261) MultiIndex([( 'D_39', 'mean'), ( 'D_39', 'min'), ( 'D_39', 'last'), ( 'D_41', 'mean'), ( 'D_41', 'min'), ( 'D_41', 'last'), ( 'D_42', 'mean'), ( 'D_42', 'min'), ( 'D_42', 'last'), ( 'D_43', 'mean'), ... ('D_142', 'last'), ('D_143', 'mean'), ('D_143', 'min'), ('D_143', 'last'), ('D_144', 'mean'), ('D_144', 'min'), ('D_144', 'last'), ('D_145', 'mean'), ('D_145', 'min'), ('D_145', 'last')], length=261)
# Flatten the two-level column labels into single names such as 'D_39_mean',
# show them, then discard the trial frame and run a garbage-collection pass.
X_num_agg_D.columns = X_num_agg_D.columns.map('_'.join)
print(X_num_agg_D.columns)
del X_num_agg_D
gc.collect()
Index(['D_39_mean', 'D_39_min', 'D_39_last', 'D_41_mean', 'D_41_min', 'D_41_last', 'D_42_mean', 'D_42_min', 'D_42_last', 'D_43_mean', ... 'D_142_last', 'D_143_mean', 'D_143_min', 'D_143_last', 'D_144_mean', 'D_144_min', 'D_144_last', 'D_145_mean', 'D_145_min', 'D_145_last'], dtype='object', length=261)
42
%%time
print('피처 엔지니어링 전', X.shape)

# One (columns, statistics) entry per feature family. The list order fixes the
# left-to-right order of the aggregated columns in the final frame.
_train_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

_train_parts = []
for _cols, _stats in _train_agg_plan:
    _part = X.groupby("customer_ID")[_cols].agg(_stats)
    # Flatten (column, statistic) labels into names such as 'D_39_mean'.
    _part.columns = ['_'.join(x) for x in _part.columns]
    _train_parts.append(_part)

# Put all per-family aggregates side by side; this replaces X.
X = pd.concat(_train_parts, axis=1)

# Drop the intermediates before the next large allocation.
del _train_agg_plan, _train_parts, _part, _cols, _stats
_ = gc.collect()

print('피처 엔지니어링 후', X.shape)
피처 엔지니어링 전 (458913, 188) 피처 엔지니어링 후 (458913, 576) CPU times: user 6.41 s, sys: 397 ms, total: 6.81 s Wall time: 6.81 s
# Preview the first five rows of the aggregated training features.
X.iloc[:5]
D_39_mean | D_39_min | D_39_last | D_41_mean | D_41_min | D_41_last | D_42_mean | D_42_min | D_42_last | D_43_mean | ... | D_126_count | D_126_last | D_126_first | D_126_nunique | B_30_count | B_30_last | B_30_nunique | B_38_count | B_38_last | B_38_nunique | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
customer_ID | |||||||||||||||||||||
0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | -123.0 | -123.0 | -123.0 | -123.000000 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 2 | 1 |
1 | 6.0 | 6 | 6 | 0.0 | 0.0 | 0.0 | -123.0 | -123.0 | -123.0 | 0.060646 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 2 | 1 |
2 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | -123.0 | -123.0 | -123.0 | -123.000000 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
3 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | -123.0 | -123.0 | -123.0 | 0.046104 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 2 | 1 |
4 | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | -123.0 | -123.0 | -123.0 | 0.044671 | ... | 1 | 2 | 2 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
5 rows × 576 columns
%%time
print('피처 엔지니어링 전', test.shape)

# One (columns, statistics) entry per feature family. The list order fixes the
# left-to-right order of the aggregated columns in the final frame.
#
# The statistics for the categorical D_* columns are listed as
# count/last/first/nunique, exactly as in the training aggregation. Listing
# them as count/first/last/nunique here would put the D_*_first and D_*_last
# columns in swapped positions relative to the frame the model is fitted on.
_test_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

_test_parts = []
for _cols, _stats in _test_agg_plan:
    _part = test.groupby("customer_ID")[_cols].agg(_stats)
    # Flatten (column, statistic) labels into names such as 'D_39_mean'.
    _part.columns = ['_'.join(x) for x in _part.columns]
    _test_parts.append(_part)

# Put all per-family aggregates side by side; this replaces test.
test = pd.concat(_test_parts, axis=1)

# Drop the intermediates before the next large allocation.
del _test_agg_plan, _test_parts, _part, _cols, _stats
_ = gc.collect()

# Fail loudly if the test features do not line up with the training features,
# instead of silently feeding misordered columns to the model.
if not test.columns.equals(X.columns):
    raise ValueError(
        "test feature columns differ from training feature columns "
        "(names or order)"
    )

print('피처 엔지니어링 후', test.shape)
피처 엔지니어링 전 (924621, 188) 피처 엔지니어링 후 (924621, 576) CPU times: user 12.7 s, sys: 704 ms, total: 13.5 s Wall time: 13.5 s
파라미터 이름 | 상세 설명 | 기타 |
---|---|---|
booster | 사용할 Booster (gbtree, gblinear, dart) | default = gbtree, gbtree/dart: 트리 기반 모델, gblinear: 선형 모델 |
n_estimators | 사용할 트리의 개수 (부스팅 라운드 수) | default=100 |
subsample | 학습 인스턴스의 하위 샘플 비율.0.5로 설정시, XGBoost가 나무를 성장시키기 전에 학습 데이터의 절반을 무작위로 샘플링. | default=1 |
max_depth | 나무의 최대 깊이. 깊은 트리는 메모리 소비 크다. | default=6 |
min_child_weight | 자식에게 필요한 인스턴스 가중치의 최소 합계. min_child_weight가 클수록 알고리즘이 더 보수적 | default=1 |
eta | 과적합을 방지하기 위해 업데이트에 사용되는 단계 크기 축소 | default=0.3, alias: learning_rate |
lambda | 가중치에 대한 L2 정규화 항. 커지면 보수적. | default=1 (reg_lambda) |
alpha | 가중치에 대한 L1 정규화 항. 커지면 보수적. | default=0 (reg_alpha) |
gamma | 트리의 리프 노드에서 추가 파티션을 만드는데 필요한 최소 손실 감소. 클수록 더 보수적. | default=0, alias: min_split_loss |
grow_policy | depthwise : 루트에 가장 가까운 노드에서 분할. lossguide : 손실 변화가 가장 큰 노드에서 분할. | default=depthwise |
sample_type | 샘플링 알고리즘 타입 | default='uniform' |
normalize_type | 정규화 알고리즘의 유형 | default='tree' |
rate_drop | 드롭아웃 비율(드롭아웃 동안 드롭할 이전 트리의 일부) | default=0.0 |
skip_drop | 드롭아웃을 건너뛸 확률. | range[0.0, 1.0] |
# Hyper-parameters handed to XGBClassifier.
xgb_parms = {
    # Booster type, thread count and number of trees.
    'booster': 'dart',
    'n_jobs': 4,
    'n_estimators': 1000,
    # L2 / L1 regularisation terms on the weights.
    'lambda': 4.091409953463271e-08,
    'alpha': 3.6353429991712695e-08,
    # Row and column sub-sampling ratios.
    'subsample': 0.6423675532438815,
    'colsample_bytree': 0.7830450413657872,
    # Tree shape and split constraints.
    'max_depth': 9,
    'min_child_weight': 5,
    # Step-size shrinkage (learning rate).
    'eta': 0.3749337530972536,
    # Minimum loss reduction needed to split a leaf further.
    'gamma': 0.0745370910451703,
    'grow_policy': 'depthwise',
    # DART-specific settings: sampling / normalisation algorithm and the
    # dropout rate / probability of skipping dropout.
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.0723975209176045,
    'skip_drop': 0.9026367296518939,
}
# Hold out 25% of the customers for validation, stratified on the target so
# both parts keep the same class ratio. The seed is fixed so that the split,
# and every validation number derived from it, is the same on each run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)
%%time
# Build and fit the classifier, stopping once the validation metric has not
# improved for 10 consecutive rounds.
# early_stopping_rounds is given to the constructor rather than to fit():
# passing it to fit() is deprecated from XGBoost 1.6 and no longer accepted
# in 2.0. The estimator repr printed by this notebook already lists
# early_stopping_rounds among the constructor parameters.
my_model = XGBClassifier(**xgb_parms, early_stopping_rounds=10)
my_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=1,
)
[0] validation_0-logloss:0.48592 [1] validation_0-logloss:0.38724 [2] validation_0-logloss:0.33091 [3] validation_0-logloss:0.29680 [4] validation_0-logloss:0.27552 [5] validation_0-logloss:0.26194 [6] validation_0-logloss:0.25311 [7] validation_0-logloss:0.24713 [8] validation_0-logloss:0.24322 [9] validation_0-logloss:0.24069 [10] validation_0-logloss:0.23882 [11] validation_0-logloss:0.23753 [12] validation_0-logloss:0.23674 [13] validation_0-logloss:0.23617 [14] validation_0-logloss:0.23580 [15] validation_0-logloss:0.23595 [16] validation_0-logloss:0.23591 [17] validation_0-logloss:0.23584 [18] validation_0-logloss:0.23579 [19] validation_0-logloss:0.23586 [20] validation_0-logloss:0.23593 [21] validation_0-logloss:0.23604 [22] validation_0-logloss:0.23632 [23] validation_0-logloss:0.23646 [24] validation_0-logloss:0.23649 [25] validation_0-logloss:0.23686 [26] validation_0-logloss:0.23718 [27] validation_0-logloss:0.23733 CPU times: user 30min 52s, sys: 2.95 s, total: 30min 55s Wall time: 7min 56s
XGBClassifier(alpha=3.6353429991712695e-08, base_score=0.5, booster='dart', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7830450413657872, early_stopping_rounds=None, enable_categorical=False, eta=0.3749337530972536, eval_metric=None, gamma=0.0745370910451703, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', lambda=4.091409953463271e-08, learning_rate=0.374933749, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=5, missing=nan, monotone_constraints='()', n_estimators=1000, n_jobs=4, normalize_type='tree', num_parallel_tree=1, ...)
# Predict class labels for the validation set and print the per-class
# precision / recall / F1 report.
pred_val = my_model.predict(X_valid)
cf = classification_report(y_true=y_valid, y_pred=pred_val)
print(cf)
precision recall f1-score support 0 0.93 0.93 0.93 85022 1 0.80 0.79 0.80 29707 accuracy 0.90 114729 macro avg 0.86 0.86 0.86 114729 weighted avg 0.89 0.90 0.90 114729
# Draw the validation confusion matrix as an annotated heatmap with integer
# cell labels.
cm = confusion_matrix(y_valid, pred_val)
plt.figure(figsize=(10, 7))
heat_ax = sns.heatmap(cm, annot=True, fmt='d')
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('Truth')
Text(69.0, 0.5, 'Truth')
# Class probabilities for the test set; column 1 (the second class) is kept
# as the final prediction vector.
pred_test = my_model.predict_proba(test)
preds = pd.DataFrame(pred_test)
pred_final = preds[1].to_numpy(copy=True)
pred_final
array([0.03312937, 0.00237539, 0.0565865 , ..., 0.6953294 , 0.31301746, 0.10683217], dtype=float32)
# Load the sample submission and overwrite its prediction column.
# NOTE(review): predictions are assigned by position, so this assumes the rows
# of sample_submission.csv are in the same customer order as `test` — confirm.
submission = pd.read_csv(
    "../input/amex-default-prediction/sample_submission.csv"
).assign(prediction=pred_final)
submission
customer_ID | prediction | |
---|---|---|
0 | 00000469ba478561f23a92a868bd366de6f6527a684c9a... | 0.033129 |
1 | 00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39... | 0.002375 |
2 | 0000210045da4f81e5f122c6bde5c2a617d03eef67f82c... | 0.056587 |
3 | 00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c... | 0.318178 |
4 | 00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9... | 0.908346 |
... | ... | ... |
924616 | ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c... | 0.025777 |
924617 | ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3... | 0.715930 |
924618 | ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475... | 0.695329 |
924619 | ffffddef1fc3643ea179c93245b68dca0f36941cd83977... | 0.313017 |
924620 | fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61... | 0.106832 |
924621 rows × 2 columns
# Write the submission file without the row index.
submission.to_csv(path_or_buf="submission.csv", index=False)