import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from lightgbm import early_stopping, log_evaluation
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")
NUM_FOLDS = 5
%%time
df_train = pd.read_parquet("/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet")
# convert S_2 to datetime
df_train["S_2"] = pd.to_datetime(df_train["S_2"])
df_train["days"] = (df_train["S_2"] - df_train.groupby(["customer_ID"])["S_2"].transform("min")).dt.days.astype("int16") + 1
# float32 -> float16
for col in df_train.columns[df_train.dtypes == "float32"]:
    df_train[col] = df_train[col].astype("float16")
CPU times: user 33.1 s, sys: 28.9 s, total: 1min 2s Wall time: 1min 3s
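To make the `days` feature concrete, a small toy example (hypothetical data, not the competition files): `transform("min")` broadcasts each customer's earliest statement date back onto all of that customer's rows, so `days` counts days elapsed since the first statement, with the first statement as day 1.

import pandas as pd

toy = pd.DataFrame({
    "customer_ID": ["a", "a", "b"],
    "S_2": pd.to_datetime(["2018-01-01", "2018-02-15", "2018-03-01"]),
})
# days since each customer's first statement; the first statement is day 1
toy["days"] = (toy["S_2"] - toy.groupby("customer_ID")["S_2"].transform("min")).dt.days + 1
print(toy["days"].tolist())  # [1, 46, 1]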
gc.collect()
68
print( df_train.shape)
print( df_train['customer_ID'].value_counts().shape )
print( df_train['customer_ID'].value_counts() )
print( )
(5531451, 191)
(458913,)
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a    13
a3111280bfa1ed8fafd0b06839eb707f4538497e8087cb62958bb03e1bdde214    13
a31376930229162f886c091e5a56a528f81c10a523285828ed05a6e9ccf56722    13
a312c595dfaee96c8a597107d2754a49b1acfd127400d98991762d87837b1b65    13
a312aff722e7230f9d6a313ff777d6f00166c6bada21a333982426758a2e2a9d    13
                                                                    ..
a84839802f1f37a86a7fe34ddba4791d33d878df3937b509841def0a9e252748     1
01f4f7b14d83b6a8f88e4355279224615da083b19e3e5f15b98f274ced8cf752     1
eef07ea56302cebcd57374c6565bb3e5c7af856796d9cbc31ed42aa0fc73b7fc     1
d192480082e86e3b4da68f014b284f2a2624b45956eed279416c796de043b7ce     1
d9ea3cffff889b522a69bde89aee382dcff8bffe32c9a38653bdaa2ff4330041     1
Name: customer_ID, Length: 458913, dtype: int64
df_train.groupby(["customer_ID"]).tail(1)
| | customer_ID | S_2 | P_2 | D_39 | B_1 | B_2 | R_1 | S_3 | D_41 | B_3 | ... | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 | days |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2018-03-13 | 0.934570 | 0 | 0.009384 | 1.007812 | 0.006104 | 0.135010 | 0.0 | 0.007175 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.002970 | 0 | 370 |
25 | 00000fd6641609c6ece5454664794f0340ad84dddce9a2... | 2018-03-25 | 0.880371 | 6 | 0.034698 | 1.003906 | 0.006912 | 0.165527 | 0.0 | 0.005070 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.003170 | 0 | 390 |
38 | 00001b22f846c82c51f6e3958ccd81970162bae8b007e8... | 2018-03-12 | 0.880859 | 0 | 0.004284 | 0.812500 | 0.006451 | NaN | 0.0 | 0.007195 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.000834 | 0 | 367 |
51 | 000041bdba6ecadd89a52d11886e8eaaec9325906c9723... | 2018-03-29 | 0.621582 | 0 | 0.012566 | 1.005859 | 0.007828 | 0.287842 | 0.0 | 0.009941 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.005558 | 0 | 364 |
64 | 00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a... | 2018-03-30 | 0.872070 | 0 | 0.007679 | 0.815918 | 0.001247 | NaN | 0.0 | 0.005527 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.006943 | 0 | 366 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5531398 | ffff41c8a52833b56430603969b9ca48d208e7c192c6a4... | 2018-03-31 | 0.844238 | 15 | 0.028519 | 1.009766 | 0.001928 | 0.128662 | 0.0 | 0.005894 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.003010 | 0 | 377 |
5531411 | ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd... | 2018-03-22 | 0.831055 | 1 | 0.292480 | 0.055664 | 0.006954 | NaN | 0.0 | 0.233032 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.009232 | 0 | 366 |
5531424 | ffff9984b999fccb2b6127635ed0736dda94e544e67e02... | 2018-03-07 | 0.800293 | 9 | 0.020569 | 1.006836 | 0.000957 | 0.066650 | 0.0 | 0.006313 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.000340 | 0 | 357 |
5531437 | ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814... | 2018-03-23 | 0.753906 | 0 | 0.015839 | 0.714355 | 0.000993 | 0.408936 | 0.0 | 0.050049 | ... | -1 | -1 | 1 | 0 | 0.949707 | 0.446289 | 1 | 0.002502 | 2 | 364 |
5531450 | fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea... | 2018-03-14 | 0.981934 | 0 | 0.000077 | 0.992676 | 0.000809 | 0.119141 | 0.0 | 0.014091 | ... | -1 | -1 | 0 | 0 | 0.000000 | NaN | 0 | 0.003183 | 0 | 360 |
458913 rows × 191 columns
df_train = df_train.groupby(["customer_ID"]).tail(1).set_index('customer_ID')
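`tail(1)` keeps each customer's last row, which is the most recent statement because the parquet is date-sorted within each customer. A minimal sketch (toy data, not from the notebook) showing it matches an order-independent pick of the latest `S_2`:

import pandas as pd

toy = pd.DataFrame({
    "customer_ID": ["a", "a", "b"],
    "S_2": pd.to_datetime(["2018-01-01", "2018-02-15", "2018-03-01"]),
})
last_by_tail = toy.groupby("customer_ID").tail(1)                    # last row per customer
last_by_date = toy.loc[toy.groupby("customer_ID")["S_2"].idxmax()]   # row with latest S_2
assert last_by_tail.equals(last_by_date)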
%%time
df_train_labels = pd.read_csv("/kaggle/input/amex-default-prediction/train_labels.csv")
df_train_labels["target"] = df_train_labels["target"].astype("int8")
print(df_train_labels.shape)
df_train_labels.head()
(458913, 2)
CPU times: user 495 ms, sys: 126 ms, total: 620 ms
Wall time: 1.03 s
| | customer_ID | target |
|---|---|---|
0 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 0 |
1 | 00000fd6641609c6ece5454664794f0340ad84dddce9a2... | 0 |
2 | 00001b22f846c82c51f6e3958ccd81970162bae8b007e8... | 0 |
3 | 000041bdba6ecadd89a52d11886e8eaaec9325906c9723... | 0 |
4 | 00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a... | 0 |
%%time
df_train = df_train.merge(df_train_labels, on="customer_ID", how='left')
print(df_train.shape)
print(df_train.head())
del df_train_labels
gc.collect()
(458913, 192)
                                         customer_ID        S_2       P_2  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2018-03-13  0.934570
1  00000fd6641609c6ece5454664794f0340ad84dddce9a2... 2018-03-25  0.880371
2  00001b22f846c82c51f6e3958ccd81970162bae8b007e8... 2018-03-12  0.880859
3  000041bdba6ecadd89a52d11886e8eaaec9325906c9723... 2018-03-29  0.621582
4  00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a... 2018-03-30  0.872070

   D_39       B_1       B_2       R_1       S_3  D_41       B_3  ...  D_138  \
0     0  0.009384  1.007812  0.006104  0.135010   0.0  0.007175  ...     -1
1     6  0.034698  1.003906  0.006912  0.165527   0.0  0.005070  ...     -1
2     0  0.004284  0.812500  0.006451       NaN   0.0  0.007195  ...     -1
3     0  0.012566  1.005859  0.007828  0.287842   0.0  0.009941  ...     -1
4     0  0.007679  0.815918  0.001247       NaN   0.0  0.005527  ...     -1

   D_139  D_140  D_141  D_142  D_143     D_144  D_145  days  target
0      0      0    0.0    NaN      0  0.002970      0   370       0
1      0      0    0.0    NaN      0  0.003170      0   390       0
2      0      0    0.0    NaN      0  0.000834      0   367       0
3      0      0    0.0    NaN      0  0.005558      0   364       0
4      0      0    0.0    NaN      0  0.006943      0   366       0

[5 rows x 192 columns]
CPU times: user 1.01 s, sys: 77 ms, total: 1.09 s
Wall time: 1.09 s
0
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793/notebook
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    # D: default capture rate within the top 4% of predictions,
    # with non-defaulters down-weighted 20x
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:, 0] == 0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(labels[:, 0])
    # G: weighted Gini, normalized by the Gini of a perfect ordering
    gini = [0, 0]
    for i in [1, 0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] * weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1] / gini[0] + top_four)
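This is the competition metric M = 0.5 * (G + D): normalized weighted Gini plus the default rate captured in the top 4%. A quick sanity check on synthetic data (not part of the original notebook): with roughly 26% positives, a perfect ranking fits every defaulter inside the weighted top 4%, so the score should be 1.0, and reversing the scores should land far lower, around -0.5.

import numpy as np

rng = np.random.default_rng(0)
y = (rng.random(100_000) < 0.26).astype(int)   # ~26% positives, as in train
sep = y + 0.1 * rng.random(100_000)            # positives strictly above negatives
print(amex_metric_mod(y, sep))    # 1.0 for a perfect ranking
print(amex_metric_mod(y, -sep))   # roughly -0.5 for a fully reversed ranking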
FEATURES = df_train.columns.drop(["target","customer_ID","S_2"])
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cat_col = []
n = 0
for col in df_train[FEATURES]:
    for coll in categorical_cols:
        if col == coll:
            cat_col.append(n)
            break
    n += 1
cat_col
[51, 52, 58, 60, 103, 143, 153, 155, 156, 159, 165]
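The nested loop above just recovers the positional indices of the categorical columns inside FEATURES; positions are needed because the model is fit on bare numpy arrays, where column names are lost. An equivalent one-liner:

cat_col = [i for i, col in enumerate(FEATURES) if col in categorical_cols]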
params = {}
feature_importances = []
scores = []
models = []
pred_val=[]
yval=[]
# cross-validation splitter - yields (train indices, validation indices) per fold
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)
list( enumerate(skf.split(df_train[FEATURES],df_train["target"])) )
[(0, (array([     0,      1,      2, ..., 458910, 458911, 458912]),
      array([     4,      7,      9, ..., 458889, 458892, 458896]))),
 (1, (array([     0,      2,      3, ..., 458910, 458911, 458912]),
      array([     1,      5,     12, ..., 458888, 458906, 458909]))),
 (2, (array([     1,      2,      3, ..., 458909, 458911, 458912]),
      array([     0,      6,     25, ..., 458904, 458907, 458910]))),
 (3, (array([     0,      1,      3, ..., 458909, 458910, 458911]),
      array([     2,      8,     10, ..., 458905, 458908, 458912]))),
 (4, (array([     0,      1,      2, ..., 458909, 458910, 458912]),
      array([     3,     18,     19, ..., 458902, 458903, 458911])))]
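StratifiedKFold keeps the class ratio (about 25.9% defaults) the same in every fold, which the per-fold count prints in the training cell below confirm; a quick check (sketch, not in the original):

for fold, (_, val_idx) in enumerate(skf.split(df_train[FEATURES], df_train["target"])):
    print(fold, round(df_train["target"].iloc[val_idx].mean(), 4))  # ~0.2589 each fold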
%%time
params = {}
feature_importances = []  # per-fold feature importances
scores = []               # per-fold scores
models = []               # fitted models per fold
pred_val = []             # out-of-fold predictions
yval = []                 # out-of-fold targets
# cross-validation splitter
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)
# split the data fold by fold
for fold, (train_idx, val_idx) in enumerate(skf.split(df_train[FEATURES], df_train["target"])):
    print('FOLD:', fold)
    # split into train / validation sets
    X_train = df_train.loc[train_idx, FEATURES].values
    y_train = df_train.loc[train_idx, 'target'].values
    X_val = df_train.loc[val_idx, FEATURES].values
    y_val = df_train.loc[val_idx, 'target'].values
    print("y_train t=0 count:", len(y_train[y_train == 0]))
    print("y_train t=1 count:", len(y_train[y_train == 1]))
    print("y_val t=0 count:", len(y_val[y_val == 0]))
    print("y_val t=1 count:", len(y_val[y_val == 1]))
    params = {
        "num_iterations": 10000,
        'learning_rate': 0.05,
    }
    # LightGBM classifier; log every 100 rounds, stop after 100 rounds
    # without validation improvement
    model = lgbm.LGBMClassifier(**params).fit(
        X_train, y_train,
        eval_set=[(X_val, y_val), (X_train, y_train)],
        callbacks=[early_stopping(100), log_evaluation(100)],
        categorical_feature=cat_col,
    )
    # feature importances
    feature_importances.append(model.feature_importances_)
    models.append(model)
    pred_val = np.append(pred_val, model.predict_proba(X_val)[:, 1])
    yval = np.append(yval, y_val)
    del X_train, y_train, X_val, y_val, model
    gc.collect()
score = amex_metric_mod(yval, pred_val)
print('score:', score)
with open("score.txt", "a") as f:
    f.write(str(score))
FOLD: 0
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN  (repeated)
Training until validation scores don't improve for 100 rounds
[100]  training's binary_logloss: 0.227728  valid_0's binary_logloss: 0.234454
[200]  training's binary_logloss: 0.21466   valid_0's binary_logloss: 0.225913
[300]  training's binary_logloss: 0.207758  valid_0's binary_logloss: 0.223877
[400]  training's binary_logloss: 0.202529  valid_0's binary_logloss: 0.223334
[500]  training's binary_logloss: 0.19774   valid_0's binary_logloss: 0.223063
[600]  training's binary_logloss: 0.193427  valid_0's binary_logloss: 0.222929
[700]  training's binary_logloss: 0.189213  valid_0's binary_logloss: 0.222829
[800]  training's binary_logloss: 0.185204  valid_0's binary_logloss: 0.222752
[900]  training's binary_logloss: 0.181626  valid_0's binary_logloss: 0.222781
Early stopping, best iteration is:
[844]  training's binary_logloss: 0.183684  valid_0's binary_logloss: 0.222714
FOLD: 1
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN  (repeated)
Training until validation scores don't improve for 100 rounds
[100]  training's binary_logloss: 0.22796   valid_0's binary_logloss: 0.233707
[200]  training's binary_logloss: 0.21483   valid_0's binary_logloss: 0.225265
[300]  training's binary_logloss: 0.207848  valid_0's binary_logloss: 0.223467
[400]  training's binary_logloss: 0.202537  valid_0's binary_logloss: 0.223081
[500]  training's binary_logloss: 0.197747  valid_0's binary_logloss: 0.222808
[600]  training's binary_logloss: 0.193478  valid_0's binary_logloss: 0.222646
[700]  training's binary_logloss: 0.189421  valid_0's binary_logloss: 0.222571
[800]  training's binary_logloss: 0.185549  valid_0's binary_logloss: 0.22242
[900]  training's binary_logloss: 0.181787  valid_0's binary_logloss: 0.222353
[1000] training's binary_logloss: 0.178203  valid_0's binary_logloss: 0.222323
[1100] training's binary_logloss: 0.174714  valid_0's binary_logloss: 0.222343
Early stopping, best iteration is:
[1039] training's binary_logloss: 0.176822  valid_0's binary_logloss: 0.222275
FOLD: 2
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN  (repeated)
Training until validation scores don't improve for 100 rounds
[100]  training's binary_logloss: 0.228552  valid_0's binary_logloss: 0.230742
[200]  training's binary_logloss: 0.215559  valid_0's binary_logloss: 0.222494
[300]  training's binary_logloss: 0.20872   valid_0's binary_logloss: 0.220646
[400]  training's binary_logloss: 0.203427  valid_0's binary_logloss: 0.220264
[500]  training's binary_logloss: 0.198725  valid_0's binary_logloss: 0.220085
[600]  training's binary_logloss: 0.194203  valid_0's binary_logloss: 0.220071
[700]  training's binary_logloss: 0.190108  valid_0's binary_logloss: 0.219981
[800]  training's binary_logloss: 0.186118  valid_0's binary_logloss: 0.219856
[900]  training's binary_logloss: 0.182398  valid_0's binary_logloss: 0.219868
Early stopping, best iteration is:
[854]  training's binary_logloss: 0.184117  valid_0's binary_logloss: 0.219801
FOLD: 3
y_train t=0 count: 272068
y_train t=1 count: 95063
y_val t=0 count: 68017
y_val t=1 count: 23765
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN  (repeated)
Training until validation scores don't improve for 100 rounds
[100]  training's binary_logloss: 0.228169  valid_0's binary_logloss: 0.232889
[200]  training's binary_logloss: 0.215069  valid_0's binary_logloss: 0.22476
[300]  training's binary_logloss: 0.208194  valid_0's binary_logloss: 0.223038
[400]  training's binary_logloss: 0.202907  valid_0's binary_logloss: 0.222563
[500]  training's binary_logloss: 0.198291  valid_0's binary_logloss: 0.222378
[600]  training's binary_logloss: 0.193936  valid_0's binary_logloss: 0.222334
[700]  training's binary_logloss: 0.189832  valid_0's binary_logloss: 0.222225
Early stopping, best iteration is:
[683]  training's binary_logloss: 0.190511  valid_0's binary_logloss: 0.222218
FOLD: 4
y_train t=0 count: 272068
y_train t=1 count: 95063
y_val t=0 count: 68017
y_val t=1 count: 23765
[LightGBM] [Warning] Met negative value in categorical features, will convert it to NaN  (repeated)
Training until validation scores don't improve for 100 rounds
[100]  training's binary_logloss: 0.227779  valid_0's binary_logloss: 0.233671
[200]  training's binary_logloss: 0.214614  valid_0's binary_logloss: 0.225815
[300]  training's binary_logloss: 0.207545  valid_0's binary_logloss: 0.224018
[400]  training's binary_logloss: 0.202383  valid_0's binary_logloss: 0.223598
[500]  training's binary_logloss: 0.197574  valid_0's binary_logloss: 0.223389
[600]  training's binary_logloss: 0.193204  valid_0's binary_logloss: 0.223233
[700]  training's binary_logloss: 0.189147  valid_0's binary_logloss: 0.223266
Early stopping, best iteration is:
[606]  training's binary_logloss: 0.19293   valid_0's binary_logloss: 0.223223
score: 0.785467448595047
CPU times: user 32min 56s, sys: 4.86 s, total: 33min 1s
Wall time: 8min 34s
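Note that `scores` is initialized but never filled, and the single printed score is an out-of-fold estimate: every row in `pred_val` was predicted by the one model that did not see it during training. If per-fold scores are wanted as well, a minimal sketch of lines that could sit inside the loop just before the `del` (not in the original):

    # per-fold amex metric on this fold's validation set (sketch)
    fold_pred = model.predict_proba(X_val)[:, 1]
    scores.append(amex_metric_mod(y_val, fold_pred))
    print("fold", fold, "amex metric:", scores[-1])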
del df_train,train_idx,val_idx,yval,pred_val
gc.collect()
21
len(feature_importances[0])
feature_importances
[array([876, 324, 237, 279, 290, 479, ..., 235, 35, 170], dtype=int32),
 array([986, 374, 286, 309, 382, 588, ..., 303, 41, 201], dtype=int32),
 array([861, 361, 229, 273, 290, 479, ..., 216, 28, 148], dtype=int32),
 array([744, 304, 206, 188, 253, 445, ..., 160, 35, 132], dtype=int32),
 array([776, 300, 177, 172, 222, 400, ..., 142, 17, 76], dtype=int32)]
df_feat_imp = pd.DataFrame(index=FEATURES)
# one importance column per fold
for i, imp in enumerate(feature_importances):
    df_feat_imp[f"imp{i}"] = imp
df_feat_imp["mean_imp"] = df_feat_imp.mean(axis=1).values
df_feat_imp = df_feat_imp.sort_values(by="mean_imp",ascending=False)
df_feat_imp.to_csv("feat_imp.csv")
fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(x=df_feat_imp.index,y=df_feat_imp["mean_imp"])
plt.xticks([])
print(df_feat_imp)
#del df_feat_imp, feature_importances
#gc.collect()
       imp0  imp1  imp2  imp3  imp4  mean_imp
P_2     876   986   861   744   776     848.6
D_43    555   642   502   430   378     501.4
D_46    542   591   463   415   399     482.0
S_3     479   588   479   445   400     478.2
B_4     416   467   458   375   372     417.6
...     ...   ...   ...   ...   ...       ...
D_109     0     0     0     0     0       0.0
R_18      0     0     0     0     0       0.0
D_137     0     0     0     0     0       0.0
D_143     0     0     0     0     0       0.0
R_23      0     0     0     0     0       0.0

[189 rows x 6 columns]
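`feature_importances_` on LGBMClassifier counts splits by default; gain-based importance often ranks features differently. A sketch (not in the original) pulling total gain from the same fitted boosters, averaged over the folds:

gain_imp = np.mean(
    [m.booster_.feature_importance(importance_type="gain") for m in models],
    axis=0,
)
print(pd.Series(gain_imp, index=FEATURES).sort_values(ascending=False).head(10))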
df_test = pd.read_parquet("/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet")
print("convert float32 columns to float16")
for col in df_test.columns[df_test.dtypes == "float32"]:
    df_test[col] = df_test[col].astype("float16")
print("date and time")
df_test["S_2"] = pd.to_datetime(df_test["S_2"])
df_test["days"] = (df_test["S_2"] - df_test.groupby(["customer_ID"])["S_2"].transform("min")).dt.days.astype("int16") + 1
print("grouping")
df_test = df_test.groupby(["customer_ID"]).tail(1).set_index('customer_ID')
convert float32 columns to float16
date and time
grouping
print("prediction")
pred=[]
for fold in range(5):
print('FOLD:',fold)
if len(pred)==0:
pred = models[fold].predict_proba(df_test.drop(["S_2"],axis=1))[:,1]
else:
pred += models[fold].predict_proba(df_test.drop(["S_2"],axis=1))[:,1]
pred = pred/5
prediction
FOLD: 0
FOLD: 1
FOLD: 2
FOLD: 3
FOLD: 4
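The accumulate-then-divide loop is just a mean over the five fold models; an equivalent form (a sketch, not in the original) that also avoids re-dropping `S_2` on every iteration, at the cost of holding all five prediction vectors in memory at once:

X_test = df_test.drop(["S_2"], axis=1)
pred = np.mean([m.predict_proba(X_test)[:, 1] for m in models], axis=0)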
subm = pd.read_csv("/kaggle/input/amex-default-prediction/sample_submission.csv")
subm["prediction"] = pred
subm.to_csv("submission.csv", index=False)
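Assigning `pred` positionally relies on `sample_submission.csv` listing customers in the same order as `df_test.groupby(...).tail(1)` left them. Aligning by `customer_ID` is the safer pattern; a sketch (not in the original; `df_test` is indexed by customer_ID at this point):

# Map predictions onto submission rows by customer_ID rather than by position.
subm["prediction"] = subm["customer_ID"].map(pd.Series(pred, index=df_test.index))
assert subm["prediction"].notna().all()  # every customer received a prediction
subm.to_csv("submission.csv", index=False)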