import polars as pl
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
import dill
import gc
import time
current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("this notebook training time is ", current_time)
class Config:
seed=2024
num_folds=10
TARGET_NAME ='target'
batch_size=1000
import random
def seed_everything(seed):
np.random.seed(seed)
random.seed(seed)
seed_everything(Config.seed)
colname2dtype=pd.read_csv("/kaggle/input/home-credit-inconsistent-data-types/colname2dtype.csv")
colname=colname2dtype['Column'].values
dtype=colname2dtype['DataType'].values
dtype2pl={}
dtype2pl['Int64']=pl.Int64
dtype2pl['Float64']=pl.Float64
dtype2pl['String']=pl.String
dtype2pl['Boolean']=pl.String  # Boolean columns are deliberately read as strings: the raw files encode them inconsistently
colname2dtype={}
for idx in range(len(colname)):
colname2dtype[colname[idx]]=dtype2pl[dtype[idx]]
def find_df_null_col(df,margin=0.975):
    # Return the columns whose null fraction exceeds `margin`.
    cols=[]
for col in df.columns:
if df[col].isna().mean()>margin:
cols.append(col)
return cols
def find_last_case_id(df,id='case_id'):
    # Keep the last row of each case_id block (assumes rows arrive grouped by
    # case_id, as they do in the competition files).
    df_copy=df.clone()
df_tail=df.tail(1)
df_copy=df_copy.with_columns(pl.col(id).shift(-1).alias(f"{id}_shift_-1"))
df_last=df_copy.filter(pl.col(id)-pl.col(f'{id}_shift_-1')!=0).drop(f'{id}_shift_-1')
df_last=pl.concat([df_last,df_tail])
del df_copy,df_tail
gc.collect()
return df_last
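# Quick sanity check (toy data, illustrative only, not part of the pipeline):
# with rows grouped by case_id, find_last_case_id keeps exactly the last row of
# every case.
_toy = pl.DataFrame({"case_id": [1, 1, 2, 2, 3], "v": [10, 11, 20, 21, 30]})
print(find_last_case_id(_toy))  # expected rows: (1, 11), (2, 21), (3, 30)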
def df_fillna(df,col,method=None):
    if method is None:
        return df
    if method == "forward":
        # Forward-fill propagates the previous non-null value down the column;
        # use with_columns so the other columns are kept.
        df = df.with_columns(pl.col(col).fill_null(strategy='forward'))
    else:
        # Any other `method` is treated as a literal fill value.
        df = df.with_columns(pl.col(col).fill_null(method).alias(col))
    return df
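# Example (toy data, illustrative only): method=None leaves the frame unchanged,
# "forward" propagates the previous non-null value, any other value fills literally.
_toy = pl.DataFrame({"x": [1, None, 3]})
print(df_fillna(_toy, "x", method="forward"))  # x -> [1, 1, 3]
print(df_fillna(_toy, "x", method=0))          # x -> [1, 0, 3]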
def one_hot_encoder(df,col,unique):
    # A binary column needs only one indicator; otherwise emit one column per category.
    if len(unique)==2:
df=df.with_columns((pl.col(col)==unique[0]).cast(pl.Int8).alias(f"{col}_{unique[0]}"))
else:
for idx in range(len(unique)):
df=df.with_columns((pl.col(col)==unique[idx]).cast(pl.Int8).alias(f"{col}_{unique[idx]}"))
return df.drop(col)
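# Example (toy data, illustrative only): a two-value column yields a single
# indicator named "<col>_<first value>"; nulls stay null after the comparison.
_toy = pl.DataFrame({"c": ["a", "b", "a", None]})
print(one_hot_encoder(_toy, "c", ["a", "b"]))  # column "c_a": [1, 0, 1, null]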
def last_features_merge(feats,last_df,last_features=[]):
    # Join per-case "latest value" features (with per-column fill rules) onto feats.
    last_df=last_df.select(['case_id']+[last[0] for last in last_features])
for last in last_features:
col,fill=last
last_df=df_fillna(last_df,col,method=fill)
feats=feats.join(last_df,on='case_id',how='left')
return feats
def group_features_merge(feats,group_df,group_features=[],group_name='applprev2'):
    # Aggregate child-table columns per case_id (string columns are one-hot
    # encoded first) and left-join the statistics onto feats.
    group_df=group_df.select(['case_id']+[g[0] for g in group_features])
for group in group_features:
if group_df[group[0]].dtype==pl.String:
col,fill,one_hot=group
group_df=df_fillna(group_df,col,method=fill)
if one_hot==None:
group_df=group_df.drop(col)
else:
group_df=one_hot_encoder(group_df,col,one_hot)
for value in one_hot:
new_col=f"{col}_{value}"
                feat=group_df.group_by('case_id').agg(
pl.mean(new_col).alias(f"mean_{group_name}_{new_col}"),
pl.std(new_col).alias(f"std_{group_name}_{new_col}"),
pl.count(new_col).alias(f"count_{group_name}_{new_col}"),
)
feats=feats.join(feat,on='case_id',how='left')
else:
col,fill=group
group_df=df_fillna(group_df,col,method=fill)
feat=group_df.group_by('case_id').agg( pl.max(col).alias(f"max_{group_name}_{col}"),
pl.mean(col).alias(f"mean_{group_name}_{col}"),
pl.median(col).alias(f"median_{group_name}_{col}"),
pl.std(col).alias(f"std_{group_name}_{col}"),
pl.min(col).alias(f"min_{group_name}_{col}"),
pl.count(col).alias(f"count_{group_name}_{col}"),
pl.sum(col).alias(f"sum_{group_name}_{col}"),
pl.n_unique(col).alias(f"n_unique_{group_name}_{col}"),
pl.first(col).alias(f"first_{group_name}_{col}"),
pl.last(col).alias(f"last_{group_name}_{col}")
)
feats=feats.join(feat,on='case_id',how='left')
return feats
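# Example (toy data, illustrative only): numeric child-table columns are aggregated
# per case_id into max/mean/median/std/min/count/sum/n_unique/first/last features,
# named "<stat>_<group_name>_<col>", and left-joined onto the base frame.
_toy_feats = pl.DataFrame({"case_id": [1, 2]})
_toy_group = pl.DataFrame({"case_id": [1, 1, 2], "num_group1": [0, 1, 0]})
print(group_features_merge(_toy_feats, _toy_group, [["num_group1", 0]], group_name="toy").columns)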
def set_table_dtypes(df):
    # Cast every column to the dtype recorded in colname2dtype (built above).
    for col in df.columns:
df=df.with_columns(pl.col(col).cast(colname2dtype[col]).alias(col))
return df
def preprocessor(mode='train'):
print(f"{mode} base file after break.number 1")
feats=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_base.csv").pipe(set_table_dtypes)
feats=feats.drop(['date_decision','MONTH','WEEK_NUM'])
print("-"*30)
print(f"{mode} applprev_2 file after break. number:1")
applprev2=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_applprev_2.csv").pipe(set_table_dtypes)
    # In polars, `col != col` evaluates to null (not True) for missing values, so
    # explicit null checks are needed to build the credit-card status flags.
    # No card info at all: both block-reason and contact-type are missing.
    applprev2=applprev2.with_columns(
        (pl.col('cacccardblochreas_147M').is_null() & pl.col('conts_type_509L').is_null())
        .cast(pl.Int8).alias("no_credit")
    )
    # Contact type present but no block reason: a card that was never frozen.
    applprev2=applprev2.with_columns(
        (pl.col('cacccardblochreas_147M').is_null() & pl.col('conts_type_509L').is_not_null())
        .cast(pl.Int8).alias("no_frozen_credit")
    )
    # Any recorded block reason means the card was frozen at some point.
    applprev2=applprev2.with_columns(
        pl.col('cacccardblochreas_147M').is_not_null()
        .cast(pl.Int8).alias("frozen_credit")
    )
applprev2_last=find_last_case_id(applprev2)
"""
这些列有些是要取最新的特征,有些是需要groupby.
联系方式要最新的
看一个人最新状态是不是还没有信用卡
有没有信用卡冻结也考虑一下最新状态吧,反正就一个特征.
信用卡冻结列特征可以从冻结原因那列构造
"""
last_features=[['conts_type_509L','WHATSAPP'],
['no_credit',0],
['no_frozen_credit',0],
['frozen_credit',0]
]
feats=last_features_merge(feats,applprev2_last,last_features)
    group_features=[['cacccardblochreas_147M','a55475b1',\
        ["P19_60_110","P17_56_144","a55475b1","P201_63_60","P127_74_114","P133_119_56","P41_107_150","P23_105_103","P33_145_161"]],
['credacc_cards_status_52L','UNCONFIRMED',\
['BLOCKED','UNCONFIRMED','RENEWED', 'CANCELLED', 'INACTIVE', 'ACTIVE']],
['num_group1',0],
['num_group2',0],
]
feats=group_features_merge(feats,applprev2,group_features,group_name='applprev2')
del applprev2,applprev2_last
gc.collect()
print("-"*30)
print("credit bureau b num 2")
bureau_b_1=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_credit_bureau_b_1.csv").pipe(set_table_dtypes)
bureau_b_2=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_credit_bureau_b_2.csv").pipe(set_table_dtypes)
bureau_b_1_last=find_last_case_id(bureau_b_1,id='case_id')
bureau_b_2_last=find_last_case_id(bureau_b_2,id='case_id')
feats=feats.join(bureau_b_1_last,on='case_id',how='left')
feats=feats.join(bureau_b_2_last,on='case_id',how='left')
del bureau_b_1,bureau_b_1_last,bureau_b_2,bureau_b_2_last
gc.collect()
print(f"{mode} debitcard file after break num 1")
debitcard=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_debitcard_1.csv").pipe(set_table_dtypes)
debitcard_last=find_last_case_id(debitcard,id='case_id')
last_features=[['last180dayaveragebalance_704A',0],
['last180dayturnover_1134A',30000],
['last30dayturnover_651A',0]
]
feats=last_features_merge(feats,debitcard_last,last_features)
group_features=[['num_group1',0]
]
feats=group_features_merge(feats,debitcard,group_features,group_name='debitcard')
del debitcard,debitcard_last
gc.collect()
print(f"{mode} deposit file num 1")
deposit=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_deposit_1.csv").pipe(set_table_dtypes)
    # Aggregate every numeric column of the deposit table (index 0 is case_id).
    for idx in range(1,len(deposit.columns)):
col=deposit.columns[idx]
column_type = deposit[col].dtype
is_numeric = (column_type == pl.datatypes.Int64) or (column_type == pl.datatypes.Float64)
if is_numeric:
feat=deposit.group_by('case_id').agg( pl.max(col).alias(f"max_deposit_{col}"),
pl.mean(col).alias(f"mean_deposit_{col}"),
pl.median(col).alias(f"median_deposit_{col}"),
pl.std(col).alias(f"std_deposit_{col}"),
pl.min(col).alias(f"min_deposit_{col}"),
pl.count(col).alias(f"count_deposit_{col}"),
pl.sum(col).alias(f"sum_deposit_{col}"),
pl.n_unique(col).alias(f"n_unique_deposit_{col}"),
pl.first(col).alias(f"first_deposit_{col}"),
pl.last(col).alias(f"last_deposit_{col}")
)
feats=feats.join(feat,on='case_id',how='left')
del deposit
gc.collect()
print(f"{mode} other file after break number 1")
other=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_other_1.csv").pipe(set_table_dtypes)
other_last=find_last_case_id(other)
last_features=[['amtdepositbalance_4809441A',0]
]
feats=last_features_merge(feats,other_last,last_features)
group_features=[['amtdebitincoming_4809443A',0],
['amtdebitoutgoing_4809440A',0],
['amtdepositincoming_4809444A',0],
['amtdepositoutgoing_4809442A',0]
]
feats=group_features_merge(feats,other,group_features,group_name='other')
del other,other_last
gc.collect()
print("person 1 num 1")
person1=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_person_1.csv").pipe(set_table_dtypes)
person1=person1.drop(['birthdate_87D','childnum_185L','gender_992L','housingtype_772L','isreference_387L','maritalst_703L','role_993L'])
person1=person1.select(['case_id','contaddr_matchlist_1032L','contaddr_smempladdr_334L','empl_employedtotal_800L','language1_981M',
'persontype_1072L','persontype_792L','remitter_829L','role_1084L','safeguarantyflag_411L','sex_738L'])
person1_last=find_last_case_id(person1)
feats=feats.join(person1_last,on='case_id',how='left')
del person1,person1_last
gc.collect()
print(f"{mode} person2 file after break number 1")
person2=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_person_2.csv").pipe(set_table_dtypes)
person2=person2.drop(['addres_role_871L','empls_employedfrom_796D','relatedpersons_role_762T'])
person2=person2.drop(['addres_district_368M','addres_zip_823M','empls_employer_name_740M'])
group_features=[['conts_role_79M','a55475b1',
['a55475b1', 'P38_92_157', 'P7_147_157', 'P177_137_98', 'P125_14_176',
'P125_105_50', 'P115_147_77', 'P58_79_51','P124_137_181', 'P206_38_166', 'P42_134_91']
],
['empls_economicalst_849M','a55475b1',
['a55475b1', 'P164_110_33', 'P22_131_138', 'P28_32_178','P148_57_109', 'P7_47_145', 'P164_122_65', 'P112_86_147','P82_144_169', 'P191_80_124']
],
['num_group1',0],
['num_group2',0],
]
    feats=group_features_merge(feats,person2,group_features,group_name='person2')
    del person2
gc.collect()
print(f"static_0 file num 2(3)")
static_0_0=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_0.csv").pipe(set_table_dtypes)
static_0_1=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_1.csv").pipe(set_table_dtypes)
static=pl.concat([static_0_0,static_0_1],how="vertical_relaxed")
if mode=='test':
static_0_2=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_0_2.csv").pipe(set_table_dtypes)
static=pl.concat([static,static_0_2],how="vertical_relaxed")
feats=feats.join(static,on='case_id',how='left')
del static,static_0_0,static_0_1
gc.collect()
print(f"{mode} static_cb_file after break num 1")
static_cb=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_static_cb_0.csv").pipe(set_table_dtypes)
static_cb=static_cb.drop(['assignmentdate_4955616D', 'dateofbirth_342D','for3years_128L',
'for3years_504L','for3years_584L','formonth_118L','formonth_206L','formonth_535L',
'forquarter_1017L', 'forquarter_462L','forquarter_634L','fortoday_1092L',
'forweek_1077L','forweek_528L','forweek_601L','foryear_618L','foryear_818L','foryear_850L','pmtaverage_4955615A','pmtcount_4955617L','riskassesment_302T','riskassesment_940T'])
static_cb=static_cb.drop(['birthdate_574D','dateofbirth_337D',
'assignmentdate_238D','assignmentdate_4527235D',
'responsedate_1012D','responsedate_4527233D','responsedate_4917613D',
])
last_features=[ ['contractssum_5085716L',0],
['days120_123L',0],
['days180_256L',0],
['days30_165L',0],
['days360_512L',1],
['days90_310L',0],
['description_5085714M','a55475b1'],
['education_88M','a55475b1'],
['firstquarter_103L',0],
['secondquarter_766L',0],
['thirdquarter_1082L',0],
['fourthquarter_440L',0],
['maritalst_385M','a55475b1'],
['numberofqueries_373L',1],
['pmtaverage_3A',0],
['pmtcount_693L', 6],
['pmtscount_423L',6.0],
['pmtssum_45A',0],
['requesttype_4525192L','DEDUCTION_6'],
]
feats=last_features_merge(feats,static_cb,last_features)
feats=feats.with_columns( (pl.col('days180_256L')-pl.col('days120_123L')).alias("daysgap60"))
feats=feats.with_columns( (pl.col('days180_256L')-pl.col('days30_165L')).alias("daysgap150"))
feats=feats.with_columns( (pl.col('days120_123L')-pl.col('days30_165L')).alias("daysgap90"))
feats=feats.with_columns( (pl.col('firstquarter_103L')+pl.col('secondquarter_766L')+pl.col('thirdquarter_1082L')+pl.col('fourthquarter_440L')).alias("totalyear_result"))
del static_cb
gc.collect()
print("-"*30)
print(f"{mode} tax_a file after break num 1")
tax_a=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_a_1.csv").pipe(set_table_dtypes)
group_features=[['amount_4527230A',850],
['num_group1',0]
]
feats=group_features_merge(feats,tax_a,group_features,group_name='tax_a')
del tax_a
gc.collect()
print("-"*30)
print(f"{mode} tax_b file after break num 1")
tax_b=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_b_1.csv").pipe(set_table_dtypes)
group_features=[['amount_4917619A',6885],
['num_group1',0]
]
feats=group_features_merge(feats,tax_b,group_features,group_name='tax_b')
del tax_b
gc.collect()
print("-"*30)
print(f"{mode} tax_c file after break num 1")
tax_c=pl.read_csv(f"/kaggle/input/home-credit-credit-risk-model-stability/csv_files/{mode}/{mode}_tax_registry_c_1.csv").pipe(set_table_dtypes)
    if len(tax_c)==0:
        # The public test split can ship an empty tax_registry_c file; fall back to
        # the train file so the aggregated columns still exist at inference time.
        tax_c=pl.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_tax_registry_c_1.csv").pipe(set_table_dtypes)
tax_c=tax_c.drop(['employername_160M','processingdate_168D'])
group_features=[['pmtamount_36A',850],
['num_group1',0]
]
feats=group_features_merge(feats,tax_c,group_features,group_name='tax_c')
del tax_c
gc.collect()
print("-"*30)
return feats
train_feats=preprocessor(mode='train')
test_feats=preprocessor(mode='test')
train_feats=train_feats.to_pandas()
test_feats=test_feats.to_pandas()
# Impute remaining missing values with the training-set column modes (applied to train and test alike).
mode_values = train_feats.mode().iloc[0]
train_feats = train_feats.fillna(mode_values)
test_feats = test_feats.fillna(mode_values)
print("----------string one hot encoder ****")
for col in test_feats.columns:
n_unique=train_feats[col].nunique()
if n_unique==2 and train_feats[col].dtype=='object':
print(f"one_hot_2:{col}")
unique=train_feats[col].unique()
train_feats[col]=(train_feats[col]==unique[0]).astype(int)
test_feats[col]=(test_feats[col]==unique[0]).astype(int)
elif (n_unique<10) and train_feats[col].dtype=='object':
print(f"one_hot_10:{col}")
unique=train_feats[col].unique()
for idx in range(len(unique)):
            if unique[idx]==unique[idx]:  # NaN != NaN, so this skips the missing-value "category"
train_feats[col+"_"+str(idx)]=(train_feats[col]==unique[idx]).astype(int)
test_feats[col+"_"+str(idx)]=(test_feats[col]==unique[idx]).astype(int)
train_feats.drop([col],axis=1,inplace=True)
test_feats.drop([col],axis=1,inplace=True)
print("----------drop other string or unique value or full null value ****")
drop_cols=[]
for col in test_feats.columns:
if (train_feats[col].dtype=='object') or (test_feats[col].dtype=='object') \
or (train_feats[col].nunique()==1) or train_feats[col].isna().mean()>0.99:
drop_cols+=[col]
drop_cols+=['case_id']
train_feats.drop(drop_cols,axis=1,inplace=True)
test_feats.drop(drop_cols,axis=1,inplace=True)
print(f"len(train_feats):{len(train_feats)},total_features_counts:{len(test_feats.columns)}")
train_feats.head()
def reduce_mem_usage(df, float16_as32=True):
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min,c_max = df[col].min(),df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
if float16_as32:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
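# Quick check (toy data, illustrative only): integers collapse to the narrowest
# dtype that holds their range; floats within float16 range become float32 because
# float16_as32=True guards against float16 precision loss.
_toy = pd.DataFrame({"a": np.arange(100), "b": np.linspace(0.0, 1.0, 100)})
print(reduce_mem_usage(_toy.copy()).dtypes)  # expected: a -> int8, b -> float32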
train_feats = reduce_mem_usage(train_feats)
test_feats = reduce_mem_usage(test_feats)
def pearson_corr(x1,x2):
    """Pearson correlation between two 1-D np.arrays (population statistics, ddof=0)."""
mean_x1=np.mean(x1)
mean_x2=np.mean(x2)
std_x1=np.std(x1)
std_x2=np.std(x2)
pearson=np.mean((x1-mean_x1)*(x2-mean_x2))/(std_x1*std_x2)
return pearson
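# Sanity check (illustrative): pearson_corr matches np.corrcoef, since both use
# population statistics; the hand-rolled version just avoids building the 2x2 matrix.
_rng = np.random.default_rng(Config.seed)
_x = _rng.normal(size=1000)
_y = 0.5*_x + _rng.normal(size=1000)
assert np.isclose(pearson_corr(_x, _y), np.corrcoef(_x, _y)[0, 1])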
# Keep only features whose absolute Pearson correlation with the target exceeds 0.0025.
choose_cols=[]
for col in train_feats.columns:
if col!='target':
pearson=pearson_corr(train_feats[col].values,train_feats['target'].values)
if abs(pearson)>0.0025:
choose_cols.append(col)
print(f"len(choose_cols):{len(choose_cols)},choose_cols:{choose_cols}")
from sklearn.linear_model import LinearRegression
X=train_feats[choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats[choose_cols].copy()
oof_pred_pro=np.zeros((len(X)))
test_pred_pro=np.zeros((Config.num_folds,len(test_X)))
del train_feats,test_feats
gc.collect()
skf = StratifiedKFold(n_splits=Config.num_folds,random_state=Config.seed, shuffle=True)
for fold, (train_index, valid_index) in (enumerate(skf.split(X, y.astype(str)))):
print(f"fold:{fold}")
X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    # A plain linear regressor suffices here: ROC AUC only needs a ranking score.
    model = LinearRegression()
model.fit(X_train,y_train)
oof_pred_pro[valid_index]=model.predict(X_valid)
    # Predict the test set in batches to keep peak memory bounded.
    for idx in range(0,len(test_X),Config.batch_size):
test_pred_pro[fold][idx:idx+Config.batch_size]=model.predict(test_X[idx:idx+Config.batch_size])
del model,X_train, X_valid,y_train, y_valid
gc.collect()
gini=2*roc_auc_score(y.values,oof_pred_pro)-1
print(f"oof_gini:{gini}")
test_preds=test_pred_pro.mean(axis=0)
submission=pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
# Clip scores into [0, 1]; any NaN is replaced with a rough prior of 0.3.
submission['score']=np.clip(np.nan_to_num(test_preds,nan=0.3),0,1)
submission.to_csv("submission.csv",index=None)
submission.head()