# 当前位置：首页 > article >正文
# polars as pl

# article 2025/1/27 4:05:07
import polars as pl  # similar to pandas, but performs better on large datasets
#necessary
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
#metric
from sklearn.metrics import roc_auc_score#导入roc_auc曲线
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD#截断奇异值分解,是一种数据降维的方法
import dill#对对象进行序列化和反序列化(例如保存和加载树模型)
import gc#垃圾回收模块
import time#标准库的时间模块
# Record when this run was trained so a saved model can later be matched to
# the right version.
# time.strftime() formats a time as a string; called without an explicit time
# tuple it formats the current local time.
current_time = time.strftime("%Y-%m-%d %H:%M:%S")
print("this notebook training time is ", current_time)


#config
class Config:
    """Global settings shared by the whole script."""

    # Seed passed to seed_everything() and to the cross-validation splitter.
    seed = 2024
    # Number of cross-validation folds.
    num_folds = 10
    # Name of the label column.
    TARGET_NAME = 'target'
    # The size of the test data is not known in advance, so it is fed to the
    # model in batches of this many rows.
    batch_size = 1000
    
import random  # functions for generating pseudo-random numbers


# Fix the random seeds so that results can be reproduced.
def seed_everything(seed):
    """Seed NumPy's global generator and Python's built-in `random` module."""
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed)
seed_everything(Config.seed)

# Read the dtype recorded for every feature of the training data.
colname2dtype=pd.read_csv("/kaggle/input/home-credit-inconsistent-data-types/colname2dtype.csv")
colname=colname2dtype['Column'].values
dtype=colname2dtype['DataType'].values

# Map the dtype names stored in the csv to polars dtypes.
# NOTE(review): 'Boolean' is mapped to pl.String, not pl.Boolean - presumably
# on purpose so those columns are handled as categories; confirm.
dtype2pl={
    'Int64':pl.Int64,
    'Float64':pl.Float64,
    'String':pl.String,
    'Boolean':pl.String,
}

# Column name -> polars dtype; this replaces the DataFrame read above.
colname2dtype={name:dtype2pl[kind] for name,kind in zip(colname,dtype)}


# Find the columns of a pandas DataFrame whose share of missing values exceeds margin.
def find_df_null_col(df,margin=0.975):
    """Return the names of the columns of `df` whose missing-value ratio is > `margin`.

    Columns are returned in their order of appearance in `df.columns`.
    """
    return [name for name in df.columns if df[name].isna().mean()>margin]
# When a file holds many rows with the same case_id, keep only the last one.
# Useful for files where only the most recent information of a case is wanted.
def find_last_case_id(df,id='case_id'):
    """Keep the last row of every run of consecutive equal `id` values.

    `df` is assumed to be already sorted by `id`. A row is kept when the next
    row carries a different id; the comparison for the final row is null
    (there is no next row), so that row is appended separately.
    """
    final_row=df.tail(1)
    # Difference between each id and the id of the following row.
    id_step=pl.col(id)-pl.col(id).shift(-1)
    run_ends=df.filter(id_step!=0)
    df_last=pl.concat([run_ends,final_row])
    # The competition has many files, so free temporaries promptly.
    del run_ends,final_row
    gc.collect()
    return df_last
# Fill the missing values of column col of table df according to method.
def df_fillna(df,col,method=None):
    """Fill the nulls of column `col` in the polars DataFrame `df`.

    Args:
        df: polars DataFrame.
        col: name of the column to fill.
        method: how to fill.
            * None      - leave the column untouched.
            * "forward" - propagate the previous non-null value forward.
            * any other value - used as the literal replacement (e.g. "NaN"
              to treat missingness as its own category, or 0 for a binary
              column where 0 dominates).

    Returns:
        The DataFrame with all its columns, `col` filled as requested.
    """
    if method is None:
        # Nothing to fill. The previous code fell through to the literal-fill
        # branch here and called fill_null(None).
        return df
    if method=="forward":
        # strategy= is required for a forward fill; a positional 'forward'
        # would be used as the literal fill value. with_columns (not select)
        # keeps the other columns of the table.
        return df.with_columns(pl.col(col).fill_null(strategy='forward').alias(col))
    return df.with_columns(pl.col(col).fill_null(method).alias(col))

# One-hot encode column col of table df. The category list `unique` is passed
# in explicitly so that train and test data receive the same new columns.
def one_hot_encoder(df,col,unique):
    """Replace `col` by Int8 indicator columns named `{col}_{category}`.

    With exactly two categories a single indicator (for the first category)
    is created; otherwise one indicator per category. `col` itself is dropped.
    """
    # Two categories carry one bit of information, so one indicator suffices.
    categories=unique[:1] if len(unique)==2 else unique
    for category in categories:
        indicator=(pl.col(col)==category).cast(pl.Int8)
        df=df.with_columns(indicator.alias(f"{col}_{category}"))
    return df.drop(col)
# last_df holds one (the most recent) row per case_id, so it can be joined
# onto the feature table by case_id directly.
# feats: the overall feature table; last_df: the one-row-per-case table;
# last_features: [column, fill value] pairs to take from last_df.
def last_features_merge(feats,last_df,last_features=[]):
    """Left-join the selected, null-filled columns of `last_df` onto `feats`.

    Nulls can remain in the joined columns for case_ids that have no row in
    `last_df`.
    """
    wanted=['case_id']
    wanted.extend(pair[0] for pair in last_features)
    latest=last_df.select(wanted)
    # Fill the missing values of every selected column.
    for col,fill in last_features:
        latest=df_fillna(latest,col,method=fill)
    return feats.join(latest,on='case_id',how='left')

# feats: the overall feature table; group_df: a table with several rows per
# case_id; group_features: the columns to aggregate; group_name: tag of the
# source csv file, used in the generated column names.
# Pipeline per column: fillna (+ one-hot for strings) -> group_by -> join.
def group_features_merge(feats,group_df,group_features=None,group_name='applprev2'):
    """Aggregate columns of `group_df` per case_id and left-join them onto `feats`.

    Args:
        feats: polars DataFrame with one row per case_id.
        group_df: polars DataFrame that may hold several rows per case_id.
        group_features: list of column specs. For a string column the spec is
            [column, fill value, categories]; categories None means the
            column is dropped, otherwise it is one-hot encoded and mean/std/
            count of every indicator are computed. For any other column the
            spec is [column, fill value] and max/mean/median/std/min/count/
            sum/n_unique/first/last are computed.
        group_name: tag inserted into the names of the generated columns.

    Returns:
        `feats` with the aggregated columns joined on case_id.
    """
    if group_features is None:
        group_features=[]
    # Keep only the requested columns.
    group_df=group_df.select(['case_id']+[g[0] for g in group_features])
    for group in group_features:
        if group_df[group[0]].dtype==pl.String:
            col,fill,one_hot=group
            group_df=df_fillna(group_df,col,method=fill)#filling is the first step
            if one_hot is None:#no encoding requested: just drop the column
                group_df=group_df.drop(col)
                continue
            group_df=one_hot_encoder(group_df,col,one_hot)
            for value in one_hot:
                new_col=f"{col}_{value}"
                # For a two-category list one_hot_encoder creates a single
                # indicator only, so skip categories that have no column.
                if new_col not in group_df.columns:
                    continue
                feat=group_df.group_by('case_id').agg(
                                           pl.mean(new_col).alias(f"mean_{group_name}_{new_col}"),
                                           pl.std(new_col).alias(f"std_{group_name}_{new_col}"),
                                           pl.count(new_col).alias(f"count_{group_name}_{new_col}"),
                                         )
                feats=feats.join(feat,on='case_id',how='left')
        else:#non-string column: fill with `fill`, then aggregate
            col,fill=group
            group_df=df_fillna(group_df,col,method=fill)#filling is the first step
            feat=group_df.group_by('case_id').agg( pl.max(col).alias(f"max_{group_name}_{col}"),
                                   pl.mean(col).alias(f"mean_{group_name}_{col}"),
                                   pl.median(col).alias(f"median_{group_name}_{col}"),
                                   pl.std(col).alias(f"std_{group_name}_{col}"),
                                   pl.min(col).alias(f"min_{group_name}_{col}"),
                                   pl.count(col).alias(f"count_{group_name}_{col}"),
                                   pl.sum(col).alias(f"sum_{group_name}_{col}"),
                                   pl.n_unique(col).alias(f"n_unique_{group_name}_{col}"),
                                   pl.first(col).alias(f"first_{group_name}_{col}"),
                                   pl.last(col).alias(f"last_{group_name}_{col}")
                                 )
            feats=feats.join(feat,on='case_id',how='left')
    return feats

def set_table_dtypes(df):
    """Cast every column of `df` to the polars dtype registered in colname2dtype.

    Raises KeyError when a column name is missing from colname2dtype.
    """
    casts=[pl.col(name).cast(colname2dtype[name]).alias(name) for name in df.columns]
    return df.with_columns(casts)

#"after break" in the progress messages means the meaning of every feature of that file has been studied carefully.
_CSV_DIR="/kaggle/input/home-credit-credit-risk-model-stability/csv_files"


def _read_table(mode,table):
    """Read `{mode}_{table}.csv` of the given split and cast its columns via set_table_dtypes."""
    return pl.read_csv(f"{_CSV_DIR}/{mode}/{mode}_{table}.csv").pipe(set_table_dtypes)


def preprocessor(mode='train'):#mode='train'|'test'
    """Build the per-case feature table for one data split.

    Starts from the base table and joins, on case_id, features derived from
    the applprev_2, credit_bureau_b, debitcard, deposit, other, person_1,
    static_0, static_cb and tax_registry a/b/c files. Depending on the file,
    either the latest row per case is taken (find_last_case_id +
    last_features_merge) or per-case aggregates are computed
    (group_features_merge).

    Args:
        mode: 'train' or 'test'; selects the sub-folder and file prefix.

    Returns:
        A polars DataFrame of features.
    """
    print(f"{mode} base file after break.number 1")
    feats=_read_table(mode,'base')
    feats=feats.drop(['date_decision','MONTH','WEEK_NUM'])
    print("-"*30)
    
    print(f"{mode} applprev_2 file after break. number:1")
    applprev2=_read_table(mode,'applprev_2')
    # Missingness has to be tested with is_null(): in polars a comparison such
    # as col != col yields null (not True) for null values, so the flags below
    # could never become 1 with the comparison idiom.
    no_block_reason=pl.col('cacccardblochreas_147M').is_null()
    no_contact=pl.col('conts_type_509L').is_null()
    applprev2=applprev2.with_columns(
                # no block reason (account never blocked) and no contact type:
                # never applied for a credit card before
                (no_block_reason&no_contact).cast(pl.Int8).alias("no_credit"),
                # no block reason, but a card was applied for
                (no_block_reason&~no_contact).cast(pl.Int8).alias("no_frozen_credit"),
                # a block reason exists, so the account was blocked (and a card exists)
                (~no_block_reason).cast(pl.Int8).alias("frozen_credit"),
                )
    
    applprev2_last=find_last_case_id(applprev2)
    # Some columns are taken from the latest row, others are aggregated:
    #  - the contact type should be the latest one;
    #  - whether the person still has no credit card is a latest-state question;
    #  - the blocked/not-blocked flags are also taken from the latest state;
    #  - the blocked flags are derived from the block-reason column.
    # Only the missing values need filling before the merge; string columns of
    # train and test data are one-hot encoded together later.
    last_features=[['conts_type_509L','WHATSAPP'],#WHATSAPP occurs only once, so missing is treated as WHATSAPP.
                   ['no_credit',0],
                   ['no_frozen_credit',0],
                   ['frozen_credit',0]
                  ]
    feats=last_features_merge(feats,applprev2_last,last_features)
    
    # Aggregated columns: [column, fill value, categories] for strings
    # (categories None = drop the column), [column, fill value] for numbers.
    group_features=[['cacccardblochreas_147M','a55475b1',
                     ["P19_60_110","P17_56_144","a55475b1","P201_63_60","P127_74_114","P133_119_56","P41_107_150",
                      "P23_105_103","P33_145_161"]],#a missing comma used to merge the last two categories into one string
                    ['credacc_cards_status_52L','UNCONFIRMED',
                     ['BLOCKED','UNCONFIRMED','RENEWED', 'CANCELLED', 'INACTIVE', 'ACTIVE']],
                    ['num_group1',0],
                    ['num_group2',0],
                   ]
    feats=group_features_merge(feats,applprev2,group_features,group_name='applprev2')
    del applprev2,applprev2_last
    gc.collect()#release the memory of the tables that are no longer needed
    print("-"*30)
    
    print("credit bureau b num 2")
    bureau_b_1=_read_table(mode,'credit_bureau_b_1')
    bureau_b_2=_read_table(mode,'credit_bureau_b_2')
    bureau_b_1_last=find_last_case_id(bureau_b_1,id='case_id')
    bureau_b_2_last=find_last_case_id(bureau_b_2,id='case_id')
    feats=feats.join(bureau_b_1_last,on='case_id',how='left')
    feats=feats.join(bureau_b_2_last,on='case_id',how='left')

    del bureau_b_1,bureau_b_1_last,bureau_b_2,bureau_b_2_last
    gc.collect()

    print(f"{mode} debitcard file after break num 1")#'openingdate_857D' (debit card opening date) is not used for now.
    debitcard=_read_table(mode,'debitcard_1')
    debitcard_last=find_last_case_id(debitcard,id='case_id')
    
    last_features=[['last180dayaveragebalance_704A',0],#average debit card balance over the last 180 days; filled with the mode 0.
                   ['last180dayturnover_1134A',30000],#debit card turnover over the last 180 days; no clear mode, filled with the median.
                   ['last30dayturnover_651A',0]#filled with the mode 0.
                  ]
    feats=last_features_merge(feats,debitcard_last,last_features)
    group_features=[['num_group1',0]#filled with the mode.
                  ]
    feats=group_features_merge(feats,debitcard,group_features,group_name='debitcard')
    del debitcard,debitcard_last
    gc.collect()
    

    print(f"{mode} deposit file num 1")
    deposit=_read_table(mode,'deposit_1')
    # Aggregate every numeric column; the first column ('case_id') is skipped.
    for col in deposit.columns[1:]:
        column_type = deposit[col].dtype
        is_numeric = (column_type == pl.datatypes.Int64) or (column_type == pl.datatypes.Float64) 
        if is_numeric:
            feat=deposit.group_by('case_id').agg( pl.max(col).alias(f"max_deposit_{col}"),
                                           pl.mean(col).alias(f"mean_deposit_{col}"),
                                           pl.median(col).alias(f"median_deposit_{col}"),
                                           pl.std(col).alias(f"std_deposit_{col}"),
                                           pl.min(col).alias(f"min_deposit_{col}"),
                                           pl.count(col).alias(f"count_deposit_{col}"),
                                           pl.sum(col).alias(f"sum_deposit_{col}"),
                                           pl.n_unique(col).alias(f"n_unique_deposit_{col}"),
                                           pl.first(col).alias(f"first_deposit_{col}"),
                                           pl.last(col).alias(f"last_deposit_{col}")
                                         )
            feats=feats.join(feat,on='case_id',how='left')
    del deposit
    gc.collect()
    
    print(f"{mode} other file after break number 1")
    other=_read_table(mode,'other_1')
    other_last=find_last_case_id(other)
    
    # Only the missing values need filling before the merge.
    last_features=[['amtdepositbalance_4809441A',0]#client deposit balance; filled with the mode 0.
                  ]
    feats=last_features_merge(feats,other_last,last_features)

    group_features=[['amtdebitincoming_4809443A',0],#incoming debit card transaction amount; mode 0.
                     ['amtdebitoutgoing_4809440A',0],#outgoing debit card transaction amount; mode 0.
                     ['amtdepositincoming_4809444A',0], #amount deposited into the client account; mode 0.
                     ['amtdepositoutgoing_4809442A',0]#amount withdrawn from the client account; mode 0.
                   ]
    feats=group_features_merge(feats,other,group_features,group_name='other')
    
    del other,other_last
    gc.collect()
    
    
    print("person 1 num 1")
    person1=_read_table(mode,'person_1')
    # Columns with a missing ratio >= 0.99 are dropped outright.
    person1=person1.drop(['birthdate_87D','childnum_185L','gender_992L','housingtype_772L','isreference_387L','maritalst_703L','role_993L'])                   
    
    person1=person1.select(['case_id','contaddr_matchlist_1032L','contaddr_smempladdr_334L','empl_employedtotal_800L','language1_981M',
                           'persontype_1072L','persontype_792L','remitter_829L','role_1084L','safeguarantyflag_411L','sex_738L'])
    person1_last=find_last_case_id(person1)
    feats=feats.join(person1_last,on='case_id',how='left')
    
    del person1,person1_last
    gc.collect()
    

    print(f"{mode} person2 file after break number 1")
    # The dtypes of the person2 columns were checked to agree between train and test.
    person2=_read_table(mode,'person_2')
    # Missing ratio >= 0.96: not worth filling, drop.
    person2=person2.drop(['addres_role_871L','empls_employedfrom_796D','relatedpersons_role_762T'])
    # District, zip code and employer name are private information and are not used for training.
    person2=person2.drop(['addres_district_368M','addres_zip_823M','empls_employer_name_740M'])
    
    # NOTE(review): this spec is built but never passed to group_features_merge,
    # so no person2 feature reaches `feats`. Looks like an omitted
    # group_features_merge(feats,person2,group_features,group_name='person2')
    # call - confirm before enabling, since it changes the feature set.
    group_features=[['conts_role_79M','a55475b1',#role type of the person's contact.
                     ['a55475b1', 'P38_92_157', 'P7_147_157', 'P177_137_98', 'P125_14_176', 
                      'P125_105_50', 'P115_147_77', 'P58_79_51','P124_137_181', 'P206_38_166', 'P42_134_91']
                    ],
                    ['empls_economicalst_849M','a55475b1',
                    ['a55475b1', 'P164_110_33', 'P22_131_138', 'P28_32_178','P148_57_109', 'P7_47_145', 'P164_122_65', 'P112_86_147','P82_144_169', 'P191_80_124']
                    ],
                    ['num_group1',0],#filled with the mode 0.
                   ['num_group2',0],#filled with the mode 0.
                   ]
    del person2
    gc.collect()
    
    print(f"static_0 file num 2(3)")
    static_0_0=_read_table(mode,'static_0_0')
    static_0_1=_read_table(mode,'static_0_1')
    
    # Vertical concatenation with relaxed dtype matching.
    static=pl.concat([static_0_0,static_0_1],how="vertical_relaxed")
    if mode=='test':#the test data has one more file
        static_0_2=_read_table(mode,'static_0_2')
        static=pl.concat([static,static_0_2],how="vertical_relaxed")
        del static_0_2
    feats=feats.join(static,on='case_id',how='left')
    del static,static_0_0,static_0_1
    gc.collect()
    
    print(f"{mode} static_cb_file after break num 1")
    static_cb=_read_table(mode,'static_cb_0')
    # Columns with a missing ratio >= 0.95 are dropped outright.
    static_cb=static_cb.drop(['assignmentdate_4955616D', 'dateofbirth_342D','for3years_128L',
                            'for3years_504L','for3years_584L','formonth_118L','formonth_206L','formonth_535L',
                           'forquarter_1017L', 'forquarter_462L','forquarter_634L','fortoday_1092L',
                           'forweek_1077L','forweek_528L','forweek_601L','foryear_618L','foryear_818L','foryear_850L','pmtaverage_4955615A','pmtcount_4955617L','riskassesment_302T','riskassesment_940T'])
    static_cb=static_cb.drop(['birthdate_574D','dateofbirth_337D',#both are the client's date of birth; not used for now.
                             'assignmentdate_238D','assignmentdate_4527235D',#tax authority data: assignment dates.
                              'responsedate_1012D','responsedate_4527233D','responsedate_4917613D',#three tax authority response dates.
                             ])
    
    # static_cb is treated as one row per case_id: fill the missing values, then merge.
    last_features=[ ['contractssum_5085716L',0],#total contract value retrieved from external credit bureaus
                    ['days120_123L',0],#credit bureau queries in the last 120 days; 0 is the mode, but not dominant.
                    ['days180_256L',0],#credit bureau queries in the last 180 days; 0 is the mode, but not dominant.
                    ['days30_165L',0],#credit bureau queries in the last 30 days; 0 is more dominant here.
                    ['days360_512L',1],#1 is slightly more frequent than 0.
                    ['days90_310L',0],#0 is slightly more frequent.
                    ['description_5085714M','a55475b1'],#client categorisation by the credit bureau; 10:1 binary split.
                    #['education_1103M','a55475b1'],#client education level from an external source, 5 categories,
                    ['education_88M','a55475b1'],#client education level.
                    ['firstquarter_103L',0],#number of results obtained from the credit bureau in the first quarter
                    ['secondquarter_766L',0],#number of results in the second quarter.
                    ['thirdquarter_1082L',0],#number of results in the third quarter.
                    ['fourthquarter_440L',0],#number of results in the fourth quarter.
                    ['maritalst_385M','a55475b1'],#client marital status.
                    #['maritalst_893M', 'a55475b1'],#client marital status.
                    ['numberofqueries_373L',1],#number of queries to the credit bureau.
                    ['pmtaverage_3A',0],#average tax deduction
                    #['pmtaverage_4527227A',7222.2],#average tax deduction.
                    #['pmtcount_4527229L', 6],#number of tax deductions
                    ['pmtcount_693L', 6],#number of tax deductions
                    ['pmtscount_423L',6.0],#number of tax deduction payments.
                    ['pmtssum_45A',0],#total tax deductions of the client.
                    ['requesttype_4525192L','DEDUCTION_6'],#tax authority request type
                  ]
    feats=last_features_merge(feats,static_cb,last_features)
    # Credit bureau queries within the gaps between the look-back windows.
    feats=feats.with_columns( (pl.col('days180_256L')-pl.col('days120_123L')).alias("daysgap60"))
    feats=feats.with_columns( (pl.col('days180_256L')-pl.col('days30_165L')).alias("daysgap150"))
    feats=feats.with_columns( (pl.col('days120_123L')-pl.col('days30_165L')).alias("daysgap90"))
    # Number of results over the whole year.
    feats=feats.with_columns( (pl.col('firstquarter_103L')+pl.col('secondquarter_766L')+pl.col('thirdquarter_1082L')+pl.col('fourthquarter_440L')).alias("totalyear_result"))
    
    del static_cb
    gc.collect()
    print("-"*30)
    
    print(f"{mode} tax_a file after break num 1")
    tax_a=_read_table(mode,'tax_registry_a_1')
    # The employer name is private (and probably encrypted) so it is not used; recorddate_4527225D is not used for now.
    group_features=[['amount_4527230A',850],#tax deduction amount in the government registry; missing values filled with the mode 850
                     ['num_group1',0]
                   ]
    feats=group_features_merge(feats,tax_a,group_features,group_name='tax_a')
    del tax_a
    gc.collect()
    print("-"*30)
    
    print(f"{mode} tax_b file after break num 1")
    tax_b=_read_table(mode,'tax_registry_b_1')
    # The employer name is private and must not be used for training; 'deductiondate_4917603D' is not used for now.
    group_features=[['amount_4917619A',6885],#tax deduction amount tracked by the government registry; missing values filled with the mode
                    ['num_group1',0]
                  ]
    feats=group_features_merge(feats,tax_b,group_features,group_name='tax_b')
    del tax_b
    gc.collect()
    print("-"*30)
    
    print(f"{mode} tax_c file after break num 1")
    tax_c=_read_table(mode,'tax_registry_c_1')
    if len(tax_c)==0:
        # The file of this split is empty: fall back to the train file.
        # NOTE(review): presumably done so that the same columns are produced - confirm.
        tax_c=_read_table('train','tax_registry_c_1')
        
    # employername_160M: employer name, private, not used. processingdate_168D: date the tax deduction was processed.
    tax_c=tax_c.drop(['employername_160M','processingdate_168D'])
    
    group_features=[['pmtamount_36A',850],#tax deduction amount for credit bureau payments; filled with the mode 850
                    ['num_group1',0]#0 is the mode, but not dominant.
                  ]
    feats=group_features_merge(feats,tax_c,group_features,group_name='tax_c')
    del tax_c
    gc.collect()
    print("-"*30)
    
    return feats
# Build the polars feature tables of both splits, then switch to pandas.
train_feats=preprocessor(mode='train')
test_feats=preprocessor(mode='test')

train_feats,test_feats=train_feats.to_pandas(),test_feats.to_pandas()

# Most frequent value of every training column (first row of DataFrame.mode()).
mode_values = train_feats.mode().iloc[0]
# Fill the missing values of both splits with the training modes.
train_feats = train_feats.fillna(value=mode_values)
test_feats = test_feats.fillna(value=mode_values)


# One-hot encode the string feature columns.
print("----------string one hot encoder ****")
for col in test_feats.columns:
    # Only string (object) columns are encoded; numeric columns are left as is.
    if train_feats[col].dtype!='object':
        continue
    n_unique=train_feats[col].nunique()
    if n_unique==2:
        # Two categories (like a gender column): one 0/1 indicator for an
        # arbitrarily chosen category is enough.
        print(f"one_hot_2:{col}")
        unique=train_feats[col].unique()
        reference=unique[0]
        train_feats[col]=(train_feats[col]==reference).astype(int)
        test_feats[col]=(test_feats[col]==reference).astype(int)
    elif n_unique<10:
        # Memory is limited, so only columns with fewer than 10 categories are expanded.
        print(f"one_hot_10:{col}")
        unique=train_feats[col].unique()
        for idx,category in enumerate(unique):
            if category!=category:#NaN is the only value unequal to itself: skip it
                continue
            indicator_name=col+"_"+str(idx)
            train_feats[indicator_name]=(train_feats[col]==category).astype(int)
            test_feats[indicator_name]=(test_feats[col]==category).astype(int)
        train_feats.drop(columns=[col],inplace=True)
        test_feats.drop(columns=[col],inplace=True)
print("----------drop other string or unique value or full null value ****")
# Drop remaining string columns, constant columns and columns that are more than 99% missing.
drop_cols=[
    col for col in test_feats.columns
    if train_feats[col].dtype=='object'
    or test_feats[col].dtype=='object'
    or train_feats[col].nunique()==1
    or train_feats[col].isna().mean()>0.99
]
# 'case_id' carries no useful information.
drop_cols.append('case_id')
train_feats.drop(columns=drop_cols,inplace=True)
test_feats.drop(columns=drop_cols,inplace=True)
print(f"len(train_feats):{len(train_feats)},total_features_counts:{len(test_feats.columns)}")
train_feats.head()


# Walk over all columns of df and shrink their dtypes to reduce memory usage.
def reduce_mem_usage(df, float16_as32=True):
    """Downcast the non-object columns of `df` in place and return `df`.

    Columns whose dtype name starts with 'int' get the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Every other non-object column is treated as floating point: values
    strictly inside the float16 range become float32 (or float16 when
    `float16_as32` is false), values strictly inside the float32 range become
    float32, anything else float64. Memory usage before and after is printed.
    """
    bytes_per_mb = 1024**2
    # memory_usage() is per column; summed and converted B -> MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    half_range = np.finfo(np.float16)
    single_range = np.finfo(np.float32)

    for name in df.columns:
        kind = df[name].dtype
        if kind == object:
            continue
        lowest, highest = df[name].min(), df[name].max()

        if str(kind).startswith('int'):
            # Smallest integer type whose range strictly contains the values.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            continue

        if lowest > half_range.min and highest < half_range.max:
            # float32 keeps more precision than float16 when requested.
            target = np.float32 if float16_as32 else np.float16
        elif lowest > single_range.min and highest < single_range.max:
            target = np.float32
        else:
            target = np.float64
        df[name] = df[name].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Relative reduction compared with the starting memory.
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
# Shrink the dtypes of both feature tables (train first, then test).
train_feats, test_feats = reduce_mem_usage(train_feats), reduce_mem_usage(test_feats)

def pearson_corr(x1,x2):
    """Pearson correlation coefficient of two equally long np.array inputs.

    Computed as the mean product of the deviations from the means divided by
    the product of the (population) standard deviations.
    """
    deviation_x1=x1-np.mean(x1)
    deviation_x2=x2-np.mean(x2)
    covariance=np.mean(deviation_x1*deviation_x2)
    return covariance/(np.std(x1)*np.std(x2))
# Keep the features whose absolute Pearson correlation with the target
# exceeds 0.0025; they are the inputs of the regression model.
choose_cols=[
    col for col in train_feats.columns
    if col!='target'
    and abs(pearson_corr(train_feats[col].values,train_feats['target'].values))>0.0025
]
print(f"len(choose_cols):{len(choose_cols)},choose_cols:{choose_cols}")


#mean_gini:0.5428968427934477
from sklearn.linear_model import LinearRegression

# Model inputs: selected features of both splits and the training labels.
X=train_feats.loc[:,choose_cols].copy()
y=train_feats[Config.TARGET_NAME].copy()
test_X=test_feats.loc[:,choose_cols].copy()
# Out-of-fold predictions for the training rows.
oof_pred_pro=np.zeros(len(X))
# One row of test predictions per fold.
test_pred_pro=np.zeros((Config.num_folds,len(test_X)))
# The full feature tables are no longer needed.
del train_feats,test_feats
gc.collect()

# Stratified cross-validation with Config.num_folds folds.
skf = StratifiedKFold(n_splits=Config.num_folds,shuffle=True,random_state=Config.seed)

for fold,(train_index,valid_index) in enumerate(skf.split(X,y.astype(str))):
    print(f"fold:{fold}")

    X_train,y_train=X.iloc[train_index],y.iloc[train_index]
    X_valid=X.iloc[valid_index]

    # Fit a linear regression model on the training part of the fold.
    model=LinearRegression().fit(X_train,y_train)

    oof_pred_pro[valid_index]=model.predict(X_valid)
    # Predict the test data batch by batch.
    for start in range(0,len(test_X),Config.batch_size):
        stop=start+Config.batch_size
        test_pred_pro[fold][start:stop]=model.predict(test_X[start:stop])
    # Drop the model and the fold data as soon as they are no longer needed.
    del model,X_train,X_valid,y_train
    gc.collect()
gini=2*roc_auc_score(y.values,oof_pred_pro)-1
print(f"mean_gini:{gini}")

# Average the per-fold test predictions.
test_preds=np.mean(test_pred_pro,axis=0)
submission=pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
# Replace NaN predictions by 0.3, then clip the scores to [0, 1].
finite_preds=np.nan_to_num(test_preds,nan=0.3)
submission['score']=np.clip(finite_preds,0,1)
submission.to_csv("submission.csv",index=None)
submission.head()