```python
import pandas as pd
import numpy as np
import json
from lightgbm import LGBMClassifier,log_evaluation,early_stopping
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
class Config:
    """Global settings shared by the whole training script."""

    # Seed passed to seed_everything, the CV splitter and the LightGBM params.
    seed = 2024
    # Number of stratified cross-validation folds.
    num_folds = 10
    # Name of the target column in the training DataFrame.
    TARGET_NAME = 'label'
import random
def seed_everything(seed: int) -> None:
    """Seed NumPy's global RNG and Python's ``random`` module.

    Args:
        seed: Value handed unchanged to ``np.random.seed`` and
            ``random.seed`` so that both generators restart from a
            known state.
    """
    np.random.seed(seed)
    random.seed(seed)
seed_everything(Config.seed)

# Root of the input directory; every data file below is addressed relative to it.
path = '/kaggle/input/'


def _load_json(relative_name):
    """Parse and return the JSON document stored at ``path + relative_name``."""
    # Be explicit about UTF-8 instead of relying on the locale default; this
    # also matches how baseline.json is written at the end of the script.
    with open(path + relative_name, encoding='utf-8') as f:
        return json.load(f)


# Author records; each value carries 'normal_data' and 'outliers' id lists.
train_author = _load_json("whoiswho-ind-kdd-2024/IND-WhoIsWho/train_author.json")
# Lookup from an id to its record (title, abstract, keywords, authors, year).
pid_to_info = _load_json("whoiswho-ind-kdd-2024/IND-WhoIsWho/pid_to_info_all.json")
# Author records to score; each value carries a 'papers' id list.
valid_author = _load_json("whoiswho-ind-kdd-2024/IND-WhoIsWho/ind_valid_author.json")
# Nested mapping used as the submission template; its leaves are overwritten
# with predictions before it is written back out.
submission = _load_json("whoiswho-ind-kdd-2024/IND-WhoIsWho/ind_valid_author_submit.json")
def _paper_features(feat, default_year=2000):
    """Turn one record from ``pid_to_info`` into a 6-value numeric feature row.

    Args:
        feat: Mapping with 'title', 'abstract', 'keywords' and 'authors'
            entries that support ``len()``, plus an optional 'year'.
        default_year: Year used when 'year' is missing or cannot be
            converted with ``int()``.

    Returns:
        ``[len(title), len(abstract), len(keywords), len(authors),
        len(keywords), year]``.
    """
    # Only the year conversion is allowed to fail softly. A broken length
    # field is a data error and still propagates, as it did before.
    try:
        year = int(feat['year'])
    except (KeyError, TypeError, ValueError):
        year = default_year

    keyword_count = len(feat['keywords'])
    return [
        len(feat['title']),
        len(feat['abstract']),
        keyword_count,
        len(feat['authors']),
        # NOTE(review): the keyword count is emitted twice, exactly as in the
        # original feature layout. Possibly another field was intended here;
        # confirm before changing, since train and validation rows must agree.
        keyword_count,
        year,
    ]


# ---- training set -------------------------------------------------------
train_feats = []
labels = []
for person_info in train_author.values():
    # Per author: 'normal_data' entries first (label 1), then 'outliers' (label 0).
    for label, group in ((1, 'normal_data'), (0, 'outliers')):
        for text_id in person_info[group]:
            train_feats.append(_paper_features(pid_to_info[text_id]))
            labels.append(label)

train_feats = np.array(train_feats)
labels = np.array(labels)
print(f"train_feats.shape:{train_feats.shape},labels.shape:{labels.shape}")
print(f"np.mean(labels):{np.mean(labels)}")

# Feature columns keep pandas' default integer names (0..5); the target is
# stored under the same name that fit_and_predict reads back.
train_feats = pd.DataFrame(train_feats)
train_feats[Config.TARGET_NAME] = labels
train_feats.head()  # notebook preview; no effect when run as a script

# ---- validation set -----------------------------------------------------
# Row order follows valid_author's iteration order; the submission step
# later assigns predictions by position.
valid_feats = []
for person_info in valid_author.values():
    for text_id in person_info['papers']:
        valid_feats.append(_paper_features(pid_to_info[text_id]))

valid_feats = np.array(valid_feats)
print(f"valid_feats.shape:{valid_feats.shape}")
valid_feats = pd.DataFrame(valid_feats)
valid_feats.head()  # notebook preview; no effect when run as a script
# Feature columns fed to the model: every column of the validation frame
# (the training frame additionally holds the target column).
choose_cols = list(valid_feats.columns)


def fit_and_predict(model, train_feats=train_feats, test_feats=valid_feats, name=0):
    """Fit ``model`` with stratified K-fold CV and predict the test set per fold.

    Args:
        model: Estimator exposing ``fit(X, y, eval_set=..., callbacks=...)``
            and ``predict_proba``. The same object is refit in every fold.
        train_feats: DataFrame holding the ``choose_cols`` features and the
            ``Config.TARGET_NAME`` column. The default is the module-level
            frame as it existed when this function was defined.
        test_feats: DataFrame holding the ``choose_cols`` features.
        name: Tag used only in the progress messages.

    Returns:
        Tuple ``(oof_pred_pro, test_pred_pro)``:
        ``oof_pred_pro`` has shape ``(len(train_feats), 2)`` and holds the
        out-of-fold class probabilities; ``test_pred_pro`` has shape
        ``(Config.num_folds, len(test_feats), 2)`` with one slice per fold.
    """
    X = train_feats[choose_cols].copy()
    y = train_feats[Config.TARGET_NAME].copy()
    test_X = test_feats[choose_cols].copy()

    # Two probability columns per row, i.e. a binary classifier is expected.
    oof_pred_pro = np.zeros((len(X), 2))
    test_pred_pro = np.zeros((Config.num_folds, len(test_X), 2))

    skf = StratifiedKFold(n_splits=Config.num_folds, random_state=Config.seed, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y.astype(str))):
        print(f"name:{name},fold:{fold}")
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        # The held-out fold doubles as the early-stopping evaluation set.
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            callbacks=[log_evaluation(100), early_stopping(100)],
        )
        oof_pred_pro[valid_index] = model.predict_proba(X_valid)
        test_pred_pro[fold] = model.predict_proba(test_X)

    # Overall AUC computed on the stitched out-of-fold positive-class scores.
    print(f"roc_auc:{roc_auc_score(y.values,oof_pred_pro[:,1])}")
    return oof_pred_pro, test_pred_pro
# Keyword arguments for LGBMClassifier. "verbose" used to be listed twice
# with the same value; the duplicate key has been dropped (the resulting
# dict is unchanged).
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 12,
    "learning_rate": 0.05,
    # Upper bound only: fit_and_predict passes an early_stopping callback.
    "n_estimators": 3072,
    "colsample_bytree": 0.9,
    "colsample_bynode": 0.9,
    "verbose": -1,
    "random_state": Config.seed,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "max_bin": 255,
}
# Run the cross-validated LightGBM fit; keep both the out-of-fold and the
# per-fold test probabilities.
lgb_oof_pred_pro, lgb_test_pred_pro = fit_and_predict(
    model=LGBMClassifier(**lgb_params),
    name='lgb',
)
# lgb_test_pred_pro is indexed (fold, row, class): average over the folds,
# then keep the second probability column (index 1) for every row.
test_preds = np.mean(lgb_test_pred_pro, axis=0)[:, 1]
# Predictions are assigned to the template purely by position, so the
# template must offer exactly one slot per prediction. Fail loudly instead
# of writing a silently misaligned file.
# NOTE(review): this also assumes the template lists its entries in the same
# order in which valid_author was traversed to build valid_feats — confirm.
slot_count = sum(len(entries) for entries in submission.values())
if slot_count != len(test_preds):
    raise ValueError(
        f"submission template has {slot_count} entries but "
        f"{len(test_preds)} predictions were produced"
    )

cnt = 0
for entries in submission.values():
    for entry_key in entries:
        # float() yields a plain Python float, which json can always encode.
        entries[entry_key] = float(test_preds[cnt])
        cnt += 1

with open('baseline.json', 'w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=4)