import pandas as pd
import numpy as np
import polars as pl
from collections import Counter,defaultdict
import re
from scipy.stats import skew, kurtosis
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
import random
seed=2024
np.random.seed(seed)
random.seed(seed)
import warnings
warnings.filterwarnings('ignore')
def get_Essays(df):
    # Reconstruct the final essay text of one writer by replaying the keystroke log.
    USER_ID = df["id"].iloc[0]
    textInputDf = df[['activity', 'cursor_position', 'text_change']]
    currTextInput = textInputDf[textInputDf.activity != 'Nonproduction']
    essayText = ""
    for Input in currTextInput.values:
        # Input = (activity, cursor_position, text_change)
        if Input[0] == 'Replace':
            replaceTxt = Input[2].split(' => ')
            essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + \
                essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
            continue
        if Input[0] == 'Paste':
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
            continue
        if Input[0] == 'Remove/Cut':
            essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
            continue
        if "M" in Input[0]:
            # 'Move From [a, b] To [c, d]' events: relocate the span [a, b) so it starts at c.
            croppedTxt = Input[0][10:]
            splitTxt = croppedTxt.split(' To ')
            valueArr = [item.split(', ') for item in splitTxt]
            moveData = (int(valueArr[0][0][1:]),
                        int(valueArr[0][1][:-1]),
                        int(valueArr[1][0][1:]),
                        int(valueArr[1][1][:-1]))
            if moveData[0] != moveData[2]:
                if moveData[0] < moveData[2]:
                    essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + \
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                else:
                    essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + \
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
            continue
        # Default case: plain 'Input' events insert text_change at the cursor position.
        essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
    return USER_ID, essayText
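# Hypothetical usage sketch (not part of the pipeline): for a one-writer log,
# get_Essays returns (id, reconstructed_text). A single anonymised 'Input' event that
# puts "q" at cursor position 1 reconstructs to the essay "q":
#   _id, text = get_Essays(pd.DataFrame({'id': ['x'], 'activity': ['Input'],
#                                         'cursor_position': [1], 'text_change': ['q']}))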
AGGREGATIONS = ['count','min','max','first','last', 'median','sum','std']
def word_feats(df):
    essay_df = df
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!|\\,', x))
    essay_df = essay_df.explode('word')
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    word_df = essay_df[essay_df['word_len'] != 0]
    word_agg_df = word_df[['id', 'word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df
def sent_feats(df):
    essay_df = df
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!', x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n', '').strip())
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    df = essay_df[essay_df.sent_len != 0].reset_index(drop=True)
    sent_agg_df = pd.concat(
        [df[['id', 'sent_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count": "sent_count"})
    return sent_agg_df
def parag_feats(df):
    essay_df = df
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x))
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df['paragraph_sent_count'] = essay_df['paragraph'].apply(lambda x: len(re.split('\\.|\\?|\\!', x)))
    df = essay_df[essay_df.paragraph_len > 2].reset_index(drop=True)
    paragraph_agg_df = pd.concat(
        [df[['id', 'paragraph_len']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS),
         df[['id', 'paragraph_sent_count']].groupby(['id']).agg(AGGREGATIONS)
         ], axis=1
    )
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count", "paragraph_sent_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count": "paragraph_count"})
    return paragraph_agg_df
def ARI(txt):
    # Automated Readability Index: 4.71*(characters/words) + 0.5*(words/sentences) - 21.43
    characters = len(txt)
    words = len(re.split(' |\\n|\\.|\\?|\\!|\\,', txt))
    sentence = len(re.split('\\.|\\?|\\!', txt))
    ari_score = 4.71 * (characters / words) + 0.5 * (words / sentence) - 21.43
    return ari_score
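# Worked example: a text with 100 characters, 20 word tokens and 2 sentence tokens gives
# ARI = 4.71*(100/20) + 0.5*(20/2) - 21.43 = 23.55 + 5.0 - 21.43 ≈ 7.12.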
"""
http://www.supermagnus.com/mac/Word_Counter/index.html
McAlpine EFLAW© Test
(W + SW) / S
McAlpine EFLAW© Readability
Scale:
1-20: Easy
21-25: Quite Easy
26-29: Mildly Difficult
≥ 30: Very Confusing
S:total sentences
W:total words
"""
def McAlpine_EFLAW(txt):
    words = re.split(' |\\n|\\.|\\?|\\!|\\,', txt)
    W = len(words)
    SW = sum(1 for w in words if 0 < len(w) <= 3)  # mini-words: 3 characters or fewer
    S = len(re.split('\\.|\\?|\\!', txt))
    mcalpine_eflaw_score = (W + SW) / S
    return mcalpine_eflaw_score
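# Worked example: 20 words of which 8 are mini-words, across 2 sentences,
# gives (20 + 8) / 2 = 14.0, i.e. "Easy" on the scale above.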
"""
https://readable.com/readability/coleman-liau-readability-index/
=0.0588*L-0.296*S-15.8
L是每100个单词有多少个字母,S是平均每100个单词有多少句子.
"""
def CLRI(txt):
    characters = len(txt)
    words = len(re.split(' |\\n|\\.|\\?|\\!|\\,', txt))
    sentence = len(re.split('\\.|\\?|\\!', txt))
    L = 100 * characters / words
    S = 100 * sentence / words
    clri_score = 0.0588 * L - 0.296 * S - 15.8
    return clri_score
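# Worked example: 450 letters per 100 words (L) and 5 sentences per 100 words (S)
# give 0.0588*450 - 0.296*5 - 15.8 = 26.46 - 1.48 - 15.8 ≈ 9.18.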
def get_text_chunk_features(df):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    df['text_length'] = df['essay'].apply(len)
    df['num_newlines'] = df['essay'].apply(lambda x: x.count('\n'))
    df['automated_readability_index'] = df['essay'].apply(ARI)
    df['mcalpine_eflaw'] = df['essay'].apply(McAlpine_EFLAW)
    df['coleman_liau'] = df['essay'].apply(CLRI)
    # 'q' is the anonymised character used for alphanumerics in this competition's logs.
    df['repetitiveness'] = df['essay'].apply(lambda x: x.count('q') / max(len(x), 1))
    df['avg_word_length'] = df['essay'].apply(lambda x: sum(len(word) for word in x.split()) / max(1, len(x.split())))
    df['word_lexical_diversity'] = df['essay'].apply(lambda x: len(set(x.split())) / len(x.split()))
    df['num_s_quotations'] = df['essay'].apply(lambda x: x.count("'"))
    df['num_d_quotations'] = df['essay'].apply(lambda x: x.count('"'))
    df['qm_count'] = df['essay'].apply(lambda x: x.count('?'))
    df['excm_count'] = df['essay'].apply(lambda x: x.count('!'))
    df['comma_count'] = df['essay'].apply(lambda x: x.count(','))
    df['dot_count'] = df['essay'].apply(lambda x: x.count('.'))
    df['num_prelist_count'] = df['essay'].apply(lambda x: x.count(':')) + \
        df['essay'].apply(lambda x: x.count(";"))
    # Punctuation placement mistakes: space before '.'/',', or no space after '.'/','.
    df["space_n_dot_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\.', x)))
    df["space_n_comma_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\s\,', x)))
    df["comma_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\,\S', x)))
    df["dot_n_nonspace_mistake"] = df['essay'].apply(lambda x: len(re.findall(r'\.\S', x)))
    df["total_punc_mistake"] = (
        df["space_n_dot_mistake"] +
        df["space_n_comma_mistake"] +
        df["comma_n_nonspace_mistake"] +
        df["dot_n_nonspace_mistake"]
    )
    df["punc_mistake_ratio"] = df["total_punc_mistake"] / (df['qm_count'] +
                                                           df['excm_count'] +
                                                           df['comma_count'] +
                                                           df['dot_count'])
    df['unique_word_count'] = df['essay'].apply(lambda x: len(set(re.findall(r'\w+', x.lower()))))
    df['punctuation_count'] = df['essay'].apply(lambda x: sum(x.count(p) for p in punctuation))
    return df
def standardize_text(txt):
    txt = re.sub(r'\t', '', txt)
    txt = re.sub(r'\n {1,}', '\n', txt)
    txt = re.sub(r' {1,}\n', '\n', txt)
    txt = re.sub(r'\n{2,}', '\n', txt)
    txt = re.sub(r' {2,}', ' ', txt)
    txt = txt.strip()
    return txt
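# Example: standardize_text("q  q \n\n q") collapses repeated blanks and newlines to "q q\nq".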
def TextProcessor(inp_df):
    # Guard against completely empty essays so the text features below never divide by zero.
    for rowi in range(len(inp_df)):
        if inp_df.loc[rowi, "essay"].replace(" ", "") == "":
            inp_df.loc[rowi, "essay"] = "q"
    inp_df["essay"] = inp_df["essay"].apply(lambda x: standardize_text(txt=x))
    print("creating complete features")
    inp_df = get_text_chunk_features(inp_df)
    wf_df = word_feats(inp_df)
    sf_df = sent_feats(inp_df)
    pf_df = parag_feats(inp_df)
    inp_df = inp_df.merge(wf_df, how="left", on="id")
    inp_df = inp_df.merge(sf_df, how="left", on="id")
    inp_df = inp_df.merge(pf_df, how="left", on="id")
    inp_df.drop(["essay", "word", "sent", "paragraph"], axis=1, inplace=True)
    return inp_df
num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count', 'event_id']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
def count_by_values(df, colname, values):
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        fts = fts.join(tmp_df, on='id', how='left')
    return fts
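# e.g. count_by_values(pl_logs, 'activity', activities) returns one row per id with columns
# activity_0_cnt ... activity_4_cnt, the number of events of each activity type.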
def pause_stat_aggregator(df, prefix="iw"):
temp = df.group_by("id").agg(
pl.max('time_diff').alias(f"{prefix}_max_pause_time"),
pl.median('time_diff').alias(f"{prefix}_median_pause_time"),
pl.mean('time_diff').alias(f"{prefix}_mean_pause_time"),
pl.min('time_diff').alias(f"{prefix}_min_pause_time"),
pl.std('time_diff').alias(f"{prefix}_std_pause_time"),
pl.sum('time_diff').alias(f"{prefix}_total_pause_time"),
pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') <= 1)).count().alias(f"{prefix}_pauses_half_sec"),
pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') <= 2)).count().alias(f"{prefix}_pauses_1_sec"),
pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') <= 3)).count().alias(f"{prefix}_pauses_2_sec"),
pl.col('time_diff').filter(pl.col('time_diff') > 3).count().alias(f"{prefix}_pauses_3_sec")
)
return temp
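# The prefixes used below appear to stand for: iw = pauses over all Input/Remove-Cut
# events, and bww / bws / bwp = pauses between words / sentences / paragraphs
# (the first pause of each word_id / sentence_id / paragraph_id group).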
def dev_feats(df):
    print("< Count by values features >")
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left')
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left')
    print("< Numerical columns features >")
    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'),
                                 pl.max(num_cols).suffix('_max'),
                                 )
    feats = feats.join(temp, on='id', how='left')
    print("< Categorical columns features >")
    temp = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left')
    print("< Creating pause features >")
    # Pause = gap (in seconds) between releasing one key and pressing the next.
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.with_columns((pl.col("up_event") == "Space").alias("is_space"))
    temp = temp.with_columns((pl.col("up_event") == ".").alias("is_dot"))
    temp = temp.with_columns((pl.col("up_event") == "Enter").alias("is_enter"))
    # Running counts of spaces / dots / enters delimit word, sentence and paragraph ids.
    temp = temp.with_columns(
        pl.col("is_space").cumsum().shift().backward_fill().over("id").alias("word_id"),
        pl.col("is_dot").cumsum().shift().backward_fill().over("id").alias("sentence_id"),
        pl.col("is_enter").cumsum().shift().backward_fill().over("id").alias("paragraph_id"),
    )
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    iw_df = pause_stat_aggregator(df=temp, prefix="iw")
    bww_df = temp.group_by("id", "word_id").agg(pl.col("time_diff").first())
    bww_df = pause_stat_aggregator(df=bww_df, prefix="bww")
    bws_df = temp.group_by("id", "sentence_id").agg(pl.col("time_diff").first())
    bws_df = pause_stat_aggregator(df=bws_df, prefix="bws")
    bwp_df = temp.group_by("id", "paragraph_id").agg(pl.col("time_diff").first())
    bwp_df = pause_stat_aggregator(df=bwp_df, prefix="bwp")
    feats = (feats.join(iw_df, on="id", how="left")
             .join(bww_df, on="id", how="left")
             .join(bws_df, on="id", how="left")
             .join(bwp_df, on="id", how="left")
             )
    feats = feats.to_pandas()
    return feats
def get_keys_pressed_per_second(logs):
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]
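# keys_per_second = number of Input / Remove/Cut events divided by the total session
# duration in seconds (down_time and up_time are logged in milliseconds).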
def burst_features(df, burst_type="p"):
temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
if burst_type == "p":
temp = temp.with_columns(pl.col('activity').is_in(['Input']))
elif burst_type == "r":
temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
temp = temp.with_columns((pl.col('action_time') / 1000).alias("action_time_s"))
temp = temp.with_columns((pl.col('up_time') / 1000).alias("up_time_s"))
temp = temp.with_columns(pl.when(pl.col("activity")).then(pl.col("activity").rle_id()).alias(f'{burst_type}_burst_group'))
temp = temp.drop_nulls()
temp = temp.group_by("id", f"{burst_type}_burst_group").agg(
pl.count('activity').alias(f'{burst_type}_burst_group_keypress_count'),
pl.sum('action_time_s').alias(f'{burst_type}_burst_group_timespent'),
pl.mean('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_mean'),
pl.std('action_time_s').alias(f'{burst_type}_burst_keypress_timespent_std'),
pl.min('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_first'),
pl.max('up_time_s').alias(f'{burst_type}_burst_keypress_timestamp_last')
)
temp = temp.group_by("id").agg(
pl.sum(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_sum'),
pl.mean(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_mean'),
pl.std(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_std'),
pl.max(f'{burst_type}_burst_group_keypress_count').alias(f'{burst_type}_burst_keypress_count_max'),
pl.sum(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_sum'),
pl.mean(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_mean'),
pl.std(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_std'),
pl.max(f'{burst_type}_burst_group_timespent').alias(f'{burst_type}_burst_timespent_max'),
pl.mean(f'{burst_type}_burst_keypress_timespent_mean').alias(f'{burst_type}_burst_keypress_timespent_mean'),
pl.mean(f'{burst_type}_burst_keypress_timespent_std').alias(f'{burst_type}_burst_keypress_timespent_std'),
pl.min(f'{burst_type}_burst_keypress_timestamp_first').alias(f'{burst_type}_burst_keypress_timestamp_first'),
pl.max(f'{burst_type}_burst_keypress_timestamp_last').alias(f'{burst_type}_burst_keypress_timestamp_last')
)
temp = temp.to_pandas()
return temp
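# In the writing-process literature, P-bursts are stretches of uninterrupted text
# production and R-bursts are stretches of revision; here they are approximated by
# consecutive runs of 'Input' and 'Remove/Cut' events respectively.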
def Preprocessor(logs):
    pl_logs = pl.from_pandas(logs)
    print("< Creating keys_pressed_per_second features >")
    feat_df = get_keys_pressed_per_second(logs)
    feat_df = feat_df.merge(dev_feats(df=pl_logs), how="left", on="id")
    print("< Creating PR-Burst features >")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="p"), how="left", on="id")
    feat_df = feat_df.merge(burst_features(df=pl_logs, burst_type="r"), how="left", on="id")
    essays = logs.groupby("id").apply(get_Essays)
    essays = pd.DataFrame(essays.tolist(), columns=["id", "essay"])
    essay_feats = TextProcessor(essays)
    feat_df = feat_df.merge(essay_feats, how="left", on="id")
    # Ratio features: normalise bursts, pauses, words and events by total session time (up_time_max is in ms).
    feat_df["p_bursts_time_ratio"] = feat_df["p_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["r_bursts_time_ratio"] = feat_df["r_burst_timespent_sum"] / (feat_df["up_time_max"] / 1000)
    feat_df["action_time_ratio"] = feat_df["action_time_sum"] / feat_df["up_time_max"]
    feat_df["pause_time_ratio"] = feat_df["iw_total_pause_time"] / (feat_df["up_time_max"] / 1000)
    feat_df["pausecount_time_ratio"] = feat_df["iw_pauses_2_sec"] / (feat_df["up_time_max"] / 1000)
    feat_df['word_time_ratio'] = feat_df['word_count_max'] / (feat_df["up_time_max"] / 1000)
    feat_df['word_event_ratio'] = feat_df['word_count_max'] / feat_df["event_id_max"]
    feat_df['event_time_ratio'] = feat_df['event_id_max'] / (feat_df["up_time_max"] / 1000)
    feat_df["text_length_time_ratio"] = feat_df["text_length"] / (feat_df["up_time_max"] / 1000)
    return feat_df
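# Preprocessor yields one feature row per writer id, combining typing-speed, pause,
# burst and reconstructed-essay features; it is applied to the train and test logs below.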
train_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
print(f"len(train_logs):{len(train_logs)}")
train_logs=train_logs.sort_values(by=['id', 'down_time'])
train_logs = train_logs.reset_index(drop=True)
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1
train_scores=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")
test_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
print(f"len(test_logs):{len(test_logs)}")
test_logs=test_logs.sort_values(by=['id', 'down_time'])
test_logs = test_logs.reset_index(drop=True)
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1
print("feature engineer")
train_feats = Preprocessor(train_logs)
train_feats = train_feats.merge(train_scores, how="left", on="id")
test_feats = Preprocessor(test_logs)
keys = train_feats.keys().values
# Drop constant columns (fewer than 2 unique values): they carry no signal for the model.
unique_cols = [key for key in keys if train_feats[key].nunique() < 2]
print(f"drop unique_cols:{unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)
test_feats = test_feats.drop(columns=unique_cols)
train_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
test_feats.replace([np.inf, -np.inf], np.nan, inplace=True)
train_feats.drop(['id'],axis=1,inplace=True)
print(f"total_feats_counts:{len(test_feats.keys().values)}")
def make_model():
    cat_params = {'learning_rate': 0.024906985231770738, 'depth': 5,
                  'l2_leaf_reg': 3.7139894959529283, 'subsample': 0.18527466886647015,
                  'colsample_bylevel': 0.6552973951000719, 'min_data_in_leaf': 93,
                  "silent": True, "iterations": 1000, "random_state": seed, "use_best_model": False
                  }
    lgb_params = {'reg_alpha': 1.0894488472899402, 'reg_lambda': 6.290929934336985,
                  'colsample_bytree': 0.6218522907548012, 'subsample': 0.9579924238280629,
                  'learning_rate': 0.0027076430412427566, 'max_depth': 8, 'num_leaves': 947,
                  'min_child_samples': 57, 'n_estimators': 2500, 'metric': 'rmse',
                  'random_state': seed, 'verbosity': -1, 'force_col_wise': True
                  }
    xgb_params = {'max_depth': 2, 'learning_rate': 0.009998236038809146,
                  'n_estimators': 1000, 'min_child_weight': 17,
                  'gamma': 0.1288249858838246, 'subsample': 0.5078057280148618,
                  'colsample_bytree': 0.7355762136239921, 'reg_alpha': 0.670956206987811,
                  'reg_lambda': 0.06818351284100388, 'random_state': seed
                  }
    model1 = LGBMRegressor(**lgb_params)
    model2 = CatBoostRegressor(**cat_params)
    model3 = XGBRegressor(**xgb_params)
    models = []
    models.append((model1, 'lgb'))
    models.append((model2, 'cat'))
    models.append((model3, 'xgb'))
    return models
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
X=train_feats.drop(['score'],axis=1)
y=train_feats['score']
models_and_errors_dict = {}
y_hats = dict()
submission_df = pd.DataFrame(test_feats['id'])
submission_df['score'] = 3.5
X_unseen = test_feats.drop(['id'],axis=1).copy()
num_folds=10
for model, model_type in make_model():
    oof_pred = np.zeros((len(y)))
    y_hats[model_type] = []
    # Stratify folds on the score treated as a categorical label to keep its distribution per fold.
    skf = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y.astype(str))):
        X_train, X_test = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_test = y.iloc[train_index], y.iloc[valid_index]
        X_train_copy, X_test_copy = X_train.copy(), X_test.copy()
        model.fit(X_train_copy, y_train)
        y_hat = model.predict(X_test_copy)
        oof_pred[valid_index] = y_hat
        rmse = RMSE(y_test, y_hat)
        print(f'RMSE: {rmse} on fold {fold}')
        X_unseen_copy = X_unseen.copy()
        y_hats[model_type].append(model.predict(X_unseen_copy))
        if model_type not in models_and_errors_dict:
            models_and_errors_dict[model_type] = []
        models_and_errors_dict[model_type].append((model, rmse, None, None, oof_pred))
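# Optional sanity check (assumes the loop above has completed): overall out-of-fold RMSE per model.
# for name, runs in models_and_errors_dict.items():
#     print(name, RMSE(y.values, runs[-1][4]))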
for key in y_hats.keys():
    if y_hats[key]:
        y_hat_avg = np.mean(y_hats[key], axis=0)
        submission_df['score_' + key] = y_hat_avg
submission_df.head()
blending_weights = {
'lgb': 0.4,
'cat': 0.4,
'xgb': 0.2,
}
# Full out-of-fold predictions from the last recorded run of each model.
lgb_oof_pred = models_and_errors_dict['lgb'][num_folds - 1][4]
cat_oof_pred = models_and_errors_dict['cat'][num_folds - 1][4]
xgb_oof_pred = models_and_errors_dict['xgb'][num_folds - 1][4]
margin = 1000
target = y.values
# Start from an equal-weight blend, then grid-search integer weights with i + j + (margin - i - j) = margin.
current_RMSE = RMSE(target, (lgb_oof_pred + cat_oof_pred + xgb_oof_pred) / 3)
best_i = 0
best_j = 0
for i in range(0, margin):
    for j in range(0, margin - i):
        blend_oof_pred = (i * lgb_oof_pred + j * cat_oof_pred + (margin - i - j) * xgb_oof_pred) / margin
        if RMSE(target, blend_oof_pred) < current_RMSE:
            current_RMSE = RMSE(target, blend_oof_pred)
            best_i = i
            best_j = j
blending_weights['lgb'] = best_i / margin
blending_weights['cat'] = best_j / margin
blending_weights['xgb'] = (margin - best_i - best_j) / margin
print(f"current_RMSE:{current_RMSE},blending_weights:{blending_weights}")
print("blending")
blended_score = np.zeros((len(test_feats)))
for k, v in blending_weights.items():
    blended_score += submission_df['score_' + k] * v
print(f"blended_score:{blended_score}")
submission=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv")
submission['score']=blended_score
submission.to_csv("submission.csv",index=None)
submission.head()