# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle
from fastkaggle import *

The playground series is devoted to tabular datasets and offers the most accessible competitions for beginners to learn and develop skills. This blog/notebook showcases a streamlined approach to achieving relatively good performance. It was written for Kaggle playground series season 3 episode 26 (the last one in 2023). At the time of writing, the submission ranked 149/871 (top 18%); the final private-leaderboard ranking was 332/1663 (top 20%).
Outline:

- use the `fastkaggle` module to quickly set up the competition (download and unzip the data) and submit results later
- preprocess the data: add the additional dataset offered by the competition owner
- modelling: compute the CV score (negative log loss; see the short sketch after this list) of
  - base models: logistic regression, random forest (not supporting `np.nan`)
  - gradient boosting: hist gradient boosting, LightGBM, XGBoost
- cross-validate the best model (LGBM), return the classifiers, and average their predictions on the test data
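All CV scores below use scikit-learn's `'neg_log_loss'` scoring, i.e. the negative of the multi-class log loss computed on predicted class probabilities. As a quick illustration with made-up numbers (not competition data):

```python
# Multi-class log loss on hand-made probabilities; lower is better.
import numpy as np
from sklearn.metrics import log_loss

y_true = [0, 2, 1, 0]                    # e.g. Status encoded as C=0, CL=1, D=2
y_prob = np.array([[0.7, 0.1, 0.2],      # one row of class probabilities per sample
                   [0.2, 0.2, 0.6],
                   [0.3, 0.5, 0.2],
                   [0.8, 0.1, 0.1]])
print(log_loss(y_true, y_prob, labels=[0, 1, 2]))  # ~0.45
```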
Getting set up
comp = 'playground-series-s3e26'
path = setup_comp(comp, install='')
Downloading playground-series-s3e26.zip to /home/xy/git/1principle/posts/gist
100%|██████████████████████████████████████████████████████| 350k/350k [00:00<00:00, 1.56MB/s]
path
Path('playground-series-s3e26')
import pandas as pd
trn_path = path/'train.csv'
trn = pd.read_csv(trn_path)
trn.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 7905 non-null int64
1 N_Days 7905 non-null int64
2 Drug 7905 non-null object
3 Age 7905 non-null int64
4 Sex 7905 non-null object
5 Ascites 7905 non-null object
6 Hepatomegaly 7905 non-null object
7 Spiders 7905 non-null object
8 Edema 7905 non-null object
9 Bilirubin 7905 non-null float64
10 Cholesterol 7905 non-null float64
11 Albumin 7905 non-null float64
12 Copper 7905 non-null float64
13 Alk_Phos 7905 non-null float64
14 SGOT 7905 non-null float64
15 Tryglicerides 7905 non-null float64
16 Platelets 7905 non-null float64
17 Prothrombin 7905 non-null float64
18 Stage 7905 non-null float64
19 Status 7905 non-null object
dtypes: float64(10), int64(3), object(7)
memory usage: 1.2+ MB
Preprocessing data
get_dataset(path, 'joebeachcapital/cirrhosis-patient-survival-prediction', force=True)  # filename = cirrhosis.csv

def preprocess(df, train=True, dropna=False):
    df_ = df.copy()
    df_['is_gen'] = 'Y'
    if train:
        df1 = pd.read_csv(path/'cirrhosis.csv')  # original data from which the competition dataset was synthesized
        df1 = pd.concat([df1.drop('Status', axis=1), df1['Status']], axis=1)  # move Status to the last column, matching df_
        df1['is_gen'] = 'N'
        df1.columns = df_.columns
        df_ = pd.concat([df_, df1], axis=0).reset_index(drop=True)
        if dropna: df_ = df_.dropna()
        df_['Status'] = df_.Status.map({'C': 0, 'CL': 1, 'D': 2})
    return df_

Modelling
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder,PowerTransformer,LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

Models not supporting NaN
df = preprocess(pd.read_csv(trn_path),train=True, dropna=True)
X, y = df.drop('Status', axis=1).iloc[:,1:], df['Status']

ct = make_column_transformer(
(PowerTransformer(), make_column_selector(dtype_include = np.number)),
(OneHotEncoder(drop='if_binary', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
remainder = 'passthrough')

%%time
logit_cv =cross_val_score(
make_pipeline(ct, LogisticRegression(max_iter=1000)),
X,y, scoring = 'neg_log_loss', cv=10, n_jobs=-1)
print(f'logistic regression {-logit_cv.mean()=}')
/home/xy/miniforge3/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [6] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
logistic regression -logit_cv.mean()=0.5110572273752632
CPU times: user 92.2 ms, sys: 20.3 ms, total: 113 ms
Wall time: 1.05 s
%%time
RF_cv = cross_val_score(make_pipeline(ct, RandomForestClassifier(**{'n_estimators': 1000,
'criterion': 'log_loss',
'max_depth': 14,
'min_samples_split': 3,
'min_samples_leaf': 1,
'max_features': 4,
'random_state': 1,
'n_jobs': -1})),
X, y, scoring = 'neg_log_loss', cv = 10, n_jobs=-1)
print(f"random forest {-RF_cv.mean()=}")
random forest -RF_cv.mean()=0.44559989137605854
CPU times: user 2min 23s, sys: 33.3 s, total: 2min 56s
Wall time: 48.5 s
Models supporting NaN
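`HistGradientBoostingClassifier`, LightGBM, and XGBoost all treat `np.nan` as a missing value natively, which is why `dropna=False` is fine here. A minimal check on toy arrays (my own illustration, not the competition data):

```python
# Toy check: HistGradientBoostingClassifier fits and predicts with NaNs present.
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(100, 3))
X_toy[::7, 1] = np.nan                      # inject missing values into one feature
y_toy = (X_toy[:, 0] > 0).astype(int)
clf = HistGradientBoostingClassifier(max_iter=10).fit(X_toy, y_toy)
print(clf.predict_proba(X_toy).shape)       # (100, 2)
```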
df = preprocess(pd.read_csv(trn_path),train=True, dropna=False)
X, y = df.drop('Status', axis=1).iloc[:,1:], df['Status']

%%time
HB_cv = cross_val_score(make_pipeline(ct, HistGradientBoostingClassifier(**{'l2_regularization': 8.876168706639714,
'early_stopping': False,
'learning_rate': 0.009956485590638034,
'max_iter': 500,
'max_depth': 16,
'max_bins': 255,
'min_samples_leaf': 16,
'max_leaf_nodes': 18,
'random_state': 3})),
X, y, scoring = 'neg_log_loss', cv = 10, n_jobs = -1)
print(f"histGB {-HB_cv.mean()=}")
/home/xy/miniforge3/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [0, 2, 3, 4, 6] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
histGB -HB_cv.mean()=0.43771002014457927
CPU times: user 162 ms, sys: 118 ms, total: 281 ms
Wall time: 15.5 s
%%time
LGBM_cv = cross_val_score(make_pipeline(ct,LGBMClassifier(**{'n_estimators': 1000,
'learning_rate': 0.013657589160895923,
'max_depth': 17,
'reg_alpha': 1.9791969860931342,
'reg_lambda': 1.2857088172765347,
'num_leaves': 37,
'subsample': 0.6351453342675659,
'colsample_bytree': 0.2644509924064132})),
X, y, scoring = 'neg_log_loss', cv = 10, n_jobs = -1)

print(f"Light GBM {-LGBM_cv.mean()=}")
Light GBM -LGBM_cv.mean()=0.42275781396747264
%%time
XGB_cv = cross_val_score(make_pipeline(ct, XGBClassifier(**{'max_depth': 7,
'learning_rate': 0.03570188608151033,
'n_estimators': 1000,
'gamma': 0.6440001307764849,
'min_child_weight': 2,
'colsample_bytree': 0.27034458854562116,
'subsample': 0.8435412915999765})),
X, y, scoring = 'neg_log_loss', cv = 10, n_jobs = -1)

print(f"XGBoost {-XGB_cv.mean()=}")
XGBoost -XGB_cv.mean()=0.42872410511564896
def cv(X, y, cv=10):
    clf = LGBMClassifier(**{'n_estimators': 1000,
                            'learning_rate': 0.013657589160895923,
                            'max_depth': 17,
                            'reg_alpha': 1.9791969860931342,
                            'reg_lambda': 1.2857088172765347,
                            'num_leaves': 37,
                            'subsample': 0.6351453342675659,
                            'colsample_bytree': 0.2644509924064132})
    ct = make_column_transformer(
        (PowerTransformer(), make_column_selector(dtype_include=np.number)),
        (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        remainder='passthrough')
    model = make_pipeline(ct, clf)
    return cross_validate(model, X, y, cv=cv, scoring='neg_log_loss', return_estimator=True, n_jobs=-1)

%%time
cv_output = cv(X, y, cv=10)

print(f"{-cv_output['test_score'].mean()=}, {cv_output['test_score'].std()=}")
-cv_output['test_score'].mean()=0.4206909352553819, cv_output['test_score'].std()=0.04075158479629894
Submitting to Kaggle
ss = pd.read_csv(path/'sample_submission.csv')
ss.head()

|   | id | Status_C | Status_CL | Status_D |
|---|---|---|---|---|
| 0 | 7905 | 0.628084 | 0.034788 | 0.337128 |
| 1 | 7906 | 0.628084 | 0.034788 | 0.337128 |
| 2 | 7907 | 0.628084 | 0.034788 | 0.337128 |
| 3 | 7908 | 0.628084 | 0.034788 | 0.337128 |
| 4 | 7909 | 0.628084 | 0.034788 | 0.337128 |
tst = preprocess(pd.read_csv(path/'test.csv'), train=False)
tst.head()

|   | id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | is_gen |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7905 | 3839 | D-penicillamine | 19724 | F | N | Y | N | N | 1.2 | 546.0 | 3.37 | 65.0 | 1636.0 | 151.90 | 90.0 | 430.0 | 10.6 | 2.0 | Y |
| 1 | 7906 | 2468 | D-penicillamine | 14975 | F | N | N | N | N | 1.1 | 660.0 | 4.22 | 94.0 | 1257.0 | 151.90 | 155.0 | 227.0 | 10.0 | 2.0 | Y |
| 2 | 7907 | 51 | Placebo | 13149 | F | N | Y | N | Y | 2.0 | 151.0 | 2.96 | 46.0 | 961.0 | 69.75 | 101.0 | 213.0 | 13.0 | 4.0 | Y |
| 3 | 7908 | 2330 | D-penicillamine | 20510 | F | N | N | N | N | 0.6 | 293.0 | 3.85 | 40.0 | 554.0 | 125.55 | 56.0 | 270.0 | 10.6 | 2.0 | Y |
| 4 | 7909 | 1615 | D-penicillamine | 21904 | F | N | Y | N | N | 1.4 | 277.0 | 2.97 | 121.0 | 1110.0 | 125.00 | 126.0 | 221.0 | 9.8 | 1.0 | Y |
tst_pred = np.stack([est.predict_proba(tst.iloc[:,1:]) for est in cv_output['estimator']]).mean(0)
ss.iloc[:,1:] = tst_pred
ss.to_csv('subm.csv', index=False)

!head subm.csv
id,Status_C,Status_CL,Status_D
7905,0.3034480206600728,0.02175049687406757,0.6748014824658597
7906,0.464722990035046,0.17105995489987008,0.3642170550650838
7907,0.034054616093133115,0.011479074721858778,0.954466309185008
7908,0.9778662946803056,0.002733559845527006,0.01940014547416722
7909,0.8730251010963693,0.042703149687327954,0.08427174921630272
7910,0.9909153131787145,0.0011266786376778267,0.007958008183607803
7911,0.9843376622366685,0.0014776242979965683,0.014184713465334847
7912,0.0945863204842192,0.026772389955302976,0.8786412895604778
7913,0.009370330198415863,0.0019239529424342871,0.98870571685915
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('subm.csv', 'lgbm 10fold avg', comp)

Conclusion
The hyperparameters for each model should be found beforehand with, e.g., Optuna; here we copy them from this excellent notebook. Note, however, that the ensemble method adopted here is less sophisticated than that notebook's: the purpose here is to reach a reasonable position faster.
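If you want to search for them yourself, a minimal Optuna sketch for the LGBM pipeline could look like the following. The search ranges are illustrative assumptions (not the ones used in the referenced notebook), and it reuses `ct`, `X`, `y`, `make_pipeline`, `LGBMClassifier`, and `cross_val_score` from above.

```python
# Hypothetical Optuna search for the LGBM pipeline; ranges are illustrative only.
import optuna

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }
    model = make_pipeline(ct, LGBMClassifier(**params))
    scores = cross_val_score(model, X, y, scoring='neg_log_loss', cv=5, n_jobs=-1)
    return -scores.mean()                      # mean log loss, to be minimized

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)
```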
Apart from more careful ensembling (such as a weighted average), it would be useful to exploit or create more predictive features; some domain knowledge might come in handy.
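As a sketch of what a simple weighted blend could look like: `hgb_model`, `lgbm_model`, and `xgb_model` below are hypothetical names for pipelines already fitted on `(X, y)`, and the weights are placeholders rather than tuned values (in practice they would be optimized on out-of-fold predictions).

```python
# Hypothetical weighted blend of test-set probabilities; weights are placeholders.
models = {'hgb': hgb_model, 'lgbm': lgbm_model, 'xgb': xgb_model}  # assumed already fitted
weights = {'hgb': 0.3, 'lgbm': 0.4, 'xgb': 0.3}                    # should sum to 1

blend = sum(weights[name] * m.predict_proba(tst.iloc[:, 1:]) for name, m in models.items())
ss.iloc[:, 1:] = blend                                             # same submission format as above
```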
A different direction is to replace the tree-based models with neural nets. That is worth another post, and hopefully I will come back to it soon.