```python
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *
```
The playground series is devoted to tabular datasets and offers the most accessible competitions for beginners to learn and develop skills. This blog/notebook showcases a streamlined approach to achieving relatively good performance. It was written for Kaggle playground series season 3 episode 26 (the last one in 2023). At the time of writing, the submission ranked 149/871 (top 18%); the final ranking on the private leaderboard was 332/1663 (top 20%).
Outline:

- use the `fastkaggle` module to quickly set up the competition (download and unzip the data) and to submit predictions later
- preprocess the data: add the additional dataset offered by the competition owner
- modelling: compute the CV score of
  - base models: logistic regression, random forest (which do not support `np.nan`)
  - gradient boosting: hist gradient boosting, LightGBM, XGBoost
- cross-validate the best model (LightGBM), keep the fitted classifiers, and average their predictions on the test data
Getting set up
```python
comp = 'playground-series-s3e26'
path = setup_comp(comp, install='')
```
Downloading playground-series-s3e26.zip to /home/xy/git/1principle/posts/gist
100%|██████████████████████████████████████████████████████| 350k/350k [00:00<00:00, 1.56MB/s]
```python
path
```
Path('playground-series-s3e26')
```python
import pandas as pd

trn_path = path/'train.csv'
trn = pd.read_csv(trn_path)
trn.info()
```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 7905 non-null int64
1 N_Days 7905 non-null int64
2 Drug 7905 non-null object
3 Age 7905 non-null int64
4 Sex 7905 non-null object
5 Ascites 7905 non-null object
6 Hepatomegaly 7905 non-null object
7 Spiders 7905 non-null object
8 Edema 7905 non-null object
9 Bilirubin 7905 non-null float64
10 Cholesterol 7905 non-null float64
11 Albumin 7905 non-null float64
12 Copper 7905 non-null float64
13 Alk_Phos 7905 non-null float64
14 SGOT 7905 non-null float64
15 Tryglicerides 7905 non-null float64
16 Platelets 7905 non-null float64
17 Prothrombin 7905 non-null float64
18 Stage 7905 non-null float64
19 Status 7905 non-null object
dtypes: float64(10), int64(3), object(7)
memory usage: 1.2+ MB
Preprocessing data
```python
get_dataset(path, 'joebeachcapital/cirrhosis-patient-survival-prediction', force=True)  # filename = cirrhosis.csv
```
```python
def preprocess(df, train=True, dropna=False):
    df_ = df.copy()
    df_['is_gen'] = 'Y'
    if train:
        df1 = pd.read_csv(path/'cirrhosis.csv')  # original data from which the competition dataset was synthesized
        df1 = pd.concat([df1.drop('Status', axis=1), df1['Status']], axis=1)  # move Status to the last column, same as df_
        df1['is_gen'] = 'N'
        df1.columns = df_.columns
        df_ = pd.concat([df_, df1], axis=0).reset_index(drop=True)
        if dropna: df_ = df_.dropna()
        df_['Status'] = df_.Status.map({'C': 0, 'CL': 1, 'D': 2})
    return df_
```
Modelling
```python
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, LabelEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, classification_report, log_loss
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')
```
models not supporting nan
```python
df = preprocess(pd.read_csv(trn_path), train=True, dropna=True)
X, y = df.drop('Status', axis=1).iloc[:, 1:], df['Status']
```
```python
ct = make_column_transformer(
    (PowerTransformer(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough')
```
```python
%%time
logit_cv = cross_val_score(
    make_pipeline(ct, LogisticRegression(max_iter=1000)),
    X, y, scoring='neg_log_loss', cv=10, n_jobs=-1)
print(f'logistic regression {-logit_cv.mean()=}')
```
/home/xy/miniforge3/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [6] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
logistic regression -logit_cv.mean()=0.5110572273752632
CPU times: user 92.2 ms, sys: 20.3 ms, total: 113 ms
Wall time: 1.05 s
```python
%%time
RF_cv = cross_val_score(make_pipeline(ct, RandomForestClassifier(**{'n_estimators': 1000,
                                                                    'criterion': 'log_loss',
                                                                    'max_depth': 14,
                                                                    'min_samples_split': 3,
                                                                    'min_samples_leaf': 1,
                                                                    'max_features': 4,
                                                                    'random_state': 1,
                                                                    'n_jobs': -1})),
                        X, y, scoring='neg_log_loss', cv=10, n_jobs=-1)
print(f"random forest {-RF_cv.mean()=}")
```
random forest -RF_cv.mean()=0.44559989137605854
CPU times: user 2min 23s, sys: 33.3 s, total: 2min 56s
Wall time: 48.5 s
models supporting nan
```python
df = preprocess(pd.read_csv(trn_path), train=True, dropna=False)
X, y = df.drop('Status', axis=1).iloc[:, 1:], df['Status']
```
```python
%%time
HB_cv = cross_val_score(make_pipeline(ct, HistGradientBoostingClassifier(**{'l2_regularization': 8.876168706639714,
                                                                            'early_stopping': False,
                                                                            'learning_rate': 0.009956485590638034,
                                                                            'max_iter': 500,
                                                                            'max_depth': 16,
                                                                            'max_bins': 255,
                                                                            'min_samples_leaf': 16,
                                                                            'max_leaf_nodes': 18,
                                                                            'random_state': 3})),
                        X, y, scoring='neg_log_loss', cv=10, n_jobs=-1)
print(f"histGB {-HB_cv.mean()=}")
```
/home/xy/miniforge3/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [0, 2, 3, 4, 6] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
histGB -HB_cv.mean()=0.43771002014457927
CPU times: user 162 ms, sys: 118 ms, total: 281 ms
Wall time: 15.5 s
```python
%%time
LGBM_cv = cross_val_score(make_pipeline(ct, LGBMClassifier(**{'n_estimators': 1000,
                                                              'learning_rate': 0.013657589160895923,
                                                              'max_depth': 17,
                                                              'reg_alpha': 1.9791969860931342,
                                                              'reg_lambda': 1.2857088172765347,
                                                              'num_leaves': 37,
                                                              'subsample': 0.6351453342675659,
                                                              'colsample_bytree': 0.2644509924064132})),
                          X, y, scoring='neg_log_loss', cv=10, n_jobs=-1)
print(f"Light GBM {-LGBM_cv.mean()=}")
```
Light GBM -LGBM_cv.mean()=0.42275781396747264
```python
%%time
XGB_cv = cross_val_score(make_pipeline(ct, XGBClassifier(**{'max_depth': 7,
                                                             'learning_rate': 0.03570188608151033,
                                                             'n_estimators': 1000,
                                                             'gamma': 0.6440001307764849,
                                                             'min_child_weight': 2,
                                                             'colsample_bytree': 0.27034458854562116,
                                                             'subsample': 0.8435412915999765})),
                         X, y, scoring='neg_log_loss', cv=10, n_jobs=-1)
print(f"XGBoost {-XGB_cv.mean()=}")
```
XGBoost -XGB_cv.mean()=0.42872410511564896
```python
def cv(X, y, cv=10):
    clf = LGBMClassifier(**{'n_estimators': 1000,
                            'learning_rate': 0.013657589160895923,
                            'max_depth': 17,
                            'reg_alpha': 1.9791969860931342,
                            'reg_lambda': 1.2857088172765347,
                            'num_leaves': 37,
                            'subsample': 0.6351453342675659,
                            'colsample_bytree': 0.2644509924064132})
    ct = make_column_transformer(
        (PowerTransformer(), make_column_selector(dtype_include=np.number)),
        (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), make_column_selector(dtype_include=object)),
        remainder='passthrough')
    model = make_pipeline(ct, clf)
    return cross_validate(model, X, y, cv=cv, scoring='neg_log_loss', return_estimator=True, n_jobs=-1)
```
```python
%%time
cv_output = cv(X, y, cv=10)
print(f"{-cv_output['test_score'].mean()=}, {cv_output['test_score'].std()=}")
```
-cv_output['test_score'].mean()=0.4206909352553819, cv_output['test_score'].std()=0.04075158479629894
Submitting to Kaggle
```python
ss = pd.read_csv(path/'sample_submission.csv')
ss.head()
```
|   | id   | Status_C | Status_CL | Status_D |
|---|------|----------|-----------|----------|
| 0 | 7905 | 0.628084 | 0.034788  | 0.337128 |
| 1 | 7906 | 0.628084 | 0.034788  | 0.337128 |
| 2 | 7907 | 0.628084 | 0.034788  | 0.337128 |
| 3 | 7908 | 0.628084 | 0.034788  | 0.337128 |
| 4 | 7909 | 0.628084 | 0.034788  | 0.337128 |
```python
tst = preprocess(pd.read_csv(path/'test.csv'), train=False)
tst.head()
```
|   | id | N_Days | Drug | Age | Sex | Ascites | Hepatomegaly | Spiders | Edema | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | is_gen |
|---|----|--------|------|-----|-----|---------|--------------|---------|-------|-----------|-------------|---------|--------|----------|------|---------------|-----------|-------------|-------|--------|
| 0 | 7905 | 3839 | D-penicillamine | 19724 | F | N | Y | N | N | 1.2 | 546.0 | 3.37 | 65.0 | 1636.0 | 151.90 | 90.0 | 430.0 | 10.6 | 2.0 | Y |
| 1 | 7906 | 2468 | D-penicillamine | 14975 | F | N | N | N | N | 1.1 | 660.0 | 4.22 | 94.0 | 1257.0 | 151.90 | 155.0 | 227.0 | 10.0 | 2.0 | Y |
| 2 | 7907 | 51 | Placebo | 13149 | F | N | Y | N | Y | 2.0 | 151.0 | 2.96 | 46.0 | 961.0 | 69.75 | 101.0 | 213.0 | 13.0 | 4.0 | Y |
| 3 | 7908 | 2330 | D-penicillamine | 20510 | F | N | N | N | N | 0.6 | 293.0 | 3.85 | 40.0 | 554.0 | 125.55 | 56.0 | 270.0 | 10.6 | 2.0 | Y |
| 4 | 7909 | 1615 | D-penicillamine | 21904 | F | N | Y | N | N | 1.4 | 277.0 | 2.97 | 121.0 | 1110.0 | 125.00 | 126.0 | 221.0 | 9.8 | 1.0 | Y |
```python
tst_pred = np.stack([est.predict_proba(tst.iloc[:, 1:]) for est in cv_output['estimator']]).mean(0)
ss.iloc[:, 1:] = tst_pred
ss.to_csv('subm.csv', index=False)
!head subm.csv
```
id,Status_C,Status_CL,Status_D
7905,0.3034480206600728,0.02175049687406757,0.6748014824658597
7906,0.464722990035046,0.17105995489987008,0.3642170550650838
7907,0.034054616093133115,0.011479074721858778,0.954466309185008
7908,0.9778662946803056,0.002733559845527006,0.01940014547416722
7909,0.8730251010963693,0.042703149687327954,0.08427174921630272
7910,0.9909153131787145,0.0011266786376778267,0.007958008183607803
7911,0.9843376622366685,0.0014776242979965683,0.014184713465334847
7912,0.0945863204842192,0.026772389955302976,0.8786412895604778
7913,0.009370330198415863,0.0019239529424342871,0.98870571685915
```python
if not iskaggle:
    from kaggle import api
    api.competition_submit_cli('subm.csv', 'lgbm 10fold avg', comp)
```
Conclusion
The hyperparameters for each model should be found beforehand with, e.g., optuna. Here we copy them from this excellent notebook. Note, however, that the ensemble method adopted here is less sophisticated than that notebook's; the purpose here is to reach a reasonable leaderboard position faster.
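For reference, a minimal sketch of such a search with Optuna might look like the following. The search ranges are illustrative assumptions, not the ones used to obtain the values above, and it reuses the `ct`, `X`, `y` defined earlier in this notebook.

```python
# hedged sketch: tune the LGBM classifier with Optuna, minimizing CV log loss
# (ranges below are hypothetical; reuses ct, X, y from the cells above)
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

def objective(trial):
    params = {
        'n_estimators': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'num_leaves': trial.suggest_int('num_leaves', 16, 64),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }
    model = make_pipeline(ct, LGBMClassifier(**params))
    # negate neg_log_loss so Optuna minimizes the actual log loss
    return -cross_val_score(model, X, y, scoring='neg_log_loss', cv=5, n_jobs=-1).mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)
```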
Apart from a more careful ensemble (such as a weighted average, sketched below), it would be useful to exploit or create more predictive features; some domain knowledge might come in handy.
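As a rough illustration, a weighted blend of fitted models could be computed as in the sketch below. The helper `weighted_predict_proba` and the weights are hypothetical, and in practice the weights would be tuned on out-of-fold predictions rather than set by hand.

```python
# hedged sketch: blend class probabilities from several fitted pipelines with fixed weights
import numpy as np

def weighted_predict_proba(fitted_models, weights, X_test):
    # probs has shape (n_models, n_samples, n_classes)
    probs = np.stack([m.predict_proba(X_test) for m in fitted_models])
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()                        # normalize so the blend stays a probability distribution
    return np.tensordot(w, probs, axes=1)  # weighted sum over the model axis

# hypothetical usage: weight the LightGBM pipeline more heavily than the XGBoost one
# blended = weighted_predict_proba([lgbm_pipe, xgb_pipe], [0.7, 0.3], tst.iloc[:, 1:])
```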
A different direction is to replace tree-based models by neural nets. This is worth another post and hopefully I will come back to it soon.