1. Fraud detection
Applied project on credit card fraud detection using gradient boosting with CatBoost
[1]:
#!pip install -q -U catboost
[17]:
import os
import pickle
import tempfile
from datetime import datetime
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
import optuna
import catboost as catb
import lightgbm as lgb
%matplotlib inline
[3]:
# Load the dataset into a DataFrame; the file is read from a path relative
# to the current working directory.
# Source: https://www.kaggle.com/mlg-ulb/creditcardfraud?select=creditcard.csv
df = pd.read_csv('creditcard.csv')
[4]:
# Preview the first rows of the loaded frame.
df.head()
[4]:
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
[5]:
# Every column except the last one is a feature; the last column is the label.
x_full, y_full = df.iloc[:, :-1].to_numpy(), df.iloc[:, -1].to_numpy()

# Two successive stratified 90/10 splits with a fixed seed: the first holds
# out the test set, the second carves the validation set out of the remainder.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x_full,
    y_full,
    test_size=0.1,
    random_state=0,
    stratify=y_full,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.1,
    random_state=0,
    stratify=y_train_val,
)

# Mean of the label column followed by the shape of each feature split.
y_full.mean(), x_train.shape, x_val.shape, x_test.shape
[5]:
(0.001727485630620034, (230693, 30), (25633, 30), (28481, 30))
[6]:
# Create the Optuna study and the scratch directory only once per kernel
# session, so that re-running this cell keeps the trials accumulated so far
# and the estimators already pickled to disk.
if 'study' not in globals():
    study = optuna.create_study(direction="maximize")

if 'tempdir' not in globals():
    # mkdtemp() creates the directory and leaves its removal to the caller.
    # The previous `TemporaryDirectory().name` + `os.mkdir` pattern relied on
    # the TemporaryDirectory object being garbage-collected (which deletes the
    # directory) before os.mkdir ran; otherwise os.mkdir raises FileExistsError.
    tempdir = tempfile.mkdtemp()
print(tempdir)
[I 2021-05-22 18:38:00,297] A new study created in memory with name: no-name-8e99faa3-d344-4da3-bbb9-ad2a54e7443e
/tmp/pytmpfiles/tmpifnbk7lw
[7]:
def metrics_calculator(estimator, x_true, y_true, trial=None):
    """Score a fitted binary classifier on one data split.

    Parameters
    ----------
    estimator
        Object exposing ``predict_proba(x)`` (column 1 is used as the score
        of the positive class) and ``predict(x)`` (hard labels).
    x_true
        Feature matrix handed unchanged to the estimator.
    y_true
        Array-like of 0/1 labels, one per row of ``x_true``.
    trial
        Optional Optuna trial. When given, every score is also stored on it
        as a user attribute named ``'score_' + <metric name>``.

    Returns
    -------
    dict
        ``ks``  - two-sample Kolmogorov-Smirnov statistic between the scores
                  of the negative rows and the scores of the positive rows;
        ``auc`` - ROC AUC of the scores;
        ``nll`` - log loss with its sign flipped, so that larger is better
                  like every other entry;
        ``f1``, ``mcc``, ``acc`` - F1, Matthews correlation and accuracy of
                  the hard predictions.
    """
    # Boolean masks such as `y_true == 0` only work element-wise on arrays;
    # coerce so that lists and pandas Series are handled as well.
    y_true = np.asarray(y_true)

    y_ppred = estimator.predict_proba(x_true)[:, 1]
    y_bpred = estimator.predict(x_true)

    res = dict(
        ks=stats.ks_2samp(y_ppred[y_true == 0], y_ppred[y_true == 1]).statistic,
        auc=metrics.roc_auc_score(y_true, y_ppred),
        nll=-metrics.log_loss(y_true, y_ppred),
        f1=metrics.f1_score(y_true, y_bpred),
        mcc=metrics.matthews_corrcoef(y_true, y_bpred),
        acc=metrics.accuracy_score(y_true, y_bpred),
    )

    if trial is not None:
        # Plain loop: the calls are made for their side effect only.
        for name, value in res.items():
            trial.set_user_attr('score_' + name, value)
    return res
[8]:
# The classes are heavily imbalanced, so the negative rows are down-weighted
# during training. A model fitted on re-weighted data outputs probabilities
# for the re-weighted population, so they have to be mapped back to the
# original class balance before being used as probabilities or scores.
class WeightCorrectedEstimator():
    """Wrap a classifier trained with negatives weighted by ``bweight``.

    With positives at weight 1 and negatives at weight ``bweight``, the odds
    predicted by the wrapped model are inflated by ``1 / bweight``. If ``p``
    is the wrapped model's positive-class probability, the corrected one is

        bweight * p / (1 - p + bweight * p)

    which is algebraically the same as ``bweight / (1/p + bweight - 1)`` but
    does not divide by ``p``.

    Parameters
    ----------
    estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    bweight
        Weight that was given to the negative rows during training. The
        correction is undefined (0/0) only when ``bweight == 0`` and the
        wrapped model outputs a probability of exactly 1.
    """

    def __init__(self, estimator, bweight):
        self.estimator = estimator
        self.bweight = bweight

    def predict(self, x):
        """Return the wrapped estimator's hard labels, unchanged.

        The labels are those of the weighted model; they are not obtained by
        thresholding the corrected probabilities.
        """
        return self.estimator.predict(x)

    def predict_proba(self, x):
        """Return an ``(n, 2)`` array ``[1 - p_corr, p_corr]`` of corrected probabilities."""
        uncorr_est = self.estimator.predict_proba(x)[:, 1]
        # Multiply through by p instead of computing 1/p, so that a predicted
        # probability of exactly 0 maps to 0 without a division by zero.
        weighted_pos = self.bweight * uncorr_est
        corr_est = weighted_pos / (1 - uncorr_est + weighted_pos)
        return np.column_stack([1 - corr_est, corr_est])
[16]:
#%%capture cap_out --no-stderr
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: fit one weighted CatBoost model, return its validation KS.

    The trial picks the CatBoost boosting type and ``bweight``, the sample
    weight given to the negative rows (positive rows keep weight 1). The
    fitted model is wrapped in ``WeightCorrectedEstimator``, pickled to
    ``<tempdir>/<trial number>.pkl`` and scored on the validation split; all
    scores are also stored on the trial by ``metrics_calculator``.

    Reads the module-level ``x_train``, ``y_train``, ``x_val``, ``y_val`` and
    ``tempdir``.
    """
    param = {
        'boosting_type': trial.suggest_categorical('catb_boosting_type', ['Plain', 'Ordered']),
        'verbose': 200,
        'iterations': 10_000,
        'early_stopping_rounds': 50,
        'task_type': 'GPU',
    }
    bweight = trial.suggest_float('catb_bweight', 0, 1)

    # Vectorised sample weights: 1 for non-zero (positive) labels, bweight
    # for the rest, instead of a Python-level loop over every row.
    train_weight = np.where(y_train != 0, 1.0, bweight)
    val_weight = np.where(y_val != 0, 1.0, bweight)
    train_pool = catb.Pool(data=x_train, label=y_train, weight=train_weight)
    val_pool = catb.Pool(data=x_val, label=y_val, weight=val_weight)

    estimator = catb.CatBoostClassifier(**param)
    estimator.fit(train_pool, eval_set=val_pool)
    estimator = WeightCorrectedEstimator(estimator, bweight)

    # Persist the model so that later cells can reload it by trial number.
    with open(f"{os.path.join(tempdir, str(trial.number))}.pkl", "wb") as f:
        pickle.dump(estimator, f)

    return metrics_calculator(estimator, x_val, y_val, trial)['ks']
# Run the hyper-parameter search, printing wall-clock timestamps around it.
# The search stops at whichever limit is reached first.
print("Start:", datetime.now())
study.optimize(
    func=objective,
    n_trials=10_000,
    timeout=600,
)
print("End:", datetime.now())
Learning rate set to 0.017285
0: learn: 0.6228928 test: 0.6228999 best: 0.6228999 (0) total: 59.4ms remaining: 9m 53s
200: learn: 0.0023858 test: 0.0021901 best: 0.0021901 (200) total: 11.2s remaining: 9m 6s
400: learn: 0.0019080 test: 0.0019723 best: 0.0019723 (400) total: 22.1s remaining: 8m 49s
600: learn: 0.0015829 test: 0.0018501 best: 0.0018495 (596) total: 33.1s remaining: 8m 37s
800: learn: 0.0013379 test: 0.0017781 best: 0.0017769 (799) total: 44s remaining: 8m 25s
1000: learn: 0.0011061 test: 0.0017301 best: 0.0017288 (994) total: 55s remaining: 8m 14s
bestTest = 0.001710501748
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:51:14,729] Trial 17 finished with value: 0.941202540864505 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9993694725280967}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228878 test: 0.6228932 best: 0.6228932 (0) total: 55.2ms remaining: 9m 12s
200: learn: 0.0023451 test: 0.0021805 best: 0.0021805 (200) total: 11.2s remaining: 9m 4s
400: learn: 0.0018875 test: 0.0019927 best: 0.0019927 (400) total: 23.5s remaining: 9m 21s
600: learn: 0.0015533 test: 0.0018661 best: 0.0018661 (600) total: 35.3s remaining: 9m 11s
800: learn: 0.0012913 test: 0.0017888 best: 0.0017887 (799) total: 46.4s remaining: 8m 52s
1000: learn: 0.0010753 test: 0.0017535 best: 0.0017518 (997) total: 57.4s remaining: 8m 36s
1200: learn: 0.0009129 test: 0.0017279 best: 0.0017277 (1199) total: 1m 8s remaining: 8m 22s
bestTest = 0.001718565496
bestIteration = 1317
Shrink model to first 1318 iterations.
[I 2021-05-22 18:52:35,264] Trial 18 finished with value: 0.9454621836797772 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9999853058625567}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228371 test: 0.6228408 best: 0.6228408 (0) total: 55.8ms remaining: 9m 18s
200: learn: 0.0023713 test: 0.0020376 best: 0.0020376 (199) total: 11.2s remaining: 9m 5s
400: learn: 0.0018830 test: 0.0018403 best: 0.0018403 (400) total: 22.5s remaining: 8m 57s
600: learn: 0.0015765 test: 0.0017616 best: 0.0017616 (600) total: 33.5s remaining: 8m 43s
800: learn: 0.0013411 test: 0.0017227 best: 0.0017225 (799) total: 44.7s remaining: 8m 32s
bestTest = 0.001722098109
bestIteration = 813
Shrink model to first 814 iterations.
[I 2021-05-22 18:53:26,053] Trial 19 finished with value: 0.9539814693103216 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990000028393905}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228373 test: 0.6228410 best: 0.6228410 (0) total: 60.9ms remaining: 10m 9s
200: learn: 0.0023713 test: 0.0020376 best: 0.0020376 (199) total: 11.2s remaining: 9m 5s
400: learn: 0.0018830 test: 0.0018403 best: 0.0018403 (400) total: 22.6s remaining: 9m
600: learn: 0.0015765 test: 0.0017616 best: 0.0017616 (600) total: 33.8s remaining: 8m 48s
800: learn: 0.0013411 test: 0.0017227 best: 0.0017225 (799) total: 45.1s remaining: 8m 37s
bestTest = 0.00172210412
bestIteration = 813
Shrink model to first 814 iterations.
[I 2021-05-22 18:54:17,542] Trial 20 finished with value: 0.9539423900184384 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990002723464493}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228916 test: 0.6228965 best: 0.6228965 (0) total: 56.6ms remaining: 9m 25s
200: learn: 0.0023881 test: 0.0021826 best: 0.0021826 (200) total: 11.3s remaining: 9m 9s
400: learn: 0.0019131 test: 0.0019705 best: 0.0019705 (400) total: 22.5s remaining: 8m 59s
600: learn: 0.0015765 test: 0.0018517 best: 0.0018502 (595) total: 33.8s remaining: 8m 49s
800: learn: 0.0012975 test: 0.0017557 best: 0.0017545 (799) total: 45.1s remaining: 8m 38s
1000: learn: 0.0010747 test: 0.0017282 best: 0.0017198 (968) total: 56.5s remaining: 8m 28s
bestTest = 0.001719783816
bestIteration = 968
Shrink model to first 969 iterations.
[I 2021-05-22 18:55:17,900] Trial 21 finished with value: 0.9486666856142022 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9998380893132888}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6229319 test: 0.6229351 best: 0.6229351 (0) total: 59.4ms remaining: 9m 54s
200: learn: 0.0023877 test: 0.0021815 best: 0.0021815 (200) total: 11.3s remaining: 9m 9s
400: learn: 0.0019112 test: 0.0019716 best: 0.0019716 (400) total: 22.7s remaining: 9m 3s
600: learn: 0.0015863 test: 0.0018484 best: 0.0018473 (597) total: 34s remaining: 8m 51s
800: learn: 0.0013164 test: 0.0017749 best: 0.0017727 (796) total: 45.3s remaining: 8m 39s
1000: learn: 0.0011009 test: 0.0017407 best: 0.0017383 (990) total: 56.7s remaining: 8m 30s
bestTest = 0.001738261951
bestIteration = 990
Shrink model to first 991 iterations.
[I 2021-05-22 18:56:20,061] Trial 22 finished with value: 0.9388577833515112 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990000440678655}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228337 test: 0.6228369 best: 0.6228369 (0) total: 55.4ms remaining: 9m 13s
200: learn: 0.0023892 test: 0.0021895 best: 0.0021895 (200) total: 11.7s remaining: 9m 31s
400: learn: 0.0019166 test: 0.0019677 best: 0.0019677 (400) total: 23.6s remaining: 9m 25s
600: learn: 0.0015916 test: 0.0018534 best: 0.0018530 (596) total: 35.1s remaining: 9m 9s
800: learn: 0.0013303 test: 0.0017685 best: 0.0017673 (799) total: 46.5s remaining: 8m 54s
1000: learn: 0.0010999 test: 0.0017447 best: 0.0017398 (981) total: 58.7s remaining: 8m 47s
bestTest = 0.001728632351
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:57:30,204] Trial 23 finished with value: 0.9391872928353447 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9995188173139586}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6229277 test: 0.6229315 best: 0.6229315 (0) total: 59.1ms remaining: 9m 50s
200: learn: 0.0023872 test: 0.0021788 best: 0.0021788 (200) total: 12.6s remaining: 10m 15s
400: learn: 0.0019042 test: 0.0019664 best: 0.0019658 (398) total: 24.2s remaining: 9m 38s
600: learn: 0.0015533 test: 0.0018472 best: 0.0018460 (597) total: 35.7s remaining: 9m 18s
800: learn: 0.0013063 test: 0.0017855 best: 0.0017838 (796) total: 47s remaining: 9m
1000: learn: 0.0010967 test: 0.0017475 best: 0.0017475 (1000) total: 58.6s remaining: 8m 46s
bestTest = 0.001736398806
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:58:40,057] Trial 24 finished with value: 0.9312373214342811 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.999446455594389}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6229326 test: 0.6229343 best: 0.6229343 (0) total: 55.5ms remaining: 9m 14s
200: learn: 0.0023842 test: 0.0021815 best: 0.0021815 (200) total: 11.7s remaining: 9m 30s
400: learn: 0.0019071 test: 0.0019662 best: 0.0019656 (398) total: 24s remaining: 9m 33s
600: learn: 0.0015918 test: 0.0018581 best: 0.0018550 (586) total: 35.5s remaining: 9m 14s
800: learn: 0.0013173 test: 0.0017839 best: 0.0017827 (796) total: 47.3s remaining: 9m 2s
1000: learn: 0.0011076 test: 0.0017433 best: 0.0017431 (998) total: 58.8s remaining: 8m 48s
bestTest = 0.00173214668
bestIteration = 1043
Shrink model to first 1044 iterations.
[I 2021-05-22 18:59:46,903] Trial 25 finished with value: 0.9560526717801329 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9991048454072937}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0: learn: 0.6228903 test: 0.6228947 best: 0.6228947 (0) total: 61.2ms remaining: 10m 11s
200: learn: 0.0023636 test: 0.0020337 best: 0.0020336 (199) total: 11.5s remaining: 9m 21s
400: learn: 0.0018801 test: 0.0018338 best: 0.0018338 (400) total: 23.5s remaining: 9m 22s
600: learn: 0.0015772 test: 0.0017578 best: 0.0017568 (596) total: 35.3s remaining: 9m 12s
800: learn: 0.0013488 test: 0.0017098 best: 0.0017098 (800) total: 47.8s remaining: 9m 9s
1000: learn: 0.0011515 test: 0.0016841 best: 0.0016831 (970) total: 1m remaining: 9m 2s
1200: learn: 0.0009921 test: 0.0016697 best: 0.0016684 (1186) total: 1m 12s remaining: 8m 52s
bestTest = 0.001666608369
bestIteration = 1229
Shrink model to first 1230 iterations.
[I 2021-05-22 19:01:07,712] Trial 26 finished with value: 0.9537860728509054 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9997897264426976}. Best is trial 10 with value: 0.9682454108477009.
[20]:
# Quick summary of the search: how many trials the study holds (whatever
# their state) and the parameters of the best one.
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:", study.best_params)
Number of finished trials: 27
Best trial: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.04695449109346117}
[21]:
# Keep only the trials whose state is COMPLETE.
trials = list(filter(lambda t: t.state.name == 'COMPLETE', study.trials))

# One row per trial, ordered by objective value (highest first; a trial
# without a value sorts last). Each row holds the trial number, the scores
# stored as user attributes and the sampled parameters.
trials_summary = pd.DataFrame([
    dict(trial_number=trial.number, **trial.user_attrs, **trial.params)
    for trial in sorted(
        trials,
        key=lambda tr: -np.inf if tr.value is None else tr.value,
        reverse=True,
    )
])
trials_summary.head(10)
[21]:
trial_number | score_ks | score_auc | score_nll | score_f1 | score_mcc | score_acc | catb_boosting_type | catb_bweight | |
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 0.968245 | 0.996877 | -0.002194 | 0.835165 | 0.835328 | 0.999415 | Ordered | 0.046954 |
1 | 16 | 0.962032 | 0.996644 | -0.001831 | 0.883721 | 0.883765 | 0.999610 | Ordered | 0.147900 |
2 | 14 | 0.960859 | 0.996933 | -0.001865 | 0.883721 | 0.883765 | 0.999610 | Ordered | 0.142914 |
3 | 11 | 0.958202 | 0.996081 | -0.002488 | 0.737864 | 0.745325 | 0.998947 | Ordered | 0.017351 |
4 | 0 | 0.957303 | 0.996713 | -0.001759 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.324844 |
5 | 13 | 0.957303 | 0.994597 | -0.002544 | 0.678261 | 0.697148 | 0.998557 | Ordered | 0.012003 |
6 | 7 | 0.956873 | 0.996136 | -0.001814 | 0.915663 | 0.917198 | 0.999727 | Plain | 0.266458 |
7 | 15 | 0.956404 | 0.996748 | -0.001815 | 0.894118 | 0.894501 | 0.999649 | Ordered | 0.163119 |
8 | 6 | 0.956092 | 0.994742 | -0.001706 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.789061 |
9 | 25 | 0.956053 | 0.995162 | -0.001731 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.999105 |
[22]:
# Next question: how much do the wrong predictions of each estimator cost the
# decision maker? The cost of a false positive is fixed at 100 and the cost
# of a false negative is varied; for each estimator we report the average
# cost per validation row at the decision threshold that minimises it.
cost_fp = 100
cost_fn = np.array([1, 10, 100, 1000, 10000]).reshape(-1, 1)

# Class counts on the validation split do not depend on the estimator, so
# they are computed once. np.sum counts the True entries of the mask; len()
# of the mask would return the total number of rows for both classes.
n_positives = np.sum(y_val == 1)
n_negatives = np.sum(y_val == 0)
n_total = len(y_val)

costs_arr = []
for trial_id in trials_summary['trial_number']:
    # These pickles were written by `objective` in this notebook; do not
    # point this loader at files from an untrusted source.
    with open(f"{os.path.join(tempdir, str(trial_id))}.pkl", "rb") as f:
        estimator = pickle.load(f)

    fpr, fnr, _ = metrics.det_curve(y_val, estimator.predict_proba(x_val)[:, 1])
    # cost_fn is a column vector, so this broadcasts to one row per
    # false-negative cost and one column per threshold.
    costs = (fpr * cost_fp * n_negatives + fnr * cost_fn * n_positives) / n_total
    # Best achievable cost for each false-negative cost.
    costs_arr.append(costs.min(axis=1))
costs_arr = np.vstack(costs_arr)

trials_costs = pd.DataFrame(
    costs_arr,
    columns=[f'cfn_{c}' for c in cost_fn.ravel()],
    index=trials_summary.index,
)
# Attach the costs of every trial (not only the first ten). Dropping cost
# columns left by a previous run keeps the cell re-runnable.
trials_summary = pd.concat(
    [trials_summary.drop(columns=trials_costs.columns, errors='ignore'), trials_costs],
    axis=1,
)
trials_summary.iloc[:10]
[22]:
trial_number | score_ks | score_auc | score_nll | score_f1 | score_mcc | score_acc | catb_boosting_type | catb_bweight | cfn_1 | cfn_10 | cfn_100 | cfn_1000 | cfn_10000 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 0.968245 | 0.996877 | -0.002194 | 0.835165 | 0.835328 | 0.999415 | Ordered | 0.046954 | 0.159811 | 1.124027 | 3.175459 | 11.157138 | 11.157138 |
1 | 16 | 0.962032 | 0.996644 | -0.001831 | 0.883721 | 0.883765 | 0.999610 | Ordered | 0.147900 | 0.140272 | 0.987249 | 3.796820 | 11.118059 | 11.118059 |
2 | 14 | 0.960859 | 0.996933 | -0.001865 | 0.883721 | 0.883765 | 0.999610 | Ordered | 0.142914 | 0.140272 | 1.034145 | 3.914058 | 8.742038 | 8.742038 |
3 | 11 | 0.958202 | 0.996081 | -0.002488 | 0.737864 | 0.745325 | 0.998947 | Ordered | 0.017351 | 0.159811 | 1.202186 | 4.179797 | 12.946969 | 12.946969 |
4 | 0 | 0.957303 | 0.996713 | -0.001759 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.324844 | 0.140272 | 0.963802 | 4.269679 | 9.910508 | 9.910508 |
5 | 13 | 0.957303 | 0.994597 | -0.002544 | 0.678261 | 0.697148 | 0.998557 | Ordered | 0.012003 | 0.159811 | 1.108395 | 4.269679 | 19.453671 | 19.453671 |
6 | 7 | 0.956873 | 0.996136 | -0.001814 | 0.915663 | 0.917198 | 0.999727 | Plain | 0.266458 | 0.140272 | 1.014605 | 4.312666 | 12.759389 | 12.759389 |
7 | 15 | 0.956404 | 0.996748 | -0.001815 | 0.894118 | 0.894501 | 0.999649 | Ordered | 0.163119 | 0.140272 | 1.010697 | 4.359561 | 10.176248 | 10.176248 |
8 | 6 | 0.956092 | 0.994742 | -0.001706 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.789061 | 0.140272 | 0.991157 | 4.390825 | 19.738950 | 19.738950 |
9 | 25 | 0.956053 | 0.995162 | -0.001731 | 0.915663 | 0.917198 | 0.999727 | Ordered | 0.999105 | 0.140272 | 0.971618 | 4.394733 | 16.339052 | 16.339052 |
[23]:
# Now let's get the position (rank, 1 = best) of each estimator for each score/cost
def func(x):
    """Replace a score or cost column by the rank of each row (1 = best).

    Columns whose name starts with ``'score'`` are ranked in descending order
    (higher is better), columns whose name starts with ``'cfn'`` in ascending
    order (lower is better); any other column is returned unchanged.

    ``Series.rank`` is used rather than ``np.argsort``: argsort returns the
    permutation that sorts the column, which only coincides with the rank of
    each row when the column is already sorted. Ties are broken by row order
    and missing values are ranked last, so the result is always integer.
    """
    name = str(x.name)
    if name.startswith('score'):
        return x.rank(method='first', ascending=False, na_option='bottom').astype(int)
    if name.startswith('cfn'):
        return x.rank(method='first', ascending=True, na_option='bottom').astype(int)
    return x
# Apply the column-wise transformation and show the first ten rows.
trials_summary.agg(func, axis=0).head(10)
[23]:
trial_number | score_ks | score_auc | score_nll | score_f1 | score_mcc | score_acc | catb_boosting_type | catb_bweight | cfn_1 | cfn_10 | cfn_100 | cfn_1000 | cfn_10000 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 1 | 3 | 14 | 14 | 14 | 14 | Ordered | 0.046954 | 2 | 5 | 1 | 3 | 3 |
1 | 16 | 2 | 1 | 9 | 25 | 25 | 25 | Ordered | 0.147900 | 3 | 10 | 2 | 5 | 5 |
2 | 14 | 3 | 8 | 23 | 23 | 23 | 23 | Ordered | 0.142914 | 5 | 2 | 3 | 8 | 8 |
3 | 11 | 4 | 5 | 21 | 22 | 22 | 22 | Ordered | 0.017351 | 7 | 9 | 4 | 2 | 2 |
4 | 0 | 5 | 2 | 18 | 21 | 21 | 21 | Ordered | 0.324844 | 8 | 8 | 5 | 1 | 1 |
5 | 13 | 6 | 11 | 12 | 19 | 19 | 19 | Ordered | 0.012003 | 9 | 7 | 6 | 7 | 7 |
6 | 7 | 7 | 7 | 13 | 18 | 18 | 18 | Plain | 0.266458 | 10 | 3 | 7 | 4 | 4 |
7 | 15 | 8 | 4 | 25 | 17 | 17 | 17 | Ordered | 0.163119 | 1 | 6 | 8 | 10 | 10 |
8 | 6 | 9 | 10 | 10 | 16 | 16 | 16 | Ordered | 0.789061 | 4 | 1 | 9 | 6 | 6 |
9 | 25 | 10 | 23 | 27 | 15 | 15 | 15 | Ordered | 0.999105 | 6 | 4 | 10 | 9 | 9 |
[24]:
# Reload the pickled estimator of the trial with the best objective value.
chosen_model_trial_number = study.best_trial.number
chosen_model_path = os.path.join(tempdir, f"{chosen_model_trial_number}.pkl")
with open(chosen_model_path, "rb") as model_file:
    chosen_model = pickle.load(model_file)
[25]:
# Scores of the selected model on the held-out test split.
metrics_calculator(estimator=chosen_model, x_true=x_test, y_true=y_test)
[25]:
{'ks': 0.8578369586438965,
'auc': 0.978799398206103,
'nll': -0.003521800900719857,
'f1': 0.7722772277227722,
'mcc': 0.7722150256321112,
'acc': 0.9991924440855307}