1. Fraud detection

Applied project on credit-card fraud detection using gradient boosting with CatBoost.

[1]:
#!pip install -q -U catboost
[17]:
import os
import pickle
import tempfile
from datetime import datetime

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split

import optuna
import catboost as catb
import lightgbm as lgb

%matplotlib inline
[3]:
# Data source:
# https://www.kaggle.com/mlg-ulb/creditcardfraud?select=creditcard.csv
# The CSV is expected to sit next to the notebook.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)
[4]:
# Quick look at the first rows of the raw frame.
df.head(n=5)
[4]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

[5]:
# Every column except the last one is a feature; the last column is the label.
features_frame = df.iloc[:, :-1]
label_series = df.iloc[:, -1]
x_full = features_frame.to_numpy()
y_full = label_series.to_numpy()

# Two successive stratified 90/10 splits with a fixed seed: first hold out
# the test set, then carve the validation set out of what remains.
split_options = dict(test_size=0.1, random_state=0)
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x_full, y_full, stratify=y_full, **split_options)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, stratify=y_train_val, **split_options)

# Positive-class rate and the shapes of the three partitions.
y_full.mean(), x_train.shape, x_val.shape, x_test.shape
[5]:
(0.001727485630620034, (230693, 30), (25633, 30), (28481, 30))
[6]:
# Create the Optuna study and the scratch directory only if they do not exist
# yet in this kernel, so that re-running the cell keeps the trials and the
# pickled estimators accumulated so far.
if 'study' not in globals():
    study = optuna.create_study(direction="maximize")

if 'tempdir' not in globals():
    # mkdtemp() creates the directory and leaves its removal to the caller.
    # The previous `TemporaryDirectory().name` + `os.mkdir` combination relied
    # on the temporary object being finalised (and its directory deleted)
    # before the mkdir call ran.
    tempdir = tempfile.mkdtemp()

print(tempdir)
[I 2021-05-22 18:38:00,297] A new study created in memory with name: no-name-8e99faa3-d344-4da3-bbb9-ad2a54e7443e
/tmp/pytmpfiles/tmpifnbk7lw
[7]:
def metrics_calculator(estimator, x_true, y_true, trial=None):
    """Score a fitted binary classifier on one dataset.

    Parameters
    ----------
    estimator : object exposing ``predict`` and ``predict_proba``; column 1 of
        ``predict_proba`` is used as the positive-class score.
    x_true : features passed unchanged to the estimator.
    y_true : array of 0/1 labels (compared element-wise against 0 and 1).
    trial : optional Optuna trial; when given, every metric is also stored on
        it as a user attribute named ``score_<metric>``.

    Returns
    -------
    dict with keys ``ks``, ``auc``, ``nll``, ``f1``, ``mcc`` and ``acc``.
    ``nll`` is the log loss with its sign flipped, so for every entry a
    larger value is better.
    """
    positive_scores = estimator.predict_proba(x_true)[:, 1]
    hard_labels = estimator.predict(x_true)

    # Two-sample Kolmogorov-Smirnov statistic between the score
    # distributions of the negative and the positive class.
    ks_result = stats.ks_2samp(
        positive_scores[y_true == 0], positive_scores[y_true == 1])

    res = {
        'ks': ks_result.statistic,
        'auc': metrics.roc_auc_score(y_true, positive_scores),
        'nll': -metrics.log_loss(y_true, positive_scores),
        'f1': metrics.f1_score(y_true, hard_labels),
        'mcc': metrics.matthews_corrcoef(y_true, hard_labels),
        'acc': metrics.accuracy_score(y_true, hard_labels),
    }

    if trial is not None:
        for metric_name, metric_value in res.items():
            trial.set_user_attr('score_' + metric_name, metric_value)

    return res
[8]:
# Because the classes are heavily imbalanced, we consider re-weighting the
# samples during training. Weighting distorts the predicted probabilities,
# so once the estimation procedure is finished we must transform them back
# in order to report accurate probabilities (scores).

class WeightCorrectedEstimator():
    """Wrap a fitted classifier and undo the effect of class weighting.

    Parameters
    ----------
    estimator : fitted binary classifier exposing ``predict`` and
        ``predict_proba``.
    bweight : weight that was applied to the other class relative to a weight
        of 1 for the positive class; it is the ``w`` in the correction below.
    """

    def __init__(self, estimator, bweight):
        self.estimator = estimator
        self.bweight = bweight

    def predict(self, x):
        """Return the hard labels of the wrapped estimator, unchanged."""
        return self.estimator.predict(x)

    def predict_proba(self, x):
        """Return weight-corrected class probabilities as an (n, 2) array.

        With ``p`` the wrapped estimator's positive-class probability and
        ``w = bweight``, the corrected probability is
        ``w*p / (w*p + 1 - p)``. This is algebraically the same as
        ``w / (1/p + w - 1)`` but does not divide by ``p``, so ``p == 0``
        maps cleanly to 0 instead of triggering a division by zero.
        The only remaining singular case is ``w == 0`` together with
        ``p == 1``.
        """
        uncorr_est = self.estimator.predict_proba(x)[:, 1]
        weighted = self.bweight * uncorr_est
        corr_est = weighted / (weighted + 1 - uncorr_est)
        # Column 0: corrected probability of class 0; column 1: of class 1.
        return np.column_stack([1 - corr_est, corr_est])
[16]:
#%%capture cap_out --no-stderr

def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: fit one weighted CatBoost model and score it.

    The trial chooses the CatBoost boosting type and ``catb_bweight``, the
    sample weight given to every label-0 row (label-1 rows keep weight 1).
    The fitted model is wrapped in ``WeightCorrectedEstimator``, pickled to
    ``<tempdir>/<trial number>.pkl`` and evaluated on the validation split.

    Returns the validation KS statistic; all other metrics are stored on the
    trial as user attributes by ``metrics_calculator``.
    """
    # Keep the suggest calls in this order: boosting type first, then weight.
    boosting_type = trial.suggest_categorical('catb_boosting_type', ['Plain', 'Ordered'])
    bweight = trial.suggest_float('catb_bweight', 0, 1)

    def _weighted_pool(features, labels):
        # Weight 1 for label-1 rows, `bweight` for every other row.
        sample_weight = np.where(labels.astype(bool), 1.0, bweight)
        return catb.Pool(data=features, label=labels, weight=sample_weight)

    booster = catb.CatBoostClassifier(
        boosting_type=boosting_type,
        verbose=200,
        iterations=10_000,
        early_stopping_rounds=50,
        task_type='GPU',
    )
    booster.fit(
        _weighted_pool(x_train, y_train),
        eval_set=_weighted_pool(x_val, y_val),
    )
    estimator = WeightCorrectedEstimator(booster, bweight)

    # Persist the wrapped model so later cells can reload it by trial number.
    pickle_path = os.path.join(tempdir, f"{trial.number}.pkl")
    with open(pickle_path, "wb") as f:
        pickle.dump(estimator, f)

    scores = metrics_calculator(estimator, x_val, y_val, trial)
    return scores['ks']

# Run the search; it stops at whichever limit is reached first
# (number of trials or the timeout in seconds).
started_at = datetime.now()
print("Start:", started_at)
study.optimize(objective, n_trials=10_000, timeout=600)
finished_at = datetime.now()
print("End:", finished_at)
Learning rate set to 0.017285
0:      learn: 0.6228928        test: 0.6228999 best: 0.6228999 (0)     total: 59.4ms   remaining: 9m 53s
200:    learn: 0.0023858        test: 0.0021901 best: 0.0021901 (200)   total: 11.2s    remaining: 9m 6s
400:    learn: 0.0019080        test: 0.0019723 best: 0.0019723 (400)   total: 22.1s    remaining: 8m 49s
600:    learn: 0.0015829        test: 0.0018501 best: 0.0018495 (596)   total: 33.1s    remaining: 8m 37s
800:    learn: 0.0013379        test: 0.0017781 best: 0.0017769 (799)   total: 44s      remaining: 8m 25s
1000:   learn: 0.0011061        test: 0.0017301 best: 0.0017288 (994)   total: 55s      remaining: 8m 14s
bestTest = 0.001710501748
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:51:14,729] Trial 17 finished with value: 0.941202540864505 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9993694725280967}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228878        test: 0.6228932 best: 0.6228932 (0)     total: 55.2ms   remaining: 9m 12s
200:    learn: 0.0023451        test: 0.0021805 best: 0.0021805 (200)   total: 11.2s    remaining: 9m 4s
400:    learn: 0.0018875        test: 0.0019927 best: 0.0019927 (400)   total: 23.5s    remaining: 9m 21s
600:    learn: 0.0015533        test: 0.0018661 best: 0.0018661 (600)   total: 35.3s    remaining: 9m 11s
800:    learn: 0.0012913        test: 0.0017888 best: 0.0017887 (799)   total: 46.4s    remaining: 8m 52s
1000:   learn: 0.0010753        test: 0.0017535 best: 0.0017518 (997)   total: 57.4s    remaining: 8m 36s
1200:   learn: 0.0009129        test: 0.0017279 best: 0.0017277 (1199)  total: 1m 8s    remaining: 8m 22s
bestTest = 0.001718565496
bestIteration = 1317
Shrink model to first 1318 iterations.
[I 2021-05-22 18:52:35,264] Trial 18 finished with value: 0.9454621836797772 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9999853058625567}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228371        test: 0.6228408 best: 0.6228408 (0)     total: 55.8ms   remaining: 9m 18s
200:    learn: 0.0023713        test: 0.0020376 best: 0.0020376 (199)   total: 11.2s    remaining: 9m 5s
400:    learn: 0.0018830        test: 0.0018403 best: 0.0018403 (400)   total: 22.5s    remaining: 8m 57s
600:    learn: 0.0015765        test: 0.0017616 best: 0.0017616 (600)   total: 33.5s    remaining: 8m 43s
800:    learn: 0.0013411        test: 0.0017227 best: 0.0017225 (799)   total: 44.7s    remaining: 8m 32s
bestTest = 0.001722098109
bestIteration = 813
Shrink model to first 814 iterations.
[I 2021-05-22 18:53:26,053] Trial 19 finished with value: 0.9539814693103216 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990000028393905}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228373        test: 0.6228410 best: 0.6228410 (0)     total: 60.9ms   remaining: 10m 9s
200:    learn: 0.0023713        test: 0.0020376 best: 0.0020376 (199)   total: 11.2s    remaining: 9m 5s
400:    learn: 0.0018830        test: 0.0018403 best: 0.0018403 (400)   total: 22.6s    remaining: 9m
600:    learn: 0.0015765        test: 0.0017616 best: 0.0017616 (600)   total: 33.8s    remaining: 8m 48s
800:    learn: 0.0013411        test: 0.0017227 best: 0.0017225 (799)   total: 45.1s    remaining: 8m 37s
bestTest = 0.00172210412
bestIteration = 813
Shrink model to first 814 iterations.
[I 2021-05-22 18:54:17,542] Trial 20 finished with value: 0.9539423900184384 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990002723464493}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228916        test: 0.6228965 best: 0.6228965 (0)     total: 56.6ms   remaining: 9m 25s
200:    learn: 0.0023881        test: 0.0021826 best: 0.0021826 (200)   total: 11.3s    remaining: 9m 9s
400:    learn: 0.0019131        test: 0.0019705 best: 0.0019705 (400)   total: 22.5s    remaining: 8m 59s
600:    learn: 0.0015765        test: 0.0018517 best: 0.0018502 (595)   total: 33.8s    remaining: 8m 49s
800:    learn: 0.0012975        test: 0.0017557 best: 0.0017545 (799)   total: 45.1s    remaining: 8m 38s
1000:   learn: 0.0010747        test: 0.0017282 best: 0.0017198 (968)   total: 56.5s    remaining: 8m 28s
bestTest = 0.001719783816
bestIteration = 968
Shrink model to first 969 iterations.
[I 2021-05-22 18:55:17,900] Trial 21 finished with value: 0.9486666856142022 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9998380893132888}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6229319        test: 0.6229351 best: 0.6229351 (0)     total: 59.4ms   remaining: 9m 54s
200:    learn: 0.0023877        test: 0.0021815 best: 0.0021815 (200)   total: 11.3s    remaining: 9m 9s
400:    learn: 0.0019112        test: 0.0019716 best: 0.0019716 (400)   total: 22.7s    remaining: 9m 3s
600:    learn: 0.0015863        test: 0.0018484 best: 0.0018473 (597)   total: 34s      remaining: 8m 51s
800:    learn: 0.0013164        test: 0.0017749 best: 0.0017727 (796)   total: 45.3s    remaining: 8m 39s
1000:   learn: 0.0011009        test: 0.0017407 best: 0.0017383 (990)   total: 56.7s    remaining: 8m 30s
bestTest = 0.001738261951
bestIteration = 990
Shrink model to first 991 iterations.
[I 2021-05-22 18:56:20,061] Trial 22 finished with value: 0.9388577833515112 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9990000440678655}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228337        test: 0.6228369 best: 0.6228369 (0)     total: 55.4ms   remaining: 9m 13s
200:    learn: 0.0023892        test: 0.0021895 best: 0.0021895 (200)   total: 11.7s    remaining: 9m 31s
400:    learn: 0.0019166        test: 0.0019677 best: 0.0019677 (400)   total: 23.6s    remaining: 9m 25s
600:    learn: 0.0015916        test: 0.0018534 best: 0.0018530 (596)   total: 35.1s    remaining: 9m 9s
800:    learn: 0.0013303        test: 0.0017685 best: 0.0017673 (799)   total: 46.5s    remaining: 8m 54s
1000:   learn: 0.0010999        test: 0.0017447 best: 0.0017398 (981)   total: 58.7s    remaining: 8m 47s
bestTest = 0.001728632351
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:57:30,204] Trial 23 finished with value: 0.9391872928353447 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9995188173139586}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6229277        test: 0.6229315 best: 0.6229315 (0)     total: 59.1ms   remaining: 9m 50s
200:    learn: 0.0023872        test: 0.0021788 best: 0.0021788 (200)   total: 12.6s    remaining: 10m 15s
400:    learn: 0.0019042        test: 0.0019664 best: 0.0019658 (398)   total: 24.2s    remaining: 9m 38s
600:    learn: 0.0015533        test: 0.0018472 best: 0.0018460 (597)   total: 35.7s    remaining: 9m 18s
800:    learn: 0.0013063        test: 0.0017855 best: 0.0017838 (796)   total: 47s      remaining: 9m
1000:   learn: 0.0010967        test: 0.0017475 best: 0.0017475 (1000)  total: 58.6s    remaining: 8m 46s
bestTest = 0.001736398806
bestIteration = 1094
Shrink model to first 1095 iterations.
[I 2021-05-22 18:58:40,057] Trial 24 finished with value: 0.9312373214342811 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.999446455594389}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6229326        test: 0.6229343 best: 0.6229343 (0)     total: 55.5ms   remaining: 9m 14s
200:    learn: 0.0023842        test: 0.0021815 best: 0.0021815 (200)   total: 11.7s    remaining: 9m 30s
400:    learn: 0.0019071        test: 0.0019662 best: 0.0019656 (398)   total: 24s      remaining: 9m 33s
600:    learn: 0.0015918        test: 0.0018581 best: 0.0018550 (586)   total: 35.5s    remaining: 9m 14s
800:    learn: 0.0013173        test: 0.0017839 best: 0.0017827 (796)   total: 47.3s    remaining: 9m 2s
1000:   learn: 0.0011076        test: 0.0017433 best: 0.0017431 (998)   total: 58.8s    remaining: 8m 48s
bestTest = 0.00173214668
bestIteration = 1043
Shrink model to first 1044 iterations.
[I 2021-05-22 18:59:46,903] Trial 25 finished with value: 0.9560526717801329 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9991048454072937}. Best is trial 10 with value: 0.9682454108477009.
Learning rate set to 0.017285
0:      learn: 0.6228903        test: 0.6228947 best: 0.6228947 (0)     total: 61.2ms   remaining: 10m 11s
200:    learn: 0.0023636        test: 0.0020337 best: 0.0020336 (199)   total: 11.5s    remaining: 9m 21s
400:    learn: 0.0018801        test: 0.0018338 best: 0.0018338 (400)   total: 23.5s    remaining: 9m 22s
600:    learn: 0.0015772        test: 0.0017578 best: 0.0017568 (596)   total: 35.3s    remaining: 9m 12s
800:    learn: 0.0013488        test: 0.0017098 best: 0.0017098 (800)   total: 47.8s    remaining: 9m 9s
1000:   learn: 0.0011515        test: 0.0016841 best: 0.0016831 (970)   total: 1m       remaining: 9m 2s
1200:   learn: 0.0009921        test: 0.0016697 best: 0.0016684 (1186)  total: 1m 12s   remaining: 8m 52s
bestTest = 0.001666608369
bestIteration = 1229
Shrink model to first 1230 iterations.
[I 2021-05-22 19:01:07,712] Trial 26 finished with value: 0.9537860728509054 and parameters: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.9997897264426976}. Best is trial 10 with value: 0.9682454108477009.
[20]:
# Size of the study so far and the parameters of its best trial.
n_trials_so_far = len(study.trials)
print(f"Number of finished trials: {n_trials_so_far}")
print("Best trial:", study.best_params)
Number of finished trials: 27
Best trial: {'catb_boosting_type': 'Ordered', 'catb_bweight': 0.04695449109346117}
[21]:
# Keep only completed trials and order them from best to worst objective
# value (a missing value sorts last).
trials = [t for t in study.trials if t.state.name == 'COMPLETE']


def _objective_value(completed_trial):
    value = completed_trial.value
    return -np.inf if value is None else value


ranked_trials = sorted(trials, key=_objective_value, reverse=True)

# One row per trial: its number, the stored `score_*` attributes and the
# sampled parameters.
summary_rows = [
    {'trial_number': t.number, **t.user_attrs, **t.params}
    for t in ranked_trials
]
trials_summary = pd.DataFrame(summary_rows)
trials_summary.iloc[:10]
[21]:
trial_number score_ks score_auc score_nll score_f1 score_mcc score_acc catb_boosting_type catb_bweight
0 10 0.968245 0.996877 -0.002194 0.835165 0.835328 0.999415 Ordered 0.046954
1 16 0.962032 0.996644 -0.001831 0.883721 0.883765 0.999610 Ordered 0.147900
2 14 0.960859 0.996933 -0.001865 0.883721 0.883765 0.999610 Ordered 0.142914
3 11 0.958202 0.996081 -0.002488 0.737864 0.745325 0.998947 Ordered 0.017351
4 0 0.957303 0.996713 -0.001759 0.915663 0.917198 0.999727 Ordered 0.324844
5 13 0.957303 0.994597 -0.002544 0.678261 0.697148 0.998557 Ordered 0.012003
6 7 0.956873 0.996136 -0.001814 0.915663 0.917198 0.999727 Plain 0.266458
7 15 0.956404 0.996748 -0.001815 0.894118 0.894501 0.999649 Ordered 0.163119
8 6 0.956092 0.994742 -0.001706 0.915663 0.917198 0.999727 Ordered 0.789061
9 25 0.956053 0.995162 -0.001731 0.915663 0.917198 0.999727 Ordered 0.999105
[22]:
# Next question: how much do the wrong predictions of each estimator cost the
# decision maker? We fix the cost of a false positive at 100, vary the cost of
# a false negative, and report for each estimator the average cost per
# validation sample at the best (cost-minimising) decision threshold.
cost_fp = 100
# Column vector so that it broadcasts against the per-threshold error rates.
cost_fn = np.array([1, 10, 100, 1000, 10000]).reshape(-1, 1)

# Class counts on the validation set. Note that `len(y_val == 1)` would be the
# length of the boolean mask, i.e. the total number of samples, so the True
# entries are counted instead.
n_positives = int((y_val == 1).sum())
n_negatives = int((y_val == 0).sum())
n_total = len(y_val)

costs_arr = []
for trial_id in trials_summary['trial_number']:
    # These pickles were written by `objective` above; never unpickle files
    # coming from an untrusted source.
    with open(f"{os.path.join(tempdir, str(trial_id))}.pkl", "rb") as f:
        estimator = pickle.load(f)
    fpr, fnr, _ = metrics.det_curve(y_val, estimator.predict_proba(x_val)[:, 1])

    # Shape (n_cost_levels, n_thresholds): expected number of false positives
    # and false negatives, each multiplied by its unit cost, per sample.
    costs = (fpr * cost_fp * n_negatives + fnr * cost_fn * n_positives) / n_total
    # Best achievable cost for each false-negative cost level.
    costs_arr.append(costs.min(axis=1))

costs_arr = np.vstack(costs_arr)
trials_costs = pd.DataFrame(
    costs_arr,
    columns=[f'cfn_{c}' for c in cost_fn.ravel()],
    index=trials_summary.index,
)
# Drop cost columns left by a previous run of this cell so that re-running it
# does not create duplicates, then attach the costs of every trial.
trials_summary = pd.concat(
    [trials_summary.drop(columns=trials_costs.columns, errors='ignore'), trials_costs],
    axis=1,
)
trials_summary.iloc[:10]
[22]:
trial_number score_ks score_auc score_nll score_f1 score_mcc score_acc catb_boosting_type catb_bweight cfn_1 cfn_10 cfn_100 cfn_1000 cfn_10000
0 10 0.968245 0.996877 -0.002194 0.835165 0.835328 0.999415 Ordered 0.046954 0.159811 1.124027 3.175459 11.157138 11.157138
1 16 0.962032 0.996644 -0.001831 0.883721 0.883765 0.999610 Ordered 0.147900 0.140272 0.987249 3.796820 11.118059 11.118059
2 14 0.960859 0.996933 -0.001865 0.883721 0.883765 0.999610 Ordered 0.142914 0.140272 1.034145 3.914058 8.742038 8.742038
3 11 0.958202 0.996081 -0.002488 0.737864 0.745325 0.998947 Ordered 0.017351 0.159811 1.202186 4.179797 12.946969 12.946969
4 0 0.957303 0.996713 -0.001759 0.915663 0.917198 0.999727 Ordered 0.324844 0.140272 0.963802 4.269679 9.910508 9.910508
5 13 0.957303 0.994597 -0.002544 0.678261 0.697148 0.998557 Ordered 0.012003 0.159811 1.108395 4.269679 19.453671 19.453671
6 7 0.956873 0.996136 -0.001814 0.915663 0.917198 0.999727 Plain 0.266458 0.140272 1.014605 4.312666 12.759389 12.759389
7 15 0.956404 0.996748 -0.001815 0.894118 0.894501 0.999649 Ordered 0.163119 0.140272 1.010697 4.359561 10.176248 10.176248
8 6 0.956092 0.994742 -0.001706 0.915663 0.917198 0.999727 Ordered 0.789061 0.140272 0.991157 4.390825 19.738950 19.738950
9 25 0.956053 0.995162 -0.001731 0.915663 0.917198 0.999727 Ordered 0.999105 0.140272 0.971618 4.394733 16.339052 16.339052
[23]:
# Now let's get the rank position of each estimator for each score/cost
def func(x):
    """Turn one summary column into rank positions (1 = best).

    Columns whose name starts with ``score`` are ranked with the largest
    value first; columns whose name starts with ``cfn`` are ranked with the
    smallest value first. Any other column is returned untouched.

    Ties share the lowest rank of their group and missing values are ranked
    last. ``Series.rank`` is used rather than ``np.argsort``: argsort returns
    the indices that would sort the column, which is not the rank of each row.
    """
    if x.name[:5] == 'score':
        return x.rank(ascending=False, method='min', na_option='bottom').astype(int)
    if x.name[:3] == 'cfn':
        return x.rank(ascending=True, method='min', na_option='bottom').astype(int)
    return x
# Apply `func` column by column and show the first ten rows.
trials_summary.agg(func, axis=0).head(10)
[23]:
trial_number score_ks score_auc score_nll score_f1 score_mcc score_acc catb_boosting_type catb_bweight cfn_1 cfn_10 cfn_100 cfn_1000 cfn_10000
0 10 1 3 14 14 14 14 Ordered 0.046954 2 5 1 3 3
1 16 2 1 9 25 25 25 Ordered 0.147900 3 10 2 5 5
2 14 3 8 23 23 23 23 Ordered 0.142914 5 2 3 8 8
3 11 4 5 21 22 22 22 Ordered 0.017351 7 9 4 2 2
4 0 5 2 18 21 21 21 Ordered 0.324844 8 8 5 1 1
5 13 6 11 12 19 19 19 Ordered 0.012003 9 7 6 7 7
6 7 7 7 13 18 18 18 Plain 0.266458 10 3 7 4 4
7 15 8 4 25 17 17 17 Ordered 0.163119 1 6 8 10 10
8 6 9 10 10 16 16 16 Ordered 0.789061 4 1 9 6 6
9 25 10 23 27 15 15 15 Ordered 0.999105 6 4 10 9 9
[24]:
# Reload the pickled estimator of the study's best trial.
chosen_model_trial_number = study.best_trial.number
chosen_model_path = os.path.join(tempdir, f"{chosen_model_trial_number}.pkl")
with open(chosen_model_path, "rb") as f:
    chosen_model = pickle.load(f)
[25]:
# Scores (not costs) of the chosen model on the held-out test set

metrics_calculator(chosen_model, x_test, y_test)
[25]:
{'ks': 0.8578369586438965,
 'auc': 0.978799398206103,
 'nll': -0.003521800900719857,
 'f1': 0.7722772277227722,
 'mcc': 0.7722150256321112,
 'acc': 0.9991924440855307}