Zero-Adjusted Gamma Regression
Imports
In [13]:
from xgboostlss.model import *
from xgboostlss.distributions.ZAGamma import *

# Explicit imports for names used below (np, xgb and torch are also pulled in
# by the wildcard imports above, but are stated here for clarity).
import numpy as np
import xgboost as xgb
import torch

from sklearn.model_selection import train_test_split
import pandas as pd
import multiprocessing

import plotnine
from plotnine import *
plotnine.options.figure_size = (18, 9)

n_cpu = multiprocessing.cpu_count()
Data
In [2]:
# The simulation example closely follows https://towardsdatascience.com/zero-inflated-regression-c7dfc656d8af
np.random.seed(123)
n_samples = 1000
data = pd.DataFrame({"age": np.random.randint(1, 100, size=n_samples)})
data["income"] = np.where((data.age > 17) & (data.age < 70), 1500*data.age + 5000 + 10000*np.random.randn(n_samples), 0) / 1000
y = data["income"].values.reshape(-1,1)
X = data.drop(columns="income")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=n_cpu)
dtest = xgb.DMatrix(X_test, nthread=n_cpu)
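By construction, incomes are exactly zero for ages outside 18–69, so the response has a large point mass at zero. A quick sanity check of the zero share in each split (plain numpy on the arrays created above):

# Fraction of exact zeros in the responses; the ZAGamma gate parameter
# should learn to reproduce this zero mass conditionally on age.
zero_share_train = (y_train == 0).mean()
zero_share_test = (y_test == 0).mean()
print(f"Zero share train: {zero_share_train:.3f}, test: {zero_share_test:.3f}")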
Distribution Selection
In [3]:
# Specifies Zero-Adjusted Gamma distribution. See ?ZAGamma for an overview.
xgblss = XGBoostLSS(
    ZAGamma(stabilization="None",  # Options are "None", "MAD", "L2".
            response_fn="exp",     # Function to transform the concentration and rate parameters, e.g., "exp" or "softplus".
            loss_fn="nll"          # Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
            )
)
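Conceptually, the zero-adjusted Gamma is a two-part (hurdle) model: a gate parameter gives the probability of an exact zero, and a Gamma(concentration, rate) density describes the strictly positive part. A minimal hand-rolled sampler illustrating this mechanism (plain torch.distributions; not how XGBoostLSS implements it internally):

import torch

def sample_zagamma(gate, concentration, rate, n):
    # Zero with probability `gate`, otherwise a Gamma(concentration, rate) draw.
    is_zero = torch.bernoulli(torch.full((n,), gate)).bool()
    positive = torch.distributions.Gamma(concentration, rate).sample((n,))
    return torch.where(is_zero, torch.zeros(n), positive)

# E.g., 30% zeros; the positive part has mean concentration/rate = 50.
draws = sample_zagamma(gate=0.3, concentration=25.0, rate=0.5, n=10_000)
print((draws == 0).float().mean().item(), draws[draws > 0].mean().item())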
Hyper-Parameter Optimization
Any XGBoost hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:
- Float/Int sample_type
- {"param_name": ["sample_type", {"low": low, "high": high, "log": log}]}
- sample_type: str, Type of sampling, e.g., "float" or "int"
- low: int/float, Lower endpoint of the range of suggested values
- high: int/float, Upper endpoint of the range of suggested values
- log: bool, Flag to sample the value from the log domain or not
- Example: {"eta": ["float", {"low": 1e-5, "high": 1, "log": True}]}
- Categorical sample_type
- {"param_name": ["categorical", ["choice1", "choice2", "choice3", "..."]]}
- sample_type: str, must be "categorical"
- choice1, choice2, choice3, ...: str, Possible choices for the parameter
- Example: {"booster": ["categorical", ["gbtree", "dart"]]}
- For parameters without a tunable choice (needed, e.g., when tree_method = "gpu_hist" and gpu_id has to be fixed)
- {"param_name": ["none", [value]]}
- param_name: str, Name of the parameter
- value: int, Value of the parameter
- Example: {"gpu_id": ["none", [0]]}
Depending on which parameters are optimized, it might happen that some of them are not used, e.g., when {"booster": ["categorical", ["gbtree", "gblinear"]]} and {"max_depth": ["int", {"low": 1, "high": 10, "log": False}]} are specified, max_depth is not used when gblinear is sampled, since gblinear has no such argument.
In [4]:
param_dict = {
"eta": ["float", {"low": 1e-5, "high": 1, "log": True}],
"max_depth": ["int", {"low": 1, "high": 10, "log": False}],
"gamma": ["float", {"low": 1e-8, "high": 40, "log": True}],
"subsample": ["float", {"low": 0.2, "high": 1.0, "log": False}],
"colsample_bytree": ["float", {"low": 0.2, "high": 1.0, "log": False}],
"booster": ["categorical", ["gbtree"]],
# "tree_method": ["categorical", ["auto", "approx", "hist", "gpu_hist"]],
# "gpu_id": ["none", [0]]
}
np.random.seed(123)
opt_param = xgblss.hyper_opt(param_dict,
                             dtrain,
                             num_boost_round=100,       # Number of boosting iterations.
                             nfold=5,                   # Number of cv-folds.
                             early_stopping_rounds=20,  # Number of early-stopping rounds.
                             max_minutes=5,             # Time budget in minutes, i.e., stop study after the given number of minutes.
                             n_trials=None,             # Number of trials. If set to None, there is no limit on the number of trials.
                             silence=False,             # Controls the verbosity of the trials, i.e., user can silence the trial outputs.
                             seed=123,                  # Seed used to generate cv-folds.
                             hp_seed=None               # Seed for random number generator used in the Bayesian hyperparameter search.
                             )
[I 2023-06-14 11:12:54,804] A new study created in memory with name: XGBoostLSS Hyper-Parameter Optimization
[... 54 Optuna trials omitted; the best value improved from 489.66 (trial 0) to 330.22 (trial 48) ...]

Hyper-Parameter Optimization successfully finished.
  Number of finished trials: 54
  Best trial:
    Value: 330.22024519999997
    Params:
      eta: 0.7874108977452866
      max_depth: 7
      gamma: 0.0002969927818272007
      subsample: 0.9781045278129599
      colsample_bytree: 0.7986216239254601
      booster: gbtree
    opt_rounds: 99
Model Training
In [5]:
np.random.seed(123)
opt_params = opt_param.copy()
n_rounds = opt_params["opt_rounds"]
del opt_params["opt_rounds"]
# Train Model with optimized hyperparameters
xgblss.train(opt_params,
             dtrain,
             num_boost_round=n_rounds
             )
Out[5]:
<xgboost.core.Booster at 0x1c60aec29a0>
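As the Out[5] repr shows, train() returns the underlying xgboost Booster, so the fitted model can be persisted with standard xgboost tooling (the file name below is purely illustrative):

# Capture the returned Booster and save it with xgboost's native serialization.
booster = xgblss.train(opt_params, dtrain, num_boost_round=n_rounds)
booster.save_model("zagamma_income_model.json")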
Prediction
In [6]:
# Set seed for reproducibility
torch.manual_seed(123)

# Number of samples to draw from predicted distribution
n_samples = 1000

# Quantiles to calculate from predicted distribution
quant_sel = [0.05, 0.95]

# Sample from predicted distribution
pred_samples = xgblss.predict(dtest,
                              pred_type="samples",
                              n_samples=n_samples,
                              seed=123)

# Calculate quantiles from predicted distribution
pred_quantiles = xgblss.predict(dtest,
                                pred_type="quantiles",
                                n_samples=n_samples,
                                quantiles=quant_sel)

# Returns predicted distributional parameters
pred_params = xgblss.predict(dtest,
                             pred_type="parameters")
In [7]:
pred_samples.head()
Out[7]:
|   | y_sample0 | y_sample1 | y_sample2 | y_sample3 | y_sample4 | y_sample5 | ... | y_sample999 |
|---|-----------|-----------|-----------|-----------|-----------|-----------|-----|-------------|
| 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 |
| 1 | 69.204796 | 102.120705 | 71.703278 | 105.628113 | 89.443787 | 95.627541 | ... | 117.527573 |
| 2 | 43.042534 | 37.217079 | 36.585884 | 44.556767 | 70.851280 | 25.616079 | ... | 47.412148 |
| 3 | 39.445667 | 53.570946 | 50.907501 | 56.910980 | 53.622116 | 45.287960 | ... | 57.199459 |
| 4 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 47.195942 | ... | 0.000000 |

5 rows × 1000 columns
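Rows 0 and 4 are (almost) all zeros, reflecting ages outside the working range. The per-observation share of exactly-zero draws is a Monte Carlo estimate of the zero probability, which the gate parameter in pred_params further below should match (plain pandas on the sampled frame):

# Fraction of zero draws per test observation, estimated from the samples.
zero_share = (pred_samples == 0).mean(axis=1)
print(zero_share.head())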
In [8]:
pred_quantiles.head()
Out[8]:
|   | quant_0.05 | quant_0.95 |
|---|------------|------------|
| 0 | 0.000000 | 0.000000 |
| 1 | 63.945352 | 120.214192 |
| 2 | 19.138763 | 60.182043 |
| 3 | 38.923154 | 76.505629 |
| 4 | 0.000000 | 0.000000 |
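Since quant_sel = [0.05, 0.95], the two columns span a central 90% prediction interval. A quick empirical coverage check against the held-out responses (a small sketch using the objects already in scope):

# Share of test observations inside the predicted 90% interval;
# for a well-calibrated model this should be close to 0.9.
y_flat = y_test.reshape(-1)
inside = (y_flat >= pred_quantiles["quant_0.05"]) & (y_flat <= pred_quantiles["quant_0.95"])
print(f"Empirical coverage: {inside.mean():.3f}")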
In [9]:
pred_params.head()
Out[9]:
|   | concentration | rate | gate |
|---|---------------|------|------|
| 0 | 32.130577 | 0.302448 | 0.997929 |
| 1 | 25.962152 | 0.286466 | 0.003079 |
| 2 | 10.248434 | 0.267827 | 0.003601 |
| 3 | 24.000774 | 0.423238 | 0.001000 |
| 4 | 8.810806 | 0.305360 | 0.995934 |
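These three parameters fully determine each conditional distribution: gate is the probability of an exact zero, and the positive part is Gamma(concentration, rate) with mean concentration / rate. The implied point prediction is therefore E[y] = (1 - gate) * concentration / rate, which can be computed directly from the parameter frame (a small sketch, not an XGBoostLSS method):

# Expected income implied by the predicted parameters: zero with
# probability `gate`, Gamma mean concentration/rate otherwise.
pred_mean = (1 - pred_params["gate"]) * pred_params["concentration"] / pred_params["rate"]
print(pred_mean.head())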
SHAP Interpretability
In [10]:
# Partial Dependence Plot of concentration parameter
xgblss.plot(X_test,
            parameter="concentration",
            feature="age",
            plot_type="Partial_Dependence")
In [11]:
# Feature Importance of gate parameter
xgblss.plot(X_test,
            parameter="gate",
            plot_type="Feature_Importance")
Density Plots of Actual and Predicted Samples
In [12]:
pred_df = pd.melt(pred_samples.iloc[:, 0:5])
actual_df = pd.DataFrame.from_dict({"variable": "ACTUAL", "value": y_test.reshape(-1,)})
plot_df = pd.concat([pred_df, actual_df])

(
    ggplot(plot_df,
           aes(x="value",
               color="variable",
               fill="variable")) +
    geom_density(alpha=0.4) +
    facet_wrap("variable",
               scales="free_y",
               ncol=2) +
    theme_bw(base_size=15) +
    theme(legend_position="none")
)
Out[12]:
<Figure Size: (1800 x 900)>