Zero-Adjusted Gamma Regression
Imports
In [13]:
from xgboostlss.model import *
from xgboostlss.distributions.ZAGamma import *

# Explicit imports for names used below (np, xgb and torch are also pulled in
# by the wildcard imports above, but are stated here for clarity).
import numpy as np
import xgboost as xgb
import torch

from sklearn.model_selection import train_test_split
import pandas as pd
import multiprocessing

import plotnine
from plotnine import *
plotnine.options.figure_size = (18, 9)

n_cpu = multiprocessing.cpu_count()
Data
In [2]:
# The simulation example closely follows https://towardsdatascience.com/zero-inflated-regression-c7dfc656d8af
np.random.seed(123)
n_samples = 1000
data = pd.DataFrame({"age": np.random.randint(1, 100, size=n_samples)})
data["income"] = np.where((data.age > 17) & (data.age < 70), 1500*data.age + 5000 + 10000*np.random.randn(n_samples), 0) / 1000
y = data["income"].values.reshape(-1,1)
X = data.drop(columns="income")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=n_cpu)
dtest = xgb.DMatrix(X_test, nthread=n_cpu)
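By construction, incomes are exactly zero for ages outside 18–69, so the response has a large point mass at zero. A quick sanity check of the zero share in each split (plain numpy on the arrays created above):

# Fraction of exact zeros in the responses; the ZAGamma gate parameter
# should learn to reproduce this zero mass conditionally on age.
zero_share_train = (y_train == 0).mean()
zero_share_test = (y_test == 0).mean()
print(f"Zero share train: {zero_share_train:.3f}, test: {zero_share_test:.3f}")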
Distribution Selection
In [3]:
# Specifies Zero-Adjusted Gamma distribution. See ?ZAGamma for an overview.
xgblss = XGBoostLSS(
    ZAGamma(stabilization="None",  # Options are "None", "MAD", "L2".
            response_fn="exp",     # Function to transform the concentration and rate parameters, e.g., "exp" or "softplus".
            loss_fn="nll"          # Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
            )
)
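Conceptually, the zero-adjusted Gamma is a two-part (hurdle) model: a gate parameter gives the probability of an exact zero, and a Gamma(concentration, rate) density describes the strictly positive part. A minimal hand-rolled sampler illustrating this mechanism (plain torch.distributions; not how XGBoostLSS implements it internally):

import torch

def sample_zagamma(gate, concentration, rate, n):
    # Zero with probability `gate`, otherwise a Gamma(concentration, rate) draw.
    is_zero = torch.bernoulli(torch.full((n,), gate)).bool()
    positive = torch.distributions.Gamma(concentration, rate).sample((n,))
    return torch.where(is_zero, torch.zeros(n), positive)

# E.g., 30% zeros; the positive part has mean concentration/rate = 50.
draws = sample_zagamma(gate=0.3, concentration=25.0, rate=0.5, n=10_000)
print((draws == 0).float().mean().item(), draws[draws > 0].mean().item())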
Hyper-Parameter Optimization
Any XGBoost hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:
- Float/Int sample_type
- {"param_name": ["sample_type", {"low": low, "high": high, "log": log}]}
- sample_type: str, Type of sampling, e.g., "float" or "int"
- low: int/float, Lower endpoint of the range of suggested values
- high: int/float, Upper endpoint of the range of suggested values
- log: bool, Flag to sample the value from the log domain or not
- Example: {"eta": ["float", {"low": 1e-5, "high": 1, "log": True}]}
- Categorical sample_type
- {"param_name": ["categorical", ["choice1", "choice2", "choice3", "..."]]}
- sample_type: str, must be "categorical"
- choice1, choice2, choice3, ...: str, Possible choices for the parameter
- Example: {"booster": ["categorical", ["gbtree", "dart"]]}
- For parameters without a tunable choice (needed, e.g., when tree_method = "gpu_hist" and gpu_id has to be fixed)
- {"param_name": ["none", [value]]}
- param_name: str, Name of the parameter
- value: int, Value of the parameter
- Example: {"gpu_id": ["none", [0]]}
Depending on which parameters are optimized, it might happen that some of them are not used, e.g., when {"booster": ["categorical", ["gbtree", "gblinear"]]} and {"max_depth": ["int", {"low": 1, "high": 10, "log": False}]} are specified, max_depth is not used when gblinear is sampled, since gblinear has no such argument.
In [4]:
param_dict = {
"eta": ["float", {"low": 1e-5, "high": 1, "log": True}],
"max_depth": ["int", {"low": 1, "high": 10, "log": False}],
"gamma": ["float", {"low": 1e-8, "high": 40, "log": True}],
"subsample": ["float", {"low": 0.2, "high": 1.0, "log": False}],
"colsample_bytree": ["float", {"low": 0.2, "high": 1.0, "log": False}],
"booster": ["categorical", ["gbtree"]],
# "tree_method": ["categorical", ["auto", "approx", "hist", "gpu_hist"]],
# "gpu_id": ["none", [0]]
}
np.random.seed(123)
opt_param = xgblss.hyper_opt(param_dict,
                             dtrain,
                             num_boost_round=100,       # Number of boosting iterations.
                             nfold=5,                   # Number of cv-folds.
                             early_stopping_rounds=20,  # Number of early-stopping rounds.
                             max_minutes=5,             # Time budget in minutes, i.e., stop study after the given number of minutes.
                             n_trials=None,             # Number of trials. If set to None, there is no limit on the number of trials.
                             silence=False,             # Controls the verbosity of the trials, i.e., user can silence the trial outputs.
                             seed=123,                  # Seed used to generate cv-folds.
                             hp_seed=None               # Seed for random number generator used in the Bayesian hyperparameter search.
                             )
[I 2023-06-14 11:12:54,804] A new study created in memory with name: XGBoostLSS Hyper-Parameter Optimization
[... 54 Optuna trials omitted; the best value improved from 489.66 (trial 0) to 330.22 (trial 48) ...]

Hyper-Parameter Optimization successfully finished.
  Number of finished trials: 54
  Best trial:
    Value: 330.22024519999997
    Params:
      eta: 0.7874108977452866
      max_depth: 7
      gamma: 0.0002969927818272007
      subsample: 0.9781045278129599
      colsample_bytree: 0.7986216239254601
      booster: gbtree
    opt_rounds: 99
Model Training
In [5]:
np.random.seed(123)
opt_params = opt_param.copy()
n_rounds = opt_params["opt_rounds"]
del opt_params["opt_rounds"]
# Train Model with optimized hyperparameters
xgblss.train(opt_params,
             dtrain,
             num_boost_round=n_rounds
             )
Out[5]:
<xgboost.core.Booster at 0x1c60aec29a0>
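As the Out[5] repr shows, train() returns the underlying xgboost Booster, so the fitted model can be persisted with standard xgboost tooling (the file name below is purely illustrative):

# Capture the returned Booster and save it with xgboost's native serialization.
booster = xgblss.train(opt_params, dtrain, num_boost_round=n_rounds)
booster.save_model("zagamma_income_model.json")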
Prediction
In [6]:
# Set seed for reproducibility
torch.manual_seed(123)

# Number of samples to draw from predicted distribution
n_samples = 1000

# Quantiles to calculate from predicted distribution
quant_sel = [0.05, 0.95]

# Sample from predicted distribution
pred_samples = xgblss.predict(dtest,
                              pred_type="samples",
                              n_samples=n_samples,
                              seed=123)

# Calculate quantiles from predicted distribution
pred_quantiles = xgblss.predict(dtest,
                                pred_type="quantiles",
                                n_samples=n_samples,
                                quantiles=quant_sel)

# Returns predicted distributional parameters
pred_params = xgblss.predict(dtest,
                             pred_type="parameters")
In [7]:
pred_samples.head()
Out[7]:
|   | y_sample0 | y_sample1 | y_sample2 | y_sample3 | y_sample4 | y_sample5 | ... | y_sample999 |
|---|-----------|-----------|-----------|-----------|-----------|-----------|-----|-------------|
| 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 |
| 1 | 69.204796 | 102.120705 | 71.703278 | 105.628113 | 89.443787 | 95.627541 | ... | 117.527573 |
| 2 | 43.042534 | 37.217079 | 36.585884 | 44.556767 | 70.851280 | 25.616079 | ... | 47.412148 |
| 3 | 39.445667 | 53.570946 | 50.907501 | 56.910980 | 53.622116 | 45.287960 | ... | 57.199459 |
| 4 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 47.195942 | ... | 0.000000 |

5 rows × 1000 columns
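Rows 0 and 4 are (almost) all zeros, reflecting ages outside the working range. The per-observation share of exactly-zero draws is a Monte Carlo estimate of the zero probability, which the gate parameter in pred_params further below should match (plain pandas on the sampled frame):

# Fraction of zero draws per test observation, estimated from the samples.
zero_share = (pred_samples == 0).mean(axis=1)
print(zero_share.head())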
In [8]:
pred_quantiles.head()
Out[8]:
|   | quant_0.05 | quant_0.95 |
|---|------------|------------|
| 0 | 0.000000 | 0.000000 |
| 1 | 63.945352 | 120.214192 |
| 2 | 19.138763 | 60.182043 |
| 3 | 38.923154 | 76.505629 |
| 4 | 0.000000 | 0.000000 |
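Since quant_sel = [0.05, 0.95], the two columns span a central 90% prediction interval. A quick empirical coverage check against the held-out responses (a small sketch using the objects already in scope):

# Share of test observations inside the predicted 90% interval;
# for a well-calibrated model this should be close to 0.9.
y_flat = y_test.reshape(-1)
inside = (y_flat >= pred_quantiles["quant_0.05"]) & (y_flat <= pred_quantiles["quant_0.95"])
print(f"Empirical coverage: {inside.mean():.3f}")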
In [9]:
pred_params.head()
Out[9]:
|   | concentration | rate | gate |
|---|---------------|------|------|
| 0 | 32.130577 | 0.302448 | 0.997929 |
| 1 | 25.962152 | 0.286466 | 0.003079 |
| 2 | 10.248434 | 0.267827 | 0.003601 |
| 3 | 24.000774 | 0.423238 | 0.001000 |
| 4 | 8.810806 | 0.305360 | 0.995934 |
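These three parameters fully determine each conditional distribution: gate is the probability of an exact zero, and the positive part is Gamma(concentration, rate) with mean concentration / rate. The implied point prediction is therefore E[y] = (1 - gate) * concentration / rate, which can be computed directly from the parameter frame (a small sketch, not an XGBoostLSS method):

# Expected income implied by the predicted parameters: zero with
# probability `gate`, Gamma mean concentration/rate otherwise.
pred_mean = (1 - pred_params["gate"]) * pred_params["concentration"] / pred_params["rate"]
print(pred_mean.head())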
SHAP Interpretability
In [10]:
# Partial Dependence Plot of concentration parameter
xgblss.plot(X_test,
            parameter="concentration",
            feature="age",
            plot_type="Partial_Dependence")
In [11]:
# Feature Importance of gate parameter
xgblss.plot(X_test,
            parameter="gate",
            plot_type="Feature_Importance")
Density Plots of Actual and Predicted Samples
In [12]:
pred_df = pd.melt(pred_samples.iloc[:, 0:5])
actual_df = pd.DataFrame.from_dict({"variable": "ACTUAL", "value": y_test.reshape(-1,)})
plot_df = pd.concat([pred_df, actual_df])

(
    ggplot(plot_df,
           aes(x="value",
               color="variable",
               fill="variable")) +
    geom_density(alpha=0.4) +
    facet_wrap("variable",
               scales="free_y",
               ncol=2) +
    theme_bw(base_size=15) +
    theme(legend_position="none")
)
Out[12]:
<Figure Size: (1800 x 900)>