Zero-Adjusted Gamma Regression
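The zero-adjusted Gamma distribution models a response that is exactly zero with probability `gate` and otherwise follows a Gamma distribution with parameters `concentration` and `rate` (the three parameters returned via `pred_type="parameters"` below). As a rough illustration of this mixture, here is a minimal NumPy sketch of how such data can be generated; the function `sample_zagamma` is purely illustrative and not part of the LightGBMLSS API, with the parameter names chosen to match the notebook output below.

import numpy as np

def sample_zagamma(gate, concentration, rate, size, rng=None):
    # Illustrative sketch only: zero with probability `gate`,
    # otherwise a draw from Gamma(concentration, rate).
    rng = np.random.default_rng(rng)
    is_zero = rng.random(size) < gate                              # point mass at zero
    gamma_draws = rng.gamma(shape=concentration, scale=1.0 / rate, size=size)
    return np.where(is_zero, 0.0, gamma_draws)

# e.g., with gate=0.3 roughly 30% of the draws are exact zeros
draws = sample_zagamma(gate=0.3, concentration=10.0, rate=0.2, size=1000, rng=123)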
Imports
In [2]:
from lightgbmlss.model import *
from lightgbmlss.distributions.ZAGamma import *

from sklearn.model_selection import train_test_split
import lightgbm as lgb   # used explicitly below to build the Dataset
import numpy as np       # used explicitly below for seeding and data simulation
import torch             # used explicitly below to seed the sampling step
import pandas as pd
import plotnine
from plotnine import *

plotnine.options.figure_size = (18, 9)
Data
In [3]:
# The simulation example closely follows https://towardsdatascience.com/zero-inflated-regression-c7dfc656d8af
np.random.seed(123)
n_samples = 1000
data = pd.DataFrame({"age": np.random.randint(1, 100, size=n_samples)})
data["income"] = np.where((data.age > 17) & (data.age < 70), 1500*data.age + 5000 + 10000*np.random.randn(n_samples), 0) / 1000
y = data["income"].values
X = data.drop(columns="income")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
dtrain = lgb.Dataset(X_train, label=y_train)
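Because income is set to exactly zero outside the 18–69 age band, a sizeable share of the responses are exact zeros, which is the point mass the zero-adjusted Gamma handles explicitly. An optional sanity check of the zero share in the training split:

# Optional: share of exact zeros in the simulated response
print(f"Share of zeros in y_train: {np.mean(y_train == 0):.2f}")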
Distribution Selection
In [4]:
# Specifies the Zero-Adjusted Gamma distribution. See ?ZAGamma for an overview.
lgblss = LightGBMLSS(
    ZAGamma(stabilization="None",   # Options are "None", "MAD", "L2".
            response_fn="exp",      # Function to transform the concentration and rate parameters, e.g., "exp" or "softplus".
            loss_fn="nll"           # Loss function. Options are "nll" (negative log-likelihood) or "crps" (continuous ranked probability score).
            )
)
Hyper-Parameter Optimization
Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows (a combined example is shown after this list):

- Float/Int sample_type
  - {"param_name": ["sample_type", {"low": low, "high": high, "log": log}]}
  - sample_type: str, Type of sampling, e.g., "float" or "int"
  - low: int or float, Lower endpoint of the range of suggested values
  - high: int or float, Upper endpoint of the range of suggested values
  - log: bool, Flag to sample the value on the log scale or not
  - Example: {"eta": ["float", {"low": 1e-5, "high": 1, "log": True}]}
- Categorical sample_type
  - {"param_name": ["sample_type", ["choice1", "choice2", "choice3", "..."]]}
  - sample_type: str, must be "categorical"
  - choice1, choice2, choice3, ...: str, Possible choices for the parameter
  - Example: {"boosting": ["categorical", ["gbdt", "dart"]]}
- Parameters without a tunable choice (needed, e.g., if device_type = "gpu" is used and gpu_id needs to be specified)
  - {"param_name": ["none", [value]]}
  - param_name: str, Name of the parameter
  - value: int, Value of the parameter
  - Example: {"gpu_id": ["none", [0]]}
In [5]:
param_dict = {
    "eta": ["float", {"low": 1e-5, "high": 1, "log": True}],
    "max_depth": ["int", {"low": 1, "high": 10, "log": False}],
    "min_gain_to_split": ["float", {"low": 1e-8, "high": 40, "log": False}],
    "min_sum_hessian_in_leaf": ["float", {"low": 1e-8, "high": 500, "log": True}],
    "subsample": ["float", {"low": 0.2, "high": 1.0, "log": False}],
    "feature_fraction": ["float", {"low": 0.2, "high": 1.0, "log": False}],
    "boosting": ["categorical", ["gbdt"]],
}

np.random.seed(123)
opt_param = lgblss.hyper_opt(param_dict,
                             dtrain,
                             num_boost_round=100,       # Number of boosting iterations.
                             nfold=5,                   # Number of cv-folds.
                             early_stopping_rounds=20,  # Number of early-stopping rounds.
                             max_minutes=5,             # Time budget in minutes, i.e., stop study after the given number of minutes.
                             n_trials=20,               # The number of trials. If this argument is set to None, there is no limitation on the number of trials.
                             silence=False,             # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.
                             seed=123,                  # Seed used to generate cv-folds.
                             hp_seed=None               # Seed for random number generator used in the Bayesian hyperparameter search.
                             )
[I 2023-08-11 12:13:43,049] A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization
0%| | 0/20 [00:00<?, ?it/s]
[... per-trial Optuna log output (20 trials) omitted ...]

Hyper-Parameter Optimization successfully finished.
  Number of finished trials: 20
  Best trial:
    Value: 364.798082239509
    Params:
      eta: 0.8932571745812234
      max_depth: 6
      min_gain_to_split: 32.84914712406526
      min_sum_hessian_in_leaf: 1.2602701826082383e-05
      subsample: 0.6414980864372755
      feature_fraction: 0.2984688213287875
      boosting: gbdt
    opt_rounds: 9
Model Training
In [6]:
np.random.seed(123)

opt_params = opt_param.copy()
n_rounds = opt_params["opt_rounds"]
del opt_params["opt_rounds"]

# Train Model with optimized hyperparameters
lgblss.train(opt_params,
             dtrain,
             num_boost_round=n_rounds
             )
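Optionally, the fitted model object can be persisted for later use. A minimal sketch, assuming the LightGBMLSS object is picklable; this is not an official LightGBMLSS persistence API and the file name is arbitrary:

import pickle

# Hypothetical persistence step: serialize the fitted model object
with open("lgblss_zagamma.pkl", "wb") as f:
    pickle.dump(lgblss, f)

# ...and load it back later
with open("lgblss_zagamma.pkl", "rb") as f:
    lgblss_loaded = pickle.load(f)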
Prediction
In [7]:
# Set seed for reproducibility
torch.manual_seed(123)

# Number of samples to draw from predicted distribution
n_samples = 1000
# Quantiles to calculate from predicted distribution
quant_sel = [0.05, 0.95]

# Sample from predicted distribution
pred_samples = lgblss.predict(X_test,
                              pred_type="samples",
                              n_samples=n_samples,
                              seed=123)

# Calculate quantiles from predicted distribution
pred_quantiles = lgblss.predict(X_test,
                                pred_type="quantiles",
                                n_samples=n_samples,
                                quantiles=quant_sel)

# Returns predicted distributional parameters
pred_params = lgblss.predict(X_test,
                             pred_type="parameters")
In [8]:
pred_samples.head()
Out[8]:
| | y_sample0 | y_sample1 | y_sample2 | y_sample3 | y_sample4 | y_sample5 | y_sample6 | y_sample7 | y_sample8 | y_sample9 | ... | y_sample990 | y_sample991 | y_sample992 | y_sample993 | y_sample994 | y_sample995 | y_sample996 | y_sample997 | y_sample998 | y_sample999 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 69.516289 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 60.929550 | 105.531929 | 64.138367 | 110.538750 | 83.007111 | 65.914368 | 114.569489 | 83.754181 | 76.929359 | 106.249832 | ... | 126.694046 | 58.902824 | 106.914062 | 109.303070 | 64.697182 | 71.515106 | 96.678009 | 93.943962 | 62.027760 | 0.000000 |
| 2 | 37.353783 | 32.012577 | 31.435833 | 38.747078 | 74.590515 | 53.662891 | 22.332142 | 55.751812 | 19.008478 | 32.330696 | ... | 0.000000 | 28.788361 | 22.363855 | 16.031998 | 38.852062 | 25.945065 | 30.270662 | 23.981115 | 37.747807 | 25.279463 |
| 3 | 30.415985 | 51.455875 | 47.262981 | 56.843914 | 75.915627 | 90.310211 | 78.174225 | 73.345291 | 48.665390 | 84.060516 | ... | 35.612038 | 56.444321 | 64.669716 | 63.445686 | 94.317162 | 33.868572 | 28.650946 | 30.990072 | 83.390099 | 59.522594 |
| 4 | 0.000000 | 0.000000 | 0.000000 | 23.404318 | 0.000000 | 46.938168 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
5 rows × 1000 columns
In [9]:
pred_quantiles.head()
Out[9]:
| | quant_0.05 | quant_0.95 |
|---|---|---|
| 0 | 0.0 | 77.157624 |
| 1 | 0.0 | 133.450491 |
| 2 | 0.0 | 51.579050 |
| 3 | 0.0 | 91.223141 |
| 4 | 0.0 | 24.037125 |
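A quick, optional check of how well the 90% interval spanned by the 5% and 95% quantiles is calibrated on the test set (not part of the original example):

# Optional: empirical coverage of the 90% prediction interval
coverage = np.mean((y_test >= pred_quantiles["quant_0.05"]) & (y_test <= pred_quantiles["quant_0.95"]))
print(f"Empirical coverage of the 90% interval: {coverage:.2f}")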
In [10]:
pred_params.head()
Out[10]:
| | concentration | rate | gate |
|---|---|---|---|
| 0 | 13.424025 | 0.149034 | 0.943975 |
| 1 | 13.424025 | 0.149034 | 0.054856 |
| 2 | 9.111379 | 0.275753 | 0.054856 |
| 3 | 9.111379 | 0.158935 | 0.054856 |
| 4 | 9.111379 | 0.275753 | 0.943975 |
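Assuming the (concentration, rate) parameterization of the Gamma part and that gate is the probability of an exact zero (consistent with the output above), the mean of the zero-adjusted mixture can be read off the predicted parameters as (1 - gate) * concentration / rate, since the point mass at zero contributes nothing. This can be cross-checked against the sampled draws:

# Implied mean of the zero-adjusted Gamma from the predicted parameters
implied_mean = (1 - pred_params["gate"]) * pred_params["concentration"] / pred_params["rate"]

# Compare with the Monte-Carlo mean of the sampled draws
comparison = pd.DataFrame({"implied_mean": implied_mean, "sample_mean": pred_samples.mean(axis=1)})
comparison.head()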
SHAP Interpretability
In [11]:
# Partial Dependence Plot of concentration parameter
lgblss.plot(X_test,
            parameter="concentration",
            feature="age",
            plot_type="Partial_Dependence")
In [12]:
# Feature Importance of gate parameter
lgblss.plot(X_test,
            parameter="gate",
            plot_type="Feature_Importance")
Density Plots of Actual and Predicted Samples
In [13]:
pred_df = pd.melt(pred_samples.iloc[:, 0:5])
actual_df = pd.DataFrame.from_dict({"variable": "ACTUAL", "value": y_test.reshape(-1,)})
plot_df = pd.concat([pred_df, actual_df])

(
    ggplot(plot_df,
           aes(x="value",
               color="variable",
               fill="variable")) +
    geom_density(alpha=0.4) +
    facet_wrap("variable",
               scales="free_y",
               ncol=2) +
    theme_bw(base_size=15) +
    theme(legend_position="none")
)
Out[13]:
<Figure Size: (1800 x 900)>
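As a final, optional check of the zero part of the model, the share of exact zeros in the test responses can be compared with the share of zeros in the sampled predictive draws (not part of the original example):

# Optional: compare observed vs. predicted proportion of exact zeros
print(f"Observed zero share:  {np.mean(y_test == 0):.2f}")
print(f"Predicted zero share: {np.mean(pred_samples.values == 0):.2f}")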