
Expectile Regression


Imports

In [2]:
from lightgbmlss.model import *
from lightgbmlss.distributions.Expectile import *
from lightgbmlss.datasets.data_loader import load_simulated_gaussian_data

# Explicit imports for names used throughout this example
# (numpy, pandas and lightgbm are also pulled in by the wildcard imports above).
import numpy as np
import pandas as pd
import lightgbm as lgb

import plotnine
from plotnine import *
plotnine.options.figure_size = (20, 10)

Data

In [3]:
# The data are simulated from a Gaussian in which x_true is the only informative
# feature and all other x-columns are noise:
#     loc   = 10
#     scale = 1 + 4*((0.3 < x) & (x < 0.5)) + 2*(x > 0.7)

train, test = load_simulated_gaussian_data()

X_train, y_train = train.filter(regex="x"), train["y"].values
X_test, y_test = test.filter(regex="x"), test["y"].values

dtrain = lgb.Dataset(X_train, label=y_train)
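For reference, the data-generating process described in the comment can be reproduced in a few lines of numpy. This is an illustrative sketch only; the sample size and the number of noise features are assumptions, not necessarily the values used by load_simulated_gaussian_data.

# Illustrative sketch of the data-generating process (sizes are assumptions).
rng = np.random.default_rng(123)
n = 7000
x = rng.uniform(0, 1, n)
scale = 1 + 4 * ((0.3 < x) & (x < 0.5)) + 2 * (x > 0.7)

sim = pd.DataFrame(rng.uniform(0, 1, (n, 9)),
                   columns=[f"x_noise_{i}" for i in range(1, 10)])
sim.insert(0, "x_true", x)
sim["y"] = rng.normal(loc=10, scale=scale)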

Expectile Specification

In [4]:
lgblss = LightGBMLSS(
    Expectile(stabilization="None",              # Options are "None", "MAD", "L2".
              expectiles = [0.05, 0.95],         # List of expectiles to be estimated, in increasing order.
              penalize_crossing = True           # Whether to include a penalty term to discourage crossing of expectiles.
              )    
)
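For intuition, each expectile is obtained by minimizing an asymmetric squared-error loss: residuals above the prediction are weighted by tau, residuals below by 1 - tau. The sketch below illustrates the idea; it is not LightGBMLSS's internal implementation.

# Asymmetric squared-error (expectile) loss, for illustration only.
def expectile_loss(y, pred, tau):
    weight = np.where(y >= pred, tau, 1 - tau)
    return weight * (y - pred) ** 2

With tau = 0.95, under-predictions are penalized 19 times more heavily than over-predictions, which pushes the fit towards the upper tail; tau = 0.05 does the reverse. penalize_crossing adds a term that discourages the two fitted expectiles from crossing.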

Hyper-Parameter Optimization

Any LightGBM hyperparameter can be tuned, where the structure of the parameter dictionary needs to be as follows:

- Float/Int sample_type
    - {"param_name": ["sample_type", {"low": low, "high": high, "log": log}]}
        - sample_type: str, Type of sampling, e.g., "float" or "int"
        - low: int or float, Lower endpoint of the range of suggested values
        - high: int or float, Upper endpoint of the range of suggested values
        - log: bool, Flag to sample the value from the log domain or not
    - Example: {"eta": ["float", {"low": 1e-5, "high": 1, "log": True}]}

- Categorical sample_type
    - {"param_name": ["sample_type", ["choice1", "choice2", "choice3", "..."]]}
        - sample_type: str, Type of sampling, here "categorical"
        - choice1, choice2, choice3, ...: str, Possible choices for the parameter
    - Example: {"boosting": ["categorical", ["gbdt", "dart"]]}

- For parameters without tunable choice (this is needed, e.g., if the model is trained on GPU and gpu_id needs to be specified)
    - {"param_name": ["none", [value]]}
        - param_name: str, Name of the parameter
        - value: int, Value of the parameter
    - Example: {"gpu_id": ["none", [0]]}

The cell below combines the float, int, and categorical patterns.
In [5]:
param_dict = {
    "eta":                      ["float", {"low": 1e-5,   "high": 1,     "log": True}],
    "max_depth":                ["int",   {"low": 1,      "high": 10,    "log": False}],
    "num_leaves":               ["int",   {"low": 255,    "high": 255,   "log": False}],  # set to constant for this example
    "min_data_in_leaf":         ["int",   {"low": 20,     "high": 20,    "log": False}],  # set to constant for this example
    "min_gain_to_split":        ["float", {"low": 1e-8,   "high": 40,    "log": False}],
    "min_sum_hessian_in_leaf":  ["float", {"low": 1e-8,   "high": 500,   "log": True}],
    "subsample":                ["float", {"low": 0.2,    "high": 1.0,   "log": False}],
    "feature_fraction":         ["float", {"low": 0.2,    "high": 1.0,   "log": False}],
    "boosting":                 ["categorical", ["gbdt"]],
}

np.random.seed(123)
opt_param = lgblss.hyper_opt(param_dict,
                             dtrain,
                             num_boost_round=100,        # Number of boosting iterations.
                             nfold=5,                    # Number of cv-folds.
                             early_stopping_rounds=20,   # Number of early-stopping rounds.
                             max_minutes=10,             # Time budget in minutes, i.e., stop study after the given number of minutes.
                             n_trials=30,                # The number of trials. If this argument is set to None, there is no limitation on the number of trials.
                             silence=False,              # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.
                             seed=123,                   # Seed used to generate cv-folds.
                             hp_seed=None                # Seed for random number generator used in the Bayesian hyperparameter search.
                             )
[I 2023-08-11 12:21:09,469] A new study created in memory with name: LightGBMLSS Hyper-Parameter Optimization
[I 2023-08-11 12:21:12,718] Trial 0 finished with value: 2455.671630859375 and parameters: {'eta': 4.999979903379203e-05, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 21.93993866573528, 'min_sum_hessian_in_leaf': 0.0003530133520827798, 'subsample': 0.4971819264686692, 'feature_fraction': 0.3707311537482785, 'boosting': 'gbdt'}. Best is trial 0 with value: 2455.671630859375.
[I 2023-08-11 12:21:14,662] Trial 1 finished with value: 1905.1077880859375 and parameters: {'eta': 0.031600943671035775, 'max_depth': 3, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 25.283240746368264, 'min_sum_hessian_in_leaf': 49.082392515255734, 'subsample': 0.6788944834474666, 'feature_fraction': 0.9258044091945574, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:18,260] Trial 2 finished with value: 2163.520751953125 and parameters: {'eta': 0.005894981780547752, 'max_depth': 9, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 10.409790461895193, 'min_sum_hessian_in_leaf': 0.0008141832901711874, 'subsample': 0.4070793729617024, 'feature_fraction': 0.6846602442537073, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:20,956] Trial 3 finished with value: 2340.4150390625 and parameters: {'eta': 0.001961322558956042, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 12.389445813281812, 'min_sum_hessian_in_leaf': 0.0001665823267805825, 'subsample': 0.8122180498006835, 'feature_fraction': 0.5881597651097203, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:23,663] Trial 4 finished with value: 2455.928955078125 and parameters: {'eta': 4.876502677739385e-05, 'max_depth': 2, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 4.557086477842732, 'min_sum_hessian_in_leaf': 8.971172534030456e-08, 'subsample': 0.43772569367787945, 'feature_fraction': 0.33757411361894973, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:26,413] Trial 5 finished with value: 2020.169677734375 and parameters: {'eta': 0.015583832782804402, 'max_depth': 6, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 17.990075586462332, 'min_sum_hessian_in_leaf': 9.047858492815616e-06, 'subsample': 0.2661853234410493, 'feature_fraction': 0.43711054797968024, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:29,603] Trial 6 finished with value: 2453.094970703125 and parameters: {'eta': 3.192833281012269e-05, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 4.179852781197093, 'min_sum_hessian_in_leaf': 0.0013367062405656063, 'subsample': 0.760623390889942, 'feature_fraction': 0.9918521803651483, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:33,061] Trial 7 finished with value: 2390.234375 and parameters: {'eta': 0.001099408166117131, 'max_depth': 7, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 5.928259874226358, 'min_sum_hessian_in_leaf': 22.801819887756512, 'subsample': 0.8539904207951285, 'feature_fraction': 0.5227409182953131, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:36,013] Trial 8 finished with value: 1914.283935546875 and parameters: {'eta': 0.2493102080752807, 'max_depth': 1, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 9.13540771750102, 'min_sum_hessian_in_leaf': 0.34110612176978133, 'subsample': 0.5308020126325235, 'feature_fraction': 0.8641969342663409, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:36,677] Trial 9 finished with value: 1937.4613037109375 and parameters: {'eta': 0.9636709157054849, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 32.67317336133683, 'min_sum_hessian_in_leaf': 1.2744267371826801e-08, 'subsample': 0.9935887353691604, 'feature_fraction': 0.943953338852964, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:38,579] Trial 10 finished with value: 1946.8072509765625 and parameters: {'eta': 0.04879973495349672, 'max_depth': 3, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 39.06107022051272, 'min_sum_hessian_in_leaf': 106.46715749293509, 'subsample': 0.6595408126514654, 'feature_fraction': 0.7942683208717877, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:40,307] Trial 11 finished with value: 1942.6409912109375 and parameters: {'eta': 0.3475471835643695, 'max_depth': 1, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 23.567503144962224, 'min_sum_hessian_in_leaf': 0.4888567766798536, 'subsample': 0.598911344176229, 'feature_fraction': 0.8378628459298021, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:43,076] Trial 12 finished with value: 1970.949951171875 and parameters: {'eta': 0.1355810372876077, 'max_depth': 1, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 27.308295812909954, 'min_sum_hessian_in_leaf': 0.45165961145216943, 'subsample': 0.5607270559851195, 'feature_fraction': 0.8192697272856723, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:43,791] Trial 13 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:45,120] Trial 14 finished with value: 1917.481201171875 and parameters: {'eta': 0.1918179723427333, 'max_depth': 2, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 0.5575780726770159, 'min_sum_hessian_in_leaf': 0.38144740122258586, 'subsample': 0.5276921773301158, 'feature_fraction': 0.8851651910106035, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:46,648] Trial 15 finished with value: 1908.2398681640625 and parameters: {'eta': 0.06814525677756796, 'max_depth': 3, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 13.416527851881755, 'min_sum_hessian_in_leaf': 1.5533574937270136, 'subsample': 0.6891248913559522, 'feature_fraction': 0.9718069324964819, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:47,454] Trial 16 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:48,239] Trial 17 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:49,033] Trial 18 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:49,857] Trial 19 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:51,393] Trial 20 finished with value: 1911.4632568359375 and parameters: {'eta': 0.06986238192969108, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 29.160727721810346, 'min_sum_hessian_in_leaf': 0.016442500462406705, 'subsample': 0.6023777335914249, 'feature_fraction': 0.9155248389222844, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:52,873] Trial 21 finished with value: 1911.5198974609375 and parameters: {'eta': 0.07161895441713878, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 29.27696506397222, 'min_sum_hessian_in_leaf': 0.020946061323593588, 'subsample': 0.6163027333377462, 'feature_fraction': 0.9137504739618006, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:53,660] Trial 22 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:55,121] Trial 23 finished with value: 1908.989501953125 and parameters: {'eta': 0.08734639812497531, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 30.42886083118784, 'min_sum_hessian_in_leaf': 0.03828710084297306, 'subsample': 0.7131459665462002, 'feature_fraction': 0.7904449014182882, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:55,893] Trial 24 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:21:57,012] Trial 25 finished with value: 1917.7242431640625 and parameters: {'eta': 0.1615054287282905, 'max_depth': 5, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 20.364960331001463, 'min_sum_hessian_in_leaf': 3.329890678595058, 'subsample': 0.7972962597807455, 'feature_fraction': 0.8464533630717106, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:57,915] Trial 26 finished with value: 1909.692626953125 and parameters: {'eta': 0.5158300039154515, 'max_depth': 2, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 25.112985606000265, 'min_sum_hessian_in_leaf': 0.05243689522362543, 'subsample': 0.6973560330154309, 'feature_fraction': 0.7889207482360956, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:21:58,767] Trial 27 pruned. Trial was pruned at iteration 20.
[I 2023-08-11 12:22:00,393] Trial 28 finished with value: 1921.2425537109375 and parameters: {'eta': 0.09255361005893879, 'max_depth': 4, 'num_leaves': 255, 'min_data_in_leaf': 20, 'min_gain_to_split': 14.802111899924473, 'min_sum_hessian_in_leaf': 1.7943497012679166, 'subsample': 0.692797127315583, 'feature_fraction': 0.6813967545026932, 'boosting': 'gbdt'}. Best is trial 1 with value: 1905.1077880859375.
[I 2023-08-11 12:22:01,231] Trial 29 pruned. Trial was pruned at iteration 20.

Hyper-Parameter Optimization successfully finished.
  Number of finished trials:  30
  Best trial:
    Value: 1905.1077880859375
    Params: 
    eta: 0.031600943671035775
    max_depth: 3
    num_leaves: 255
    min_data_in_leaf: 20
    min_gain_to_split: 25.283240746368264
    min_sum_hessian_in_leaf: 49.082392515255734
    subsample: 0.6788944834474666
    feature_fraction: 0.9258044091945574
    boosting: gbdt
    opt_rounds: 55

Model Training

In [6]:
np.random.seed(123)

opt_params = opt_param.copy()
n_rounds = opt_params["opt_rounds"]
del opt_params["opt_rounds"]

# Train Model with optimized hyperparameters
lgblss.train(opt_params,
             dtrain,
             num_boost_round=n_rounds
             )
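If you want to skip the search entirely, you can also train with a hand-set parameter dictionary; the values below are arbitrary examples for illustration, not recommendations.

# Sketch: training without hyperparameter optimization (values are arbitrary).
manual_params = {"eta": 0.05, "max_depth": 3, "boosting": "gbdt"}
lgblss_manual = LightGBMLSS(
    Expectile(stabilization="None", expectiles=[0.05, 0.95], penalize_crossing=True)
)
lgblss_manual.train(manual_params, dtrain, num_boost_round=100)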

Prediction

In [7]:
# Predicted expectiles
pred_expectile = lgblss.predict(X_test, pred_type="expectiles")
In [8]:
pred_expectile.head()
Out[8]:
   expectile_0.05  expectile_0.95
0        6.695340       13.277894
1        6.615792       13.277894
2        8.519470       11.511595
3        4.557220       14.967069
4        6.615792       13.367647
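As a quick sanity check, you can look at how many test observations fall between the two predicted expectiles. Note that expectiles are not quantiles, so this share need not equal 0.90; the sketch below is only a rough plausibility check.

# Share of test observations between the predicted 0.05 and 0.95 expectiles.
lb = pred_expectile["expectile_0.05"].values
ub = pred_expectile["expectile_0.95"].values
inside = ((y_test >= lb) & (y_test <= ub)).mean()
print(f"Share of y_test inside the expectile band: {inside:.3f}")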

SHAP Interpretability

In [9]:
# Partial Dependence Plot of how x_true acts on the selected expectile
lgblss.expectile_plot(X_test,
                      expectile="expectile_0.95",
                      feature="x_true",
                      plot_type="Partial_Dependence")
[Figure: partial dependence of x_true on expectile_0.95]
In [10]:
# Partial Dependence Plot of how x_true acts on the selected expectile
lgblss.expectile_plot(X_test,
                      expectile="expectile_0.05",
                      feature="x_true",
                      plot_type="Partial_Dependence")
[Figure: partial dependence of x_true on expectile_0.05]
In [11]:
# Global Feature Importance of the selected expectile
lgblss.expectile_plot(X_test,
                      expectile="expectile_0.95",
                      plot_type="Feature_Importance")
[Figure: feature importance for expectile_0.95]
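expectile_plot is a convenience wrapper around SHAP. If you prefer to work with raw SHAP values, something along the following lines should work; note that accessing the fitted booster via lgblss.booster is an assumption about LightGBMLSS internals, and with several expectiles the booster is a multi-output model, so shap returns, depending on its version, one array per output or a single stacked array.

import shap

# Assumption: lgblss.booster holds the underlying lightgbm.Booster.
explainer = shap.TreeExplainer(lgblss.booster)
shap_values = explainer.shap_values(X_test)  # per-output SHAP values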

Plot of Actual vs. Predicted Expectiles
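The cell below overlays the fitted expectiles on the exact ones of the simulated Normal. The exact values come from expectile_norm, which is in scope via the wildcard import of the Expectile module above. For intuition, the tau-expectile e of Y solves tau * E[(Y - e)+] = (1 - tau) * E[(e - Y)+]; for a Normal this can be solved numerically. A minimal sketch, assuming scipy is available (the function name is hypothetical):

from scipy.optimize import brentq
from scipy.stats import norm

# Illustrative only: solve the expectile first-order condition for N(m, sd).
def normal_expectile(tau, m=0.0, sd=1.0):
    def foc(e):
        z = (e - m) / sd
        upper = sd * (norm.pdf(z) - z * (1 - norm.cdf(z)))  # E[(Y - e)+]
        lower = sd * (z * norm.cdf(z) + norm.pdf(z))        # E[(e - Y)+]
        return tau * upper - (1 - tau) * lower
    return brentq(foc, m - 10 * sd, m + 10 * sd)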

In [12]:
np.random.seed(123)

###
# Actual Expectiles
###
y_loc = np.array([10])
y_scale = np.array([1 + 4*((0.3 < test["x_true"].values) & (test["x_true"].values < 0.5)) + 2*(test["x_true"].values > 0.7)])
tau_lower = np.array([lgblss.dist.tau[0]])
tau_upper = np.array([lgblss.dist.tau[1]])

# Calculates exact expectiles assuming a Normal distribution
expectile_lb = expectile_norm(tau=tau_lower,
                              m=y_loc,
                              sd=y_scale).reshape(-1,)
expectile_ub = expectile_norm(tau=tau_upper,
                              m=y_loc,
                              sd=y_scale).reshape(-1,)

test["expect"] = np.where(test["y"].values < expectile_lb, 0, np.where(test["y"].values < expectile_ub, 1, 2))
test["alpha"] = np.where(test["y"].values <= expectile_lb, 1, np.where(test["y"].values >= expectile_ub, 1, 0))
df_expectiles = test[test["alpha"] == 1]

# Lower Bound
yl = list(set(expectile_lb))
yl.sort()
yl = [yl[2],yl[0],yl[2],yl[1],yl[1]]
sfunl = pd.DataFrame({"x_true":[0, 0.3, 0.5, 0.7, 1],
                      "y":yl})

# Upper Bound
yu = list(set(expectile_ub))
yu.sort()
yu = [yu[0],yu[2],yu[0],yu[1],yu[1]]
sfunu = pd.DataFrame({"x_true":[0, 0.3, 0.5, 0.7, 1],
                      "y":yu})



###
# Forecasted Expectiles
###
test["lb"] = pred_expectile.iloc[:,0]
test["ub"] = pred_expectile.iloc[:,1]



###
# Plot
###
(ggplot(test,
        aes("x_true",
            "y")) +
 geom_point(alpha = 0.2, color = "black", size = 2) +
 theme_bw(base_size=15) +
 theme(legend_position="bottom",
       plot_title = element_text(hjust = 0.5)) +
 labs(title = "LightGBMLSS Expectile Regression - Simulated Data Example")  +
 geom_line(aes("x_true",
               "ub"),
           size = 1.5,
           color = "blue",
           alpha = 0.7) +
 geom_line(aes("x_true",
               "lb"),
           size = 1.5,
           color = "blue",
           alpha = 0.7) +
 geom_point(df_expectiles,
            aes("x_true",
                "y"),
            color = "red",
            alpha = 0.7,
            size = 2) +
 geom_step(sfunl,
           aes("x_true",
               "y"),
           size = 1,
           linetype = "dashed")  +
 geom_step(sfunu,
           aes("x_true",
               "y"),
           size = 1,
           linetype = "dashed")
)
[Figure: actual vs. predicted expectiles on the simulated test data]
Out[12]:
<Figure Size: (2000 x 1000)>