| {{py: | |
| """ | |
| Template file to easily generate loops over samples using Tempita | |
| (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). | |
| Generated file: _loss.pyx | |
| Each loss class is generated from cdef functions that operate on single samples. | |
| The keywords between double braces are substituted during the build. | |
| """ | |
| doc_HalfSquaredError = ( | |
| """Half Squared Error with identity link. | |
| Domain: | |
| y_true and y_pred all real numbers | |
| Link: | |
| y_pred = raw_prediction | |
| """ | |
| ) | |
| doc_AbsoluteError = ( | |
| """Absolute Error with identity link. | |
| Domain: | |
| y_true and y_pred all real numbers | |
| Link: | |
| y_pred = raw_prediction | |
| """ | |
| ) | |
| doc_PinballLoss = ( | |
| """Quantile Loss aka Pinball Loss with identity link. | |
| Domain: | |
| y_true and y_pred all real numbers | |
| quantile in (0, 1) | |
| Link: | |
| y_pred = raw_prediction | |
| Note: 2 * CyPinballLoss(quantile=0.5) equals CyAbsoluteError() | |
| """ | |
| ) | |
| doc_HuberLoss = ( | |
| """Huber Loss with identity link. | |
| Domain: | |
| y_true and y_pred all real numbers | |
| delta in positive real numbers | |
| Link: | |
| y_pred = raw_prediction | |
| """ | |
| ) | |
| doc_HalfPoissonLoss = ( | |
| """Half Poisson deviance loss with log-link. | |
| Domain: | |
| y_true in non-negative real numbers | |
| y_pred in positive real numbers | |
| Link: | |
| y_pred = exp(raw_prediction) | |
| Half Poisson deviance with log-link is | |
| y_true * log(y_true/y_pred) + y_pred - y_true | |
| = y_true * log(y_true) - y_true * raw_prediction | |
| + exp(raw_prediction) - y_true | |
| Dropping constant terms, this gives: | |
| exp(raw_prediction) - y_true * raw_prediction | |
| """ | |
| ) | |
| doc_HalfGammaLoss = ( | |
| """Half Gamma deviance loss with log-link. | |
| Domain: | |
| y_true and y_pred in positive real numbers | |
| Link: | |
| y_pred = exp(raw_prediction) | |
| Half Gamma deviance with log-link is | |
| log(y_pred/y_true) + y_true/y_pred - 1 | |
| = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 | |
| Dropping constant terms, this gives: | |
| raw_prediction + y_true * exp(-raw_prediction) | |
| """ | |
| ) | |
| doc_HalfTweedieLoss = ( | |
| """Half Tweedie deviance loss with log-link. | |
| Domain: | |
| y_true in real numbers if p <= 0 | |
| y_true in non-negative real numbers if 0 < p < 2 | |
| y_true in positive real numbers if p >= 2 | |
| y_pred and power in positive real numbers | |
| Link: | |
| y_pred = exp(raw_prediction) | |
| Half Tweedie deviance with log-link and p=power is | |
| max(y_true, 0)**(2-p) / (1-p) / (2-p) | |
| - y_true * y_pred**(1-p) / (1-p) | |
| + y_pred**(2-p) / (2-p) | |
| = max(y_true, 0)**(2-p) / (1-p) / (2-p) | |
| - y_true * exp((1-p) * raw_prediction) / (1-p) | |
| + exp((2-p) * raw_prediction) / (2-p) | |
| Dropping constant terms, this gives: | |
| exp((2-p) * raw_prediction) / (2-p) | |
| - y_true * exp((1-p) * raw_prediction) / (1-p) | |
| Notes: | |
| - Poisson with p=1 and Gamma with p=2 have different terms dropped such | |
| that CyHalfTweedieLoss is not continuous in p=power at p=1 and p=2. | |
| - While the Tweedie distribution only exists for p<=0 or p>=1, the range | |
| 0<p<1 still gives a strictly consistent scoring function for the | |
| expectation. | |
| """ | |
| ) | |
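| # Numeric illustration (comment only, not executed): dropping the constant term | |
| # above shifts the loss by a value independent of raw_prediction. Assuming | |
| # scikit-learn's mean_tweedie_deviance (full deviance, i.e. 2x the half deviance): | |
| # | |
| #     import numpy as np | |
| #     from sklearn.metrics import mean_tweedie_deviance | |
| #     p, y, raw = 1.5, 2.0, 0.3 | |
| #     dropped = np.exp((2 - p) * raw) / (2 - p) - y * np.exp((1 - p) * raw) / (1 - p) | |
| #     const = max(y, 0) ** (2 - p) / ((1 - p) * (2 - p)) | |
| #     half_full = 0.5 * mean_tweedie_deviance([y], [np.exp(raw)], power=p) | |
| #     # dropped + const == half_full (up to floating point) | |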
| doc_HalfTweedieLossIdentity = ( | |
| """Half Tweedie deviance loss with identity link. | |
| Domain: | |
| y_true in real numbers if p <= 0 | |
| y_true in non-negative real numbers if 0 < p < 2 | |
| y_true in positive real numbers if p >= 2 | |
| y_pred and power in positive real numbers, y_pred may be negative for p=0. | |
| Link: | |
| y_pred = raw_prediction | |
| Half Tweedie deviance with identity link and p=power is | |
| max(y_true, 0)**(2-p) / (1-p) / (2-p) | |
| - y_true * y_pred**(1-p) / (1-p) | |
| + y_pred**(2-p) / (2-p) | |
| Notes: | |
| - Here, we do not drop constant terms in contrast to the version with log-link. | |
| """ | |
| ) | |
| doc_HalfBinomialLoss = ( | |
| """Half Binomial deviance loss with logit link. | |
| Domain: | |
| y_true in [0, 1] | |
| y_pred in (0, 1), i.e. boundaries excluded | |
| Link: | |
| y_pred = expit(raw_prediction) | |
| """ | |
| ) | |
| doc_ExponentialLoss = ( | |
| """"Exponential loss with (half) logit link | |
| Domain: | |
| y_true in [0, 1] | |
| y_pred in (0, 1), i.e. boundaries excluded | |
| Link: | |
| y_pred = expit(2 * raw_prediction) | |
| """ | |
| ) | |
| # loss class name, docstring, param, | |
| # cy_loss, cy_loss_grad, | |
| # cy_grad, cy_grad_hess, | |
| class_list = [ | |
| ("CyHalfSquaredError", doc_HalfSquaredError, None, | |
| "closs_half_squared_error", None, | |
| "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), | |
| ("CyAbsoluteError", doc_AbsoluteError, None, | |
| "closs_absolute_error", None, | |
| "cgradient_absolute_error", "cgrad_hess_absolute_error"), | |
| ("CyPinballLoss", doc_PinballLoss, "quantile", | |
| "closs_pinball_loss", None, | |
| "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), | |
| ("CyHuberLoss", doc_HuberLoss, "delta", | |
| "closs_huber_loss", None, | |
| "cgradient_huber_loss", "cgrad_hess_huber_loss"), | |
| ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None, | |
| "closs_half_poisson", "closs_grad_half_poisson", | |
| "cgradient_half_poisson", "cgrad_hess_half_poisson"), | |
| ("CyHalfGammaLoss", doc_HalfGammaLoss, None, | |
| "closs_half_gamma", "closs_grad_half_gamma", | |
| "cgradient_half_gamma", "cgrad_hess_half_gamma"), | |
| ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", | |
| "closs_half_tweedie", "closs_grad_half_tweedie", | |
| "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), | |
| ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power", | |
| "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity", | |
| "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"), | |
| ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, | |
| "closs_half_binomial", "closs_grad_half_binomial", | |
| "cgradient_half_binomial", "cgrad_hess_half_binomial"), | |
| ("CyExponentialLoss", doc_ExponentialLoss, None, | |
| "closs_exponential", "closs_grad_exponential", | |
| "cgradient_exponential", "cgrad_hess_exponential"), | |
| ] | |
| }} | |
| # Design: | |
| # See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. | |
| # a) Merge link functions into loss functions for speed and numerical | |
| # stability, i.e. use raw_prediction instead of y_pred in signature. | |
| # b) Pure C functions (nogil) calculate single points (single sample) | |
| # c) Wrap C functions in a loop to get Python functions operating on ndarrays. | |
| # - Write loops manually---use Tempita for this. | |
| # Reason: There is still some performance overhead when using a wrapper | |
| # function "wrap" that carries out the loop and gets as argument a function | |
| # pointer to one of the C functions from b), e.g. | |
| # wrap(closs_half_poisson, y_true, ...) | |
| # - Pass n_threads as argument to prange and propagate option to all callers. | |
| # d) Provide classes (Cython extension types) per loss (names start with Cy) in | |
| # order to have semantically structured objects. | |
| # - Member functions for single points just call the C function from b). | |
| # These are used e.g. in SGD `_plain_sgd`. | |
| # - Member functions operating on ndarrays, see c), looping over calls to C | |
| # functions from b). | |
| # e) Provide convenience Python classes that compose from these extension types | |
| # elsewhere (see loss.py) | |
| # - Example: loss.gradient calls CyLoss.gradient but does some input | |
| # checking like None -> np.empty(). | |
| # | |
| # Note: We require 1-dim ndarrays to be contiguous. | |
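| # For orientation, a rough usage sketch of a generated extension type (the real | |
| # call sites live in loss.py; the import path of the generated module and the | |
| # example values are assumptions, not part of this file): | |
| # | |
| #     import numpy as np | |
| #     # from sklearn._loss._loss import CyHalfSquaredError # generated module | |
| #     y_true = np.array([0.5, 1.5]) | |
| #     raw = np.array([1.0, 1.0]) | |
| #     out = np.empty_like(y_true) | |
| #     CyHalfSquaredError().loss(y_true, raw, None, out, n_threads=1) | |
| #     # out now holds 0.5 * (raw - y_true)**2 per sample | |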
| from cython.parallel import parallel, prange | |
| import numpy as np | |
| from libc.math cimport exp, fabs, log, log1p, pow | |
| from libc.stdlib cimport malloc, free | |
| # ------------------------------------- | |
| # Helper functions | |
| # ------------------------------------- | |
| # Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of | |
| # https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf | |
| # Note: The only important cutoff is at x = 18. All others are to save computation | |
| # time. Compared to the reference, we add the additional case distinction x <= -2 in | |
| # order to use log instead of log1p for improved performance. As with the other | |
| # cutoffs, this is accurate within machine precision of double. | |
| cdef inline double log1pexp(double x) noexcept nogil: | |
| if x <= -37: | |
| return exp(x) | |
| elif x <= -2: | |
| return log1p(exp(x)) | |
| elif x <= 18: | |
| return log(1. + exp(x)) | |
| elif x <= 33.3: | |
| return x + exp(-x) | |
| else: | |
| return x | |
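| # Quick sanity check (comment only, assuming NumPy): log1pexp(x) computes | |
| # log(1 + exp(x)), i.e. the same value as np.logaddexp(0, x): | |
| # | |
| #     import numpy as np | |
| #     x = np.array([-50.0, -10.0, 0.0, 20.0, 40.0]) | |
| #     # np.logaddexp(0.0, x) agrees with log1pexp(x) to double precision | |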
| cdef inline double_pair sum_exp_minus_max( | |
| const int i, | |
| const floating_in[:, :] raw_prediction, # IN | |
| floating_out *p # OUT | |
| ) noexcept nogil: | |
| # Thread local buffers are used to store part of the results via p. | |
| # The results are stored as follows: | |
| # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 | |
| # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1) | |
| # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials | |
| # len(p) must be n_classes | |
| # Notes: | |
| # - We return the max value and sum of exps (stored in p) as a double_pair. | |
| # - i needs to be passed (and stays constant) because otherwise Cython does | |
| # not generate optimal code, see | |
| # https://github.com/scikit-learn/scikit-learn/issues/17299 | |
| # - We do not normalize p by calculating p[k] = p[k] / sum_exps. | |
| # This helps to save one loop over k. | |
| cdef: | |
| int k | |
| int n_classes = raw_prediction.shape[1] | |
| double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps | |
| max_value_and_sum_exps.val1 = raw_prediction[i, 0] | |
| max_value_and_sum_exps.val2 = 0 | |
| for k in range(1, n_classes): | |
| # Compute max value of array for numerical stability | |
| if max_value_and_sum_exps.val1 < raw_prediction[i, k]: | |
| max_value_and_sum_exps.val1 = raw_prediction[i, k] | |
| for k in range(n_classes): | |
| p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1) | |
| max_value_and_sum_exps.val2 += p[k] | |
| return max_value_and_sum_exps | |
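| # Relation to the usual stable softmax/logsumexp (comment only, assuming SciPy): | |
| # | |
| #     import numpy as np | |
| #     from scipy.special import logsumexp, softmax | |
| #     raw = np.array([1.0, 2.0, 3.0]) | |
| #     m = raw.max() # val1 | |
| #     p = np.exp(raw - m) # unnormalized buffer p | |
| #     s = p.sum() # val2 | |
| #     # softmax(raw) == p / s and logsumexp(raw) == np.log(s) + m | |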
| # ------------------------------------- | |
| # Single point inline C functions | |
| # ------------------------------------- | |
| # Half Squared Error | |
| cdef inline double closs_half_squared_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) | |
| cdef inline double cgradient_half_squared_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return raw_prediction - y_true | |
| cdef inline double_pair cgrad_hess_half_squared_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| gh.val1 = raw_prediction - y_true # gradient | |
| gh.val2 = 1. # hessian | |
| return gh | |
| # Absolute Error | |
| cdef inline double closs_absolute_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return fabs(raw_prediction - y_true) | |
| cdef inline double cgradient_absolute_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return 1. if raw_prediction > y_true else -1. | |
| cdef inline double_pair cgrad_hess_absolute_error( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| # Note that exact hessian = 0 almost everywhere. Optimization routines like | |
| # in HGBT, however, need a hessian > 0. Therefore, we assign 1. | |
| gh.val1 = 1. if raw_prediction > y_true else -1. # gradient | |
| gh.val2 = 1. # hessian | |
| return gh | |
| # Quantile Loss / Pinball Loss | |
| cdef inline double closs_pinball_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double quantile | |
| ) noexcept nogil: | |
| return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction | |
| else (1. - quantile) * (raw_prediction - y_true)) | |
| cdef inline double cgradient_pinball_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double quantile | |
| ) noexcept nogil: | |
| return -quantile if y_true >= raw_prediction else 1. - quantile | |
| cdef inline double_pair cgrad_hess_pinball_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double quantile | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| # Note that exact hessian = 0 almost everywhere. Optimization routines like | |
| # in HGBT, however, need a hessian > 0. Therefore, we assign 1. | |
| gh.val1 = -quantile if y_true >= raw_prediction else 1. - quantile # gradient | |
| gh.val2 = 1. # hessian | |
| return gh | |
| # Huber Loss | |
| cdef inline double closs_huber_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double delta, | |
| ) noexcept nogil: | |
| cdef double abserr = fabs(y_true - raw_prediction) | |
| if abserr <= delta: | |
| return 0.5 * abserr**2 | |
| else: | |
| return delta * (abserr - 0.5 * delta) | |
| cdef inline double cgradient_huber_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double delta, | |
| ) noexcept nogil: | |
| cdef double res = raw_prediction - y_true | |
| if fabs(res) <= delta: | |
| return res | |
| else: | |
| return delta if res >= 0 else -delta | |
| cdef inline double_pair cgrad_hess_huber_loss( | |
| double y_true, | |
| double raw_prediction, | |
| double delta, | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| gh.val2 = raw_prediction - y_true # used as temporary | |
| if fabs(gh.val2) <= delta: | |
| gh.val1 = gh.val2 # gradient | |
| gh.val2 = 1 # hessian | |
| else: | |
| gh.val1 = delta if gh.val2 >= 0 else -delta # gradient | |
| gh.val2 = 0 # hessian | |
| return gh | |
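| # Cross-check sketch (comment only): closs_huber_loss(y, raw, delta) should agree | |
| # with scipy.special.huber applied to the residual, assuming SciPy is available: | |
| # | |
| #     from scipy.special import huber | |
| #     y, raw, delta = 1.0, 3.0, 0.5 | |
| #     # huber(delta, raw - y) == delta * (abs(raw - y) - 0.5 * delta) here (|r| > delta) | |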
| # Half Poisson Deviance with Log-Link, dropping constant terms | |
| cdef inline double closs_half_poisson( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return exp(raw_prediction) - y_true * raw_prediction | |
| cdef inline double cgradient_half_poisson( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| # y_pred - y_true | |
| return exp(raw_prediction) - y_true | |
| cdef inline double_pair closs_grad_half_poisson( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| lg.val2 = exp(raw_prediction) # used as temporary | |
| lg.val1 = lg.val2 - y_true * raw_prediction # loss | |
| lg.val2 -= y_true # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_half_poisson( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| gh.val2 = exp(raw_prediction) # hessian | |
| gh.val1 = gh.val2 - y_true # gradient | |
| return gh | |
| # Half Gamma Deviance with Log-Link, dropping constant terms | |
| cdef inline double closs_half_gamma( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return raw_prediction + y_true * exp(-raw_prediction) | |
| cdef inline double cgradient_half_gamma( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| return 1. - y_true * exp(-raw_prediction) | |
| cdef inline double_pair closs_grad_half_gamma( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| lg.val2 = exp(-raw_prediction) # used as temporary | |
| lg.val1 = raw_prediction + y_true * lg.val2 # loss | |
| lg.val2 = 1. - y_true * lg.val2 # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_half_gamma( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| gh.val2 = exp(-raw_prediction) # used as temporary | |
| gh.val1 = 1. - y_true * gh.val2 # gradient | |
| gh.val2 *= y_true # hessian | |
| return gh | |
| # Half Tweedie Deviance with Log-Link, dropping constant terms | |
| # Note that by dropping constants this is no longer continuous in parameter power. | |
| cdef inline double closs_half_tweedie( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| if power == 0.: | |
| return closs_half_squared_error(y_true, exp(raw_prediction)) | |
| elif power == 1.: | |
| return closs_half_poisson(y_true, raw_prediction) | |
| elif power == 2.: | |
| return closs_half_gamma(y_true, raw_prediction) | |
| else: | |
| return (exp((2. - power) * raw_prediction) / (2. - power) | |
| - y_true * exp((1. - power) * raw_prediction) / (1. - power)) | |
| cdef inline double cgradient_half_tweedie( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double exp1 | |
| if power == 0.: | |
| exp1 = exp(raw_prediction) | |
| return exp1 * (exp1 - y_true) | |
| elif power == 1.: | |
| return cgradient_half_poisson(y_true, raw_prediction) | |
| elif power == 2.: | |
| return cgradient_half_gamma(y_true, raw_prediction) | |
| else: | |
| return (exp((2. - power) * raw_prediction) | |
| - y_true * exp((1. - power) * raw_prediction)) | |
| cdef inline double_pair closs_grad_half_tweedie( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| cdef double exp1, exp2 | |
| if power == 0.: | |
| exp1 = exp(raw_prediction) | |
| lg.val1 = closs_half_squared_error(y_true, exp1) # loss | |
| lg.val2 = exp1 * (exp1 - y_true) # gradient | |
| elif power == 1.: | |
| return closs_grad_half_poisson(y_true, raw_prediction) | |
| elif power == 2.: | |
| return closs_grad_half_gamma(y_true, raw_prediction) | |
| else: | |
| exp1 = exp((1. - power) * raw_prediction) | |
| exp2 = exp((2. - power) * raw_prediction) | |
| lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss | |
| lg.val2 = exp2 - y_true * exp1 # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_half_tweedie( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| cdef double exp1, exp2 | |
| if power == 0.: | |
| exp1 = exp(raw_prediction) | |
| gh.val1 = exp1 * (exp1 - y_true) # gradient | |
| gh.val2 = exp1 * (2 * exp1 - y_true) # hessian | |
| elif power == 1.: | |
| return cgrad_hess_half_poisson(y_true, raw_prediction) | |
| elif power == 2.: | |
| return cgrad_hess_half_gamma(y_true, raw_prediction) | |
| else: | |
| exp1 = exp((1. - power) * raw_prediction) | |
| exp2 = exp((2. - power) * raw_prediction) | |
| gh.val1 = exp2 - y_true * exp1 # gradient | |
| gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian | |
| return gh | |
| # Half Tweedie Deviance with identity link, without dropping constant terms! | |
| # Therefore, the minimum loss value is zero. | |
| cdef inline double closs_half_tweedie_identity( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double tmp | |
| if power == 0.: | |
| return closs_half_squared_error(y_true, raw_prediction) | |
| elif power == 1.: | |
| if y_true == 0: | |
| return raw_prediction | |
| else: | |
| return y_true * log(y_true/raw_prediction) + raw_prediction - y_true | |
| elif power == 2.: | |
| return log(raw_prediction/y_true) + y_true/raw_prediction - 1. | |
| else: | |
| tmp = pow(raw_prediction, 1. - power) | |
| tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power) | |
| if y_true > 0: | |
| tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power)) | |
| return tmp | |
| cdef inline double cgradient_half_tweedie_identity( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| if power == 0.: | |
| return raw_prediction - y_true | |
| elif power == 1.: | |
| return 1. - y_true / raw_prediction | |
| elif power == 2.: | |
| return (raw_prediction - y_true) / (raw_prediction * raw_prediction) | |
| else: | |
| return pow(raw_prediction, -power) * (raw_prediction - y_true) | |
| cdef inline double_pair closs_grad_half_tweedie_identity( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| cdef double tmp | |
| if power == 0.: | |
| lg.val2 = raw_prediction - y_true # gradient | |
| lg.val1 = 0.5 * lg.val2 * lg.val2 # loss | |
| elif power == 1.: | |
| if y_true == 0: | |
| lg.val1 = raw_prediction | |
| else: | |
| lg.val1 = (y_true * log(y_true/raw_prediction) # loss | |
| + raw_prediction - y_true) | |
| lg.val2 = 1. - y_true / raw_prediction # gradient | |
| elif power == 2.: | |
| lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss | |
| tmp = raw_prediction * raw_prediction | |
| lg.val2 = (raw_prediction - y_true) / tmp # gradient | |
| else: | |
| tmp = pow(raw_prediction, 1. - power) | |
| lg.val1 = (raw_prediction * tmp / (2. - power) # loss | |
| - y_true * tmp / (1. - power)) | |
| if y_true > 0: | |
| lg.val1 += (pow(y_true, 2. - power) | |
| / ((1. - power) * (2. - power))) | |
| lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_half_tweedie_identity( | |
| double y_true, | |
| double raw_prediction, | |
| double power | |
| ) noexcept nogil: | |
| cdef double_pair gh | |
| cdef double tmp | |
| if power == 0.: | |
| gh.val1 = raw_prediction - y_true # gradient | |
| gh.val2 = 1. # hessian | |
| elif power == 1.: | |
| gh.val1 = 1. - y_true / raw_prediction # gradient | |
| gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian | |
| elif power == 2.: | |
| tmp = raw_prediction * raw_prediction | |
| gh.val1 = (raw_prediction - y_true) / tmp # gradient | |
| gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian | |
| else: | |
| tmp = pow(raw_prediction, -power) | |
| gh.val1 = tmp * (raw_prediction - y_true) # gradient | |
| gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian | |
| return gh | |
| # Half Binomial deviance with logit-link, aka log-loss or binary cross entropy | |
| cdef inline double closs_half_binomial( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| # log1p(exp(raw_prediction)) - y_true * raw_prediction | |
| return log1pexp(raw_prediction) - y_true * raw_prediction | |
| cdef inline double cgradient_half_binomial( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| # gradient = y_pred - y_true = expit(raw_prediction) - y_true | |
| # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/ | |
| # if raw_prediction < 0: | |
| # exp_tmp = exp(raw_prediction) | |
| # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) | |
| # else: | |
| # exp_tmp = exp(-raw_prediction) | |
| # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) | |
| # Note that optimal speed would be achieved, at the cost of precision, by | |
| # return expit(raw_prediction) - y_true | |
| # i.e. no "if else" and an own inline implementation of expit instead of | |
| # from scipy.special.cython_special cimport expit | |
| # The case distinction raw_prediction < 0 in the stable implementation does not | |
| # provide significant better precision apart from protecting overflow of exp(..). | |
| # The branch (if else), however, can incur runtime costs of up to 30%. | |
| # Instead, we help branch prediction by almost always ending in the first if clause | |
| # and making the second branch (else) a bit simpler. This has the exact same | |
| # precision but is faster than the stable implementation. | |
| # As branching criteria, we use the same cutoff as in log1pexp. Note that the | |
| # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731 | |
| # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365. | |
| cdef double exp_tmp | |
| if raw_prediction > -37: | |
| exp_tmp = exp(-raw_prediction) | |
| return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) | |
| else: | |
| # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37 | |
| return exp(raw_prediction) - y_true | |
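| # Numeric illustration (comment only, assuming SciPy): for raw_prediction > -37 | |
| # the expression above equals expit(raw_prediction) - y_true: | |
| # | |
| #     import numpy as np | |
| #     from scipy.special import expit | |
| #     y, raw = 1.0, 5.0 | |
| #     e = np.exp(-raw) | |
| #     # ((1 - y) - y * e) / (1 + e) == expit(raw) - y (up to floating point) | |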
| cdef inline double_pair closs_grad_half_binomial( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| # Same if else conditions as in log1pexp. | |
| if raw_prediction <= -37: | |
| lg.val2 = exp(raw_prediction) # used as temporary | |
| lg.val1 = lg.val2 - y_true * raw_prediction # loss | |
| lg.val2 -= y_true # gradient | |
| elif raw_prediction <= -2: | |
| lg.val2 = exp(raw_prediction) # used as temporary | |
| lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss | |
| lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient | |
| elif raw_prediction <= 18: | |
| lg.val2 = exp(-raw_prediction) # used as temporary | |
| # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) | |
| lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss | |
| lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient | |
| else: | |
| lg.val2 = exp(-raw_prediction) # used as temporary | |
| lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss | |
| lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_half_binomial( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| # with y_pred = expit(raw) | |
| # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2 | |
| # = exp(-raw) / (1 + exp(-raw))**2 | |
| cdef double_pair gh | |
| # See comment in cgradient_half_binomial. | |
| if raw_prediction > -37: | |
| gh.val2 = exp(-raw_prediction) # used as temporary | |
| gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient | |
| gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian | |
| else: | |
| gh.val2 = exp(raw_prediction) # hessian, 1st order Taylor: expit(x) ~ exp(x) for x <= -37 | |
| gh.val1 = gh.val2 - y_true # gradient | |
| return gh | |
| # Exponential loss with (half) logit-link, aka boosting loss | |
| cdef inline double closs_exponential( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double tmp = exp(raw_prediction) | |
| return y_true / tmp + (1 - y_true) * tmp | |
| cdef inline double cgradient_exponential( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double tmp = exp(raw_prediction) | |
| return -y_true / tmp + (1 - y_true) * tmp | |
| cdef inline double_pair closs_grad_exponential( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| cdef double_pair lg | |
| lg.val2 = exp(raw_prediction) # used as temporary | |
| lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss | |
| lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient | |
| return lg | |
| cdef inline double_pair cgrad_hess_exponential( | |
| double y_true, | |
| double raw_prediction | |
| ) noexcept nogil: | |
| # Note that hessian = loss | |
| cdef double_pair gh | |
| gh.val2 = exp(raw_prediction) # used as temporary | |
| gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient | |
| gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian | |
| return gh | |
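| # Side note (comment only): with the sign encoding t = 2 * y_true - 1 in -1/+1, | |
| # closs_exponential reduces to the classical AdaBoost form exp(-t * raw_prediction): | |
| # | |
| #     import numpy as np | |
| #     y, raw = 1.0, 0.7 | |
| #     t = 2 * y - 1 | |
| #     # y / np.exp(raw) + (1 - y) * np.exp(raw) == np.exp(-t * raw) | |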
| # --------------------------------------------------- | |
| # Extension Types for Loss Functions of 1-dim targets | |
| # --------------------------------------------------- | |
| cdef class CyLossFunction: | |
| """Base class for convex loss functions.""" | |
| def __reduce__(self): | |
| return (self.__class__, ()) | |
| cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: | |
| """Compute the loss for a single sample. | |
| Parameters | |
| ---------- | |
| y_true : double | |
| Observed, true target value. | |
| raw_prediction : double | |
| Raw prediction value (in link space). | |
| Returns | |
| ------- | |
| double | |
| The loss evaluated at `y_true` and `raw_prediction`. | |
| """ | |
| pass | |
| cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: | |
| """Compute gradient of loss w.r.t. raw_prediction for a single sample. | |
| Parameters | |
| ---------- | |
| y_true : double | |
| Observed, true target value. | |
| raw_prediction : double | |
| Raw prediction value (in link space). | |
| Returns | |
| ------- | |
| double | |
| The derivative of the loss function w.r.t. `raw_prediction`. | |
| """ | |
| pass | |
| cdef double_pair cy_grad_hess( | |
| self, double y_true, double raw_prediction | |
| ) noexcept nogil: | |
| """Compute gradient and hessian. | |
| Gradient and hessian of loss w.r.t. raw_prediction for a single sample. | |
| This is usually diagonal in raw_prediction_i and raw_prediction_j. | |
| Therefore, we return the diagonal element i=j. | |
| For a loss with a non-canonical link, this might implement the diagonal | |
| of the Fisher matrix (=expected hessian) instead of the hessian. | |
| Parameters | |
| ---------- | |
| y_true : double | |
| Observed, true target value. | |
| raw_prediction : double | |
| Raw prediction value (in link space). | |
| Returns | |
| ------- | |
| double_pair | |
| Gradient and hessian of the loss function w.r.t. `raw_prediction`. | |
| """ | |
| pass | |
| def loss( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| int n_threads=1 | |
| ): | |
| """Compute the point-wise loss value for each input. | |
| The point-wise loss is written to `loss_out` and no array is returned. | |
| Parameters | |
| ---------- | |
| y_true : array of shape (n_samples,) | |
| Observed, true target values. | |
| raw_prediction : array of shape (n_samples,) | |
| Raw prediction values (in link space). | |
| sample_weight : array of shape (n_samples,) or None | |
| Sample weights. | |
| loss_out : array of shape (n_samples,) | |
| A location into which the result is stored. | |
| n_threads : int | |
| Number of threads used by OpenMP (if any). | |
| """ | |
| pass | |
| def gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| """Compute gradient of loss w.r.t raw_prediction for each input. | |
| The gradient is written to `gradient_out` and no array is returned. | |
| Parameters | |
| ---------- | |
| y_true : array of shape (n_samples,) | |
| Observed, true target values. | |
| raw_prediction : array of shape (n_samples,) | |
| Raw prediction values (in link space). | |
| sample_weight : array of shape (n_samples,) or None | |
| Sample weights. | |
| gradient_out : array of shape (n_samples,) | |
| A location into which the result is stored. | |
| n_threads : int | |
| Number of threads used by OpenMP (if any). | |
| """ | |
| pass | |
| def loss_gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| floating_out[::1] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| """Compute loss and gradient of loss w.r.t raw_prediction. | |
| The loss and gradient are written to `loss_out` and `gradient_out` and no arrays | |
| are returned. | |
| Parameters | |
| ---------- | |
| y_true : array of shape (n_samples,) | |
| Observed, true target values. | |
| raw_prediction : array of shape (n_samples,) | |
| Raw prediction values (in link space). | |
| sample_weight : array of shape (n_samples,) or None | |
| Sample weights. | |
| loss_out : array of shape (n_samples,) or None | |
| A location into which the element-wise loss is stored. | |
| gradient_out : array of shape (n_samples,) | |
| A location into which the gradient is stored. | |
| n_threads : int | |
| Number of threads used by OpenMP (if any). | |
| """ | |
| self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) | |
| self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) | |
| def gradient_hessian( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] gradient_out, # OUT | |
| floating_out[::1] hessian_out, # OUT | |
| int n_threads=1 | |
| ): | |
| """Compute gradient and hessian of loss w.r.t raw_prediction. | |
| The gradient and hessian are written to `gradient_out` and `hessian_out` and no | |
| arrays are returned. | |
| Parameters | |
| ---------- | |
| y_true : array of shape (n_samples,) | |
| Observed, true target values. | |
| raw_prediction : array of shape (n_samples,) | |
| Raw prediction values (in link space). | |
| sample_weight : array of shape (n_samples,) or None | |
| Sample weights. | |
| gradient_out : array of shape (n_samples,) | |
| A location into which the gradient is stored. | |
| hessian_out : array of shape (n_samples,) | |
| A location into which the hessian is stored. | |
| n_threads : int | |
| Number of threads used by OpenMP (if any). | |
| """ | |
| pass | |
| {{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess in class_list}} | |
| {{py: | |
| if param is None: | |
| with_param = "" | |
| else: | |
| with_param = ", self." + param | |
| }} | |
| cdef class {{name}}(CyLossFunction): | |
| """{{docstring}}""" | |
| {{if param is not None}} | |
| def __init__(self, {{param}}): | |
| self.{{param}} = {{param}} | |
| {{endif}} | |
| {{if param is not None}} | |
| def __reduce__(self): | |
| return (self.__class__, (self.{{param}},)) | |
| {{endif}} | |
| cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: | |
| return {{closs}}(y_true, raw_prediction{{with_param}}) | |
| cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: | |
| return {{cgrad}}(y_true, raw_prediction{{with_param}}) | |
| cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil: | |
| return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) | |
| def loss( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i | |
| int n_samples = y_true.shape[0] | |
| if sample_weight is None: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| else: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| {{if closs_grad is not None}} | |
| def loss_gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| floating_out[::1] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i | |
| int n_samples = y_true.shape[0] | |
| double_pair dbl2 | |
| if sample_weight is None: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| loss_out[i] = dbl2.val1 | |
| gradient_out[i] = dbl2.val2 | |
| else: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| loss_out[i] = sample_weight[i] * dbl2.val1 | |
| gradient_out[i] = sample_weight[i] * dbl2.val2 | |
| {{endif}} | |
| def gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i | |
| int n_samples = y_true.shape[0] | |
| if sample_weight is None: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| else: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| def gradient_hessian( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] gradient_out, # OUT | |
| floating_out[::1] hessian_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i | |
| int n_samples = y_true.shape[0] | |
| double_pair dbl2 | |
| if sample_weight is None: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| gradient_out[i] = dbl2.val1 | |
| hessian_out[i] = dbl2.val2 | |
| else: | |
| for i in prange( | |
| n_samples, schedule='static', nogil=True, num_threads=n_threads | |
| ): | |
| dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) | |
| gradient_out[i] = sample_weight[i] * dbl2.val1 | |
| hessian_out[i] = sample_weight[i] * dbl2.val2 | |
| {{endfor}} | |
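| # For reference, after Tempita substitution the block above expands to one cdef | |
| # class per entry of class_list. Abridged sketch for a parametrized entry (the | |
| # generated code spells out the full loop bodies shown above): | |
| # | |
| #     cdef class CyPinballLoss(CyLossFunction): | |
| #         def __init__(self, quantile): | |
| #             self.quantile = quantile | |
| #         def __reduce__(self): | |
| #             return (self.__class__, (self.quantile,)) | |
| #         cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: | |
| #             return closs_pinball_loss(y_true, raw_prediction, self.quantile) | |
| #         ... | |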
| # The multinomial deviance loss is also known as categorical cross-entropy or | |
| # (negative) multinomial log-likelihood. | |
| # Here, we do not inherit from CyLossFunction because the cy_gradient method | |
| # of this class deviates from that API. | |
| cdef class CyHalfMultinomialLoss(): | |
| """Half Multinomial deviance loss with multinomial logit link. | |
| Domain: | |
| y_true in {0, 1, 2, 3, .., n_classes - 1} | |
| y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded | |
| Link: | |
| y_pred = softmax(raw_prediction) | |
| Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is | |
| mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1. | |
| """ | |
| # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to | |
| # sample-wise gradients which we provide here. | |
| cdef inline void cy_gradient( | |
| self, | |
| const floating_in y_true, | |
| const floating_in[::1] raw_prediction, # IN | |
| const floating_in sample_weight, | |
| floating_out[::1] gradient_out, # OUT | |
| ) noexcept nogil: | |
| """Compute gradient of loss w.r.t. `raw_prediction` for a single sample. | |
| The gradient of the multinomial logistic loss with respect to a class k, | |
| and for one sample is: | |
| grad_k = sw * (p[k] - (y==k)) | |
| where: | |
| p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction)) | |
| sw = sample_weight | |
| Parameters | |
| ---------- | |
| y_true : double | |
| Observed, true target value. | |
| raw_prediction : array of shape (n_classes,) | |
| Raw prediction values (in link space). | |
| sample_weight : double | |
| Sample weight. | |
| gradient_out : array of shape (n_classes,) | |
| A location into which the gradient is stored. | |
| Returns | |
| ------- | |
| None | |
| The gradient is stored in `gradient_out`; nothing is returned. | |
| """ | |
| cdef: | |
| int k | |
| int n_classes = raw_prediction.shape[0] | |
| double_pair max_value_and_sum_exps | |
| const floating_in[:, :] raw = raw_prediction[None, :] | |
| max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0]) | |
| for k in range(n_classes): | |
| # gradient_out[k] = p_k = y_pred_k = prob of class k | |
| gradient_out[k] /= max_value_and_sum_exps.val2 | |
| # gradient_k = (p_k - (y_true == k)) * sw | |
| gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight | |
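| # Equivalent NumPy formulation of the per-sample gradient above (comment only, | |
| # assuming SciPy's softmax; names are ad hoc): | |
| # | |
| #     import numpy as np | |
| #     from scipy.special import softmax | |
| #     raw = np.array([0.1, 0.5, -0.2]) | |
| #     y, sw = 1, 2.0 | |
| #     grad = sw * (softmax(raw) - (np.arange(raw.shape[0]) == y)) | |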
| def _test_cy_gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, ::1] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| ): | |
| """For testing only.""" | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in [:, ::1] gradient_out | |
| gradient = np.empty((n_samples, n_classes), dtype=np.float64) | |
| gradient_out = gradient | |
| for i in range(n_samples): | |
| self.cy_gradient( | |
| y_true=y_true[i], | |
| raw_prediction=raw_prediction[i, :], | |
| sample_weight=1.0 if sample_weight is None else sample_weight[i], | |
| gradient_out=gradient_out[i, :], | |
| ) | |
| return gradient | |
| # Note that we do not assume memory alignment/contiguity of 2d arrays. | |
| # There seems to be little benefit in doing so. Benchmarks proving the | |
| # opposite are welcome. | |
| def loss( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, :] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in max_value, sum_exps | |
| floating_in* p # temporary buffer | |
| double_pair max_value_and_sum_exps | |
| # We assume n_samples > n_classes. In this case having the inner loop | |
| # over n_classes is a good default. | |
| # TODO: If every memoryview is contiguous and raw_prediction is | |
| # f-contiguous, can we write a better algo (loops) to improve | |
| # performance? | |
| if sample_weight is None: | |
| # inner loop over n_classes | |
| with nogil, parallel(num_threads=n_threads): | |
| # Define private buffer variables as each thread might use its | |
| # own. | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| max_value = max_value_and_sum_exps.val1 | |
| sum_exps = max_value_and_sum_exps.val2 | |
| loss_out[i] = log(sum_exps) + max_value | |
| # label encoded y_true | |
| k = int(y_true[i]) | |
| loss_out[i] -= raw_prediction[i, k] | |
| free(p) | |
| else: | |
| with nogil, parallel(num_threads=n_threads): | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| max_value = max_value_and_sum_exps.val1 | |
| sum_exps = max_value_and_sum_exps.val2 | |
| loss_out[i] = log(sum_exps) + max_value | |
| # label encoded y_true | |
| k = int(y_true[i]) | |
| loss_out[i] -= raw_prediction[i, k] | |
| loss_out[i] *= sample_weight[i] | |
| free(p) | |
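| # The per-sample value computed above is log(sum_exps) + max_value - raw[i, k], | |
| # i.e. logsumexp(raw[i]) - raw[i, k] = -log(softmax(raw[i])[k]). Comment-only | |
| # check, assuming SciPy: | |
| # | |
| #     import numpy as np | |
| #     from scipy.special import logsumexp, softmax | |
| #     raw_i = np.array([0.3, -0.1, 1.2]) | |
| #     k = 2 | |
| #     # logsumexp(raw_i) - raw_i[k] == -np.log(softmax(raw_i))[k] | |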
| def loss_gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, :] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[::1] loss_out, # OUT | |
| floating_out[:, :] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in max_value, sum_exps | |
| floating_in* p # temporary buffer | |
| double_pair max_value_and_sum_exps | |
| if sample_weight is None: | |
| # inner loop over n_classes | |
| with nogil, parallel(num_threads=n_threads): | |
| # Define private buffer variables as each thread might use its | |
| # own. | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| max_value = max_value_and_sum_exps.val1 | |
| sum_exps = max_value_and_sum_exps.val2 | |
| loss_out[i] = log(sum_exps) + max_value | |
| for k in range(n_classes): | |
| # label decode y_true | |
| if y_true[i] == k: | |
| loss_out[i] -= raw_prediction[i, k] | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # gradient_k = p_k - (y_true == k) | |
| gradient_out[i, k] = p[k] - (y_true[i] == k) | |
| free(p) | |
| else: | |
| with nogil, parallel(num_threads=n_threads): | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| max_value = max_value_and_sum_exps.val1 | |
| sum_exps = max_value_and_sum_exps.val2 | |
| loss_out[i] = log(sum_exps) + max_value | |
| for k in range(n_classes): | |
| # label decode y_true | |
| if y_true[i] == k: | |
| loss_out[i] -= raw_prediction[i, k] | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # gradient_k = (p_k - (y_true == k)) * sw | |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] | |
| loss_out[i] *= sample_weight[i] | |
| free(p) | |
| def gradient( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, :] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[:, :] gradient_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in sum_exps | |
| floating_in* p # temporary buffer | |
| double_pair max_value_and_sum_exps | |
| if sample_weight is None: | |
| # inner loop over n_classes | |
| with nogil, parallel(num_threads=n_threads): | |
| # Define private buffer variables as each thread might use its | |
| # own. | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # gradient_k = y_pred_k - (y_true == k) | |
| gradient_out[i, k] = p[k] - (y_true[i] == k) | |
| free(p) | |
| else: | |
| with nogil, parallel(num_threads=n_threads): | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # gradient_k = (p_k - (y_true == k)) * sw | |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] | |
| free(p) | |
| def gradient_hessian( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, :] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[:, :] gradient_out, # OUT | |
| floating_out[:, :] hessian_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in sum_exps | |
| floating_in* p # temporary buffer | |
| double_pair max_value_and_sum_exps | |
| if sample_weight is None: | |
| # inner loop over n_classes | |
| with nogil, parallel(num_threads=n_threads): | |
| # Define private buffer variables as each thread might use its | |
| # own. | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # hessian_k = p_k * (1 - p_k) | |
| # gradient_k = p_k - (y_true == k) | |
| gradient_out[i, k] = p[k] - (y_true[i] == k) | |
| hessian_out[i, k] = p[k] * (1. - p[k]) | |
| free(p) | |
| else: | |
| with nogil, parallel(num_threads=n_threads): | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| p[k] /= sum_exps # p_k = y_pred_k = prob of class k | |
| # gradient_k = (p_k - (y_true == k)) * sw | |
| # hessian_k = p_k * (1 - p_k) * sw | |
| gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] | |
| hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] | |
| free(p) | |
| # This method simplifies the implementation of hessp in linear models, | |
| # i.e. the matrix-vector product of the full hessian, not only of the | |
| # diagonal (in the classes) approximation as implemented above. | |
| def gradient_proba( | |
| self, | |
| const floating_in[::1] y_true, # IN | |
| const floating_in[:, :] raw_prediction, # IN | |
| const floating_in[::1] sample_weight, # IN | |
| floating_out[:, :] gradient_out, # OUT | |
| floating_out[:, :] proba_out, # OUT | |
| int n_threads=1 | |
| ): | |
| cdef: | |
| int i, k | |
| int n_samples = y_true.shape[0] | |
| int n_classes = raw_prediction.shape[1] | |
| floating_in sum_exps | |
| floating_in* p # temporary buffer | |
| double_pair max_value_and_sum_exps | |
| if sample_weight is None: | |
| # inner loop over n_classes | |
| with nogil, parallel(num_threads=n_threads): | |
| # Define private buffer variables as each thread might use its | |
| # own. | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k | |
| # gradient_k = y_pred_k - (y_true == k) | |
| gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k) | |
| free(p) | |
| else: | |
| with nogil, parallel(num_threads=n_threads): | |
| p = <floating_in *> malloc(sizeof(floating_in) * (n_classes)) | |
| for i in prange(n_samples, schedule='static'): | |
| max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) | |
| sum_exps = max_value_and_sum_exps.val2 | |
| for k in range(n_classes): | |
| proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k | |
| # gradient_k = (p_k - (y_true == k)) * sw | |
| gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i] | |
| free(p) | |
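| # For context on gradient_proba: the full per-sample hessian of the multinomial | |
| # loss is sw * (diag(proba) - outer(proba, proba)), so a hessian-vector product | |
| # can be formed from the probabilities alone (comment-only sketch, names ad hoc): | |
| # | |
| #     import numpy as np | |
| #     def hess_dot(proba, v, sw=1.0): | |
| #         return sw * (proba * v - proba * np.dot(proba, v)) | |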