Source code for itea.regression._ITExpr_regressor

# Author:  Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.2
# Last modified: 07-14-2021 by Guilherme Aldeia


"""ITExpr sub-class, specialized to regression task.
"""


import numpy as np

from sklearn.base             import RegressorMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.metrics          import mean_squared_error, r2_score
from sklearn.exceptions       import NotFittedError
from scipy.linalg             import lstsq

from itea._base import BaseITExpr


[docs]class ITExpr_regressor(BaseITExpr, RegressorMixin):
    """ITExpr for the regression task. This will be the class 
    in ``ITEA_regressor.bestsol_``.
    """

    def __init__(self, *, expr, tfuncs, labels = [], fitness_f=None, **kwargs):
        r"""Constructor method.

        Parameters
        ----------
        expr : list of Tuple[Transformation, Interaction]
            list of IT terms to create an IT expression. It **must** be a
            python built-in list.

        tfuncs : dict
            should always be a dict where the
            keys are the names of the transformation functions and 
            the values are unary vectorized functions (for example,
            numpy functions). For user-defined functions, see
            numpy.vectorize for more information on how to vectorize
            your transformation functions.

        labels : list of strings, default=[]
            list containing the labels of the variables that will be used.
            When the list of labels is empty, the variables are named
            :math:`x_0, x_1, \cdots`.

        fitness_f : string or None, default=None
            String with the method to evaluate the fitness of the expressions.
            Can be one of ``['rmse', 'mse', 'r2']``. If none is given, then
            'rmse' is used as default fitness function for the regression
            task. Raises ValueError if the attribute value is not correct.

        Attributes
        ----------
        n_terms : int
            the number of inferred IT terms.

        is_fitted : bool
            boolean variable indicating if the ITExpr was fitted before.

        _fitness : float
            fitness (RMSE) of the expression on the training data.

        intercept_ : float 
            regression intercept.

        coef_ : numpy.array of shape (n_terms, )
            coefficients for each term.
        """

        super(ITExpr_regressor, self).__init__(
            expr=expr, tfuncs=tfuncs, labels=labels)

        self.fitness_f = fitness_f


[docs]    def covariance_matrix(self, X, y):
        """Estimation of the covariance matrix of the coefficients.

        Parameters
        ----------
        X: numpy.array of shape (n_samples, n_features)

        Returns
        -------
        covar : numpy.array of shape (n_terms+1, n_terms+1)
            covariance matrix of the coefficients.

            The last row/column is the intercept.
        """

        N = X.shape[0]
        p = self.n_terms + 1

        residuals = y - self.predict(X)
        
        residual_sum_of_squares = residuals.T @ residuals

        sigma_squared_hat = residual_sum_of_squares / (N - p)
        
        X_design = np.ones( (N, p) )
        X_design[:, :-1] = self._eval(X)

        try:
            return np.linalg.inv(X_design.T @ X_design) * sigma_squared_hat
        except np.linalg.LinAlgError as e:
            return np.linalg.pinv(X_design.T @ X_design) * sigma_squared_hat


[docs]    def fit(self, X, y):
        """Fits the linear model created by combining the IT terms.

        This method performs the transformation of the original data in X to 
        the IT expression domain then fits a linear regression model to 
        calculate the best coefficients and intercept to the IT expression.

        If the expression fails to fit, its ``_fitness`` is set to np.inf,
        since the fitness function is the RMSE and smaller values are better.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            training data.

        y : array-like of shape (n_samples, )
            expected values. 

        Returns
        -------
        self : ITExpr_regressor
            itexpr after fitting the coefficients and intercept.
            Only after fitting the model that the attributes ``coef_`` and
            ``intercept_`` will be available.

        Notes
        -----
        This fit method does not check if the input is consistent, to minimize
        the overhead since the ``ITEA_regressor`` will work with a population
        of ``ITExpr_regressor`` instances. The input is then checked in 
        the fit method from ``ITEA_regressor``. If you want to use the fit
        method directly from the ``ITExpr_regressor``, it is recommended that
        you do the check with ``check_array` `that scikit-learn provides in
        ``sklearn.utils.validation``.
        """
                
        if not self._is_fitted:
                
            # applying the interaction and transformation to fit a linear model
            # using the transformed variables Z
            Z = self._eval(X)

            if np.isfinite(Z).all() and np.all(np.abs(Z) < 1e+200):
                # using the LinearRegression from scikit, the fit should be
                # simple as this:
                # from sklearn.linear_model import LinearRegression
                # fit_model_      = LinearRegression().fit(Z, y)
                # self.coef_      = fit_model_.coef_
                # self.intercept_ = fit_model_.intercept_
                # self._fitness   = self.fitness_f(fit_model_.predict(Z), y)

                # Centering (this results in one less column and makes possible
                # to easily calculate the intercept after fitting)
                y_offset = np.average(y, axis=0)
                Z_offset = np.average(Z, axis=0)

                y_centered = y - y_offset
                Z_centered = Z - Z_offset
            
                coef, residues, rank, singular = lstsq(Z_centered, y_centered)
        
                if y.ndim == 1:
                    self.coef_ = np.ravel(coef.T)

                # Saving the fitted parameters                
                self.coef_      = coef.T
                self.intercept_ = y_offset - np.dot(Z_offset, coef.T)
        
                # setting fitted to true to use prediction below
                self._is_fitted = True

                pred = np.dot(self._eval(X), self.coef_) + self.intercept_
                
                if self.fitness_f == 'rmse' or self.fitness_f == None:
                    self._fitness = mean_squared_error(pred, y, squared=False)
                elif self.fitness_f == 'mse':
                    self._fitness = mean_squared_error(pred, y, squared=True)
                elif self.fitness_f == 'r2':
                    self._fitness = r2_score(pred, y)
                else:
                    raise ValueError('Unknown fitness function. passed '
                        f'value for fitness_f is {self.fitness_f}, expected '
                        'one of ["rmse", "mse", "r2"]')
            else:
                self.coef_      = np.ones(self.n_terms)
                self.intercept_ = 0.0

                # Infinite fitness are filtered of the population in ITEA
                self._fitness   = np.inf

                # Failed to fit. Default values were set and the is_fitted
                # is set to true to avoid repeated failing fits.
                self._is_fitted = True

        return self


[docs]    def predict(self, X):
        """Predicts the response value for each sample in X.
        
        If the expression fails to predict a finite value, then the default
        returned value is the expression's intercept.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            samples to be predicted. Must be a two-dimensional array.

        Returns
        -------
        p : numpy.array of shape (n_samples, )
            predicted response value for each sample.

        Raises
        ------
            NotFittedError
                If the expression was not fitted before calling this method.
        """

        # scikit check - searches for attributes ending with '_'
        check_is_fitted(self)

        # my check, which indicates if the expression was changed by
        # manipulators or not fitted
        if not self._is_fitted:
            raise NotFittedError(
                "The expression was simplified and has not refitted.")

        X = check_array(X)

        return np.nan_to_num(
            np.dot(self._eval(X), self.coef_) + self.intercept_,
            nan=self.intercept_,
            posinf=self.intercept_,
            neginf=self.intercept_
        )