# Source code for itea.regression._ITExpr_regressor

# Author:  Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.2
# Last modified: 07-14-2021 by Guilherme Aldeia


"""ITExpr sub-class, specialized to regression task.
"""


import numpy as np

from sklearn.base             import RegressorMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.metrics          import mean_squared_error, r2_score
from sklearn.exceptions       import NotFittedError
from scipy.linalg             import lstsq

from itea._base import BaseITExpr


class ITExpr_regressor(BaseITExpr, RegressorMixin):
    """ITExpr for the regression task.

    This will be the class in ``ITEA_regressor.bestsol_``.
    """

    # NOTE(review): ``labels=[]`` is a shared mutable default that is handed
    # straight to ``BaseITExpr.__init__``; kept for interface compatibility --
    # confirm the base class never mutates it in place.
    def __init__(self, *, expr, tfuncs, labels=[], fitness_f=None, **kwargs):
        r"""Constructor method.

        Parameters
        ----------
        expr : list of Tuple[Transformation, Interaction]
            list of IT terms to create an IT expression. It **must** be a
            python built-in list.

        tfuncs : dict
            should always be a dict where the keys are the names of the
            transformation functions and the values are unary vectorized
            functions (for example, numpy functions). For user-defined
            functions, see numpy.vectorize for more information on how to
            vectorize your transformation functions.

        labels : list of strings, default=[]
            list containing the labels of the variables that will be used.
            When the list of labels is empty, the variables are named
            :math:`x_0, x_1, \cdots`.

        fitness_f : string or None, default=None
            String with the method to evaluate the fitness of the
            expressions. Can be one of ``['rmse', 'mse', 'r2']``. If none is
            given, then 'rmse' is used as default fitness function for the
            regression task. A ValueError is raised by ``fit`` if the value
            is not one of the accepted options.

        **kwargs
            accepted for signature compatibility; not used by this
            constructor.

        Attributes
        ----------
        n_terms : int
            the number of inferred IT terms.

        is_fitted : bool
            boolean variable indicating if the ITExpr was fitted before.

        _fitness : float
            fitness of the expression on the training data, measured with
            the metric selected by ``fitness_f`` (RMSE by default).

        intercept_ : float
            regression intercept.

        coef_ : numpy.array of shape (n_terms, )
            coefficients for each term.
        """

        super().__init__(expr=expr, tfuncs=tfuncs, labels=labels)

        self.fitness_f = fitness_f

    def covariance_matrix(self, X, y):
        """Estimation of the covariance matrix of the coefficients.

        The estimate is the residual variance (residual sum of squares
        divided by ``n_samples - (n_terms + 1)``) times the inverse of the
        Gram matrix of the design matrix. The design matrix is made of the
        evaluated IT terms plus a trailing column of ones for the intercept.

        Parameters
        ----------
        X : numpy.array of shape (n_samples, n_features)
            data where the IT terms and the predictions are evaluated.

        y : array-like of shape (n_samples, )
            expected values for the samples in X.

        Returns
        -------
        covar : numpy.array of shape (n_terms+1, n_terms+1)
            covariance matrix of the coefficients.
            The last row/column is the intercept.
        """

        N = X.shape[0]
        p = self.n_terms + 1

        residuals = y - self.predict(X)

        residual_sum_of_squares = residuals.T @ residuals
        sigma_squared_hat = residual_sum_of_squares / (N - p)

        # Last column stays filled with ones: it is the intercept column.
        X_design = np.ones((N, p))
        X_design[:, :-1] = self._eval(X)

        gram = X_design.T @ X_design

        try:
            return np.linalg.inv(gram) * sigma_squared_hat
        except np.linalg.LinAlgError:
            # Singular Gram matrix: fall back to the pseudo-inverse.
            return np.linalg.pinv(gram) * sigma_squared_hat

    def fit(self, X, y):
        """Fits the linear model created by combining the IT terms.

        This method performs the transformation of the original data in X to
        the IT expression domain then fits a linear regression model to
        calculate the best coefficients and intercept to the IT expression.

        If the expression fails to fit (the transformed data contains
        non-finite or very large values), its ``_fitness`` is set to np.inf,
        ``coef_`` is set to ones and ``intercept_`` to zero.

        If the expression is already marked as fitted, nothing is done.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            training data.

        y : array-like of shape (n_samples, )
            expected values.

        Returns
        -------
        self : ITExpr_regressor
            itexpr after fitting the coefficients and intercept.
            Only after fitting the model that the attributes ``coef_`` and
            ``intercept_`` will be available.

        Raises
        ------
        ValueError
            If ``fitness_f`` is not one of ``None``, ``'rmse'``, ``'mse'``
            or ``'r2'``. In this case the expression is not marked as
            fitted.

        Notes
        -----
        This fit method does not check if the input is consistent, to
        minimize the overhead since the ``ITEA_regressor`` will work with a
        population of ``ITExpr_regressor`` instances. The input is then
        checked in the fit method from ``ITEA_regressor``. If you want to use
        the fit method directly from the ``ITExpr_regressor``, it is
        recommended that you do the check with ``check_array`` that
        scikit-learn provides in ``sklearn.utils.validation``.
        """

        if self._is_fitted:
            return self

        # Applying the interaction and transformation to fit a linear model
        # using the transformed variables Z.
        Z = self._eval(X)

        if not (np.isfinite(Z).all() and np.all(np.abs(Z) < 1e+200)):
            # Failed to fit. Default values are set and the expression is
            # marked as fitted to avoid repeated failing fits. Infinite
            # fitness values are filtered out of the population in ITEA.
            self.coef_ = np.ones(self.n_terms)
            self.intercept_ = 0.0
            self._fitness = np.inf
            self._is_fitted = True

            return self

        # Hand-rolled counterpart of fitting scikit-learn's LinearRegression
        # on (Z, y). Centering results in one less column and makes it
        # possible to easily calculate the intercept after fitting.
        y_offset = np.average(y, axis=0)
        Z_offset = np.average(Z, axis=0)

        y_centered = y - y_offset
        Z_centered = Z - Z_offset

        coef, _residues, _rank, _singular = lstsq(Z_centered, y_centered)

        # Transpose first and only then flatten for one-dimensional targets.
        # np.ndim is used so plain python sequences are accepted as y.
        fitted_coef = coef.T
        if np.ndim(y) == 1:
            fitted_coef = np.ravel(fitted_coef)

        self.coef_ = fitted_coef
        self.intercept_ = y_offset - np.dot(Z_offset, coef.T)

        # Z was computed from this same X above, so it is reused instead of
        # evaluating the expression a second time.
        pred = np.dot(Z, self.coef_) + self.intercept_

        # Metrics receive (y_true, y_pred) in that order: R2 is not
        # symmetric. The RMSE is taken as the square root of the MSE because
        # the ``squared`` keyword of mean_squared_error was deprecated in
        # scikit-learn 1.4.
        if self.fitness_f is None or self.fitness_f == 'rmse':
            fitness = np.sqrt(mean_squared_error(y, pred))
        elif self.fitness_f == 'mse':
            fitness = mean_squared_error(y, pred)
        elif self.fitness_f == 'r2':
            fitness = r2_score(y, pred)
        else:
            raise ValueError('Unknown fitness function. passed '
                f'value for fitness_f is {self.fitness_f}, expected '
                'one of ["rmse", "mse", "r2"]')

        self._fitness = fitness

        # Only marked as fitted once the fitness was successfully computed.
        self._is_fitted = True

        return self

    def predict(self, X):
        """Predicts the response value for each sample in X.

        If the expression fails to predict a finite value, then the default
        returned value is the expression's intercept.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            samples to be predicted. Must be a two-dimensional array.

        Returns
        -------
        p : numpy.array of shape (n_samples, )
            predicted response value for each sample.

        Raises
        ------
        NotFittedError
            If the expression was not fitted before calling this method.
        """

        # scikit check - searches for attributes ending with '_'
        check_is_fitted(self)

        # my check, which indicates if the expression was changed by
        # manipulators or not fitted
        if not self._is_fitted:
            raise NotFittedError(
                "The expression was simplified and has not refitted.")

        X = check_array(X)

        # NaN and +/- infinity predictions are replaced by the intercept.
        fallback = self.intercept_

        return np.nan_to_num(
            np.dot(self._eval(X), self.coef_) + fallback,
            nan=fallback,
            posinf=fallback,
            neginf=fallback
        )