Source code for itea.classification._ITExpr_classifier

# Author:  Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.3
# Last modified: 07-14-2021 by Guilherme Aldeia


"""ITExpr sub-class, specialized to classification task.
"""


import numpy as np

from sklearn.base              import ClassifierMixin
from sklearn.utils.validation  import check_array, check_is_fitted
from sklearn.metrics           import accuracy_score
from sklearn.preprocessing     import LabelEncoder
from sklearn.utils.extmath     import row_norms, safe_sparse_dot, softmax
from sklearn.exceptions        import NotFittedError

from sklearn.linear_model._base     import make_dataset
from sklearn.linear_model._sag_fast import sag32, sag64


from itea._base import BaseITExpr


class ITExpr_classifier(BaseITExpr, ClassifierMixin):
    """ITExpr for the classification task. This will be the class in
    ``ITEA_classifier.bestsol_``.
    """

    def __init__(self, *, expr, tfuncs, labels=[], fitness_f=None,
                 max_iter=100, alpha=0., beta=0., **kwargs):
        r"""Constructor method.

        Parameters
        ----------
        expr : list of Tuple[Transformation, Interaction]
            list of IT terms to create an IT expression. It **must** be
            a python built-in list.

        tfuncs : dict
            should always be a dict where the keys are the names of the
            transformation functions and the values are unary vectorized
            functions (for example, numpy functions). For user-defined
            functions, see numpy.vectorize for more information on how to
            vectorize your transformation functions.

        labels : list of strings, default=[]
            list containing the labels of the variables that will be used.
            When the list of labels is empty, the variables are named
            :math:`x_0, x_1, \cdots`.

        fitness_f : string or None, default=None
            string with the method used to evaluate the fitness of the
            expressions. Must be one of ``['accuracy_score']``. If None is
            given, the accuracy_score function will be used. A ValueError
            is raised if the attribute value is not valid.

        max_iter : int, default=100
            the maximum number of iterations that the gradient optimization
            method should perform to adjust the coefficients of the linear
            model used as the decision function in the inner logistic
            regression method implemented in ``ITExpr_classifier``. Smaller
            values can improve performance, at the cost of a weaker
            adjustment.

        alpha : float, default=0.0
            the logistic regressor uses the saga solver with an elastic-net
            regularization. The alpha parameter controls the L1
            regularization.

        beta : float, default=0.0
            the logistic regressor uses the saga solver with an elastic-net
            regularization. The beta parameter controls the L2
            regularization.

        Attributes
        ----------
        n_terms : int
            number of inferred IT terms.

        is_fitted : bool
            boolean variable indicating if the ITExpr was fitted before.

        _fitness : float
            fitness (accuracy_score) of the expression on the training data.

        intercept_ : numpy.array of shape (n_classes, )
            intercept array used in the probability estimation for each
            class of the training data.

        coef_ : numpy.array of shape (n_classes, n_terms)
            coefficients used in the probability estimation for each class
            of the training data.

        classes_ : numpy.array of shape (n_classes, )
            target classes inferred from the training y target data.

        Notes
        -----
        The saga solver is described in the paper:
        "Defazio, A., Bach, F. & Lacoste-Julien, S. (2014). SAGA: A Fast
        Incremental Gradient Method With Support for Non-Strongly Convex
        Composite Objectives".
        """

        super(ITExpr_classifier, self).__init__(
            expr=expr, tfuncs=tfuncs, labels=labels)

        self.max_iter  = max_iter
        self.alpha     = alpha
        self.beta      = beta
        self.fitness_f = fitness_f
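    # A minimal construction sketch (an illustrative assumption, not a
    # library default): each IT term pairs a transformation function name
    # with a list of per-variable exponents, so a decision function built
    # from the terms x0 and sin(x1^2) could be encoded as
    #
    #     tfuncs = {'id': lambda x: x, 'sin': np.sin}
    #     expr   = [('id', [1, 0]), ('sin', [0, 2])]
    #     clf    = ITExpr_classifier(expr=expr, tfuncs=tfuncs)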
    def covariance_matrix(self, X, y):
        """Estimation of the covariance matrix of the coefficients.

        Parameters
        ----------
        X : numpy.array of shape (n_samples, n_features)

        y : array-like of shape (n_samples, )
            not used in the estimation; accepted for API consistency.

        Returns
        -------
        covar : numpy.array of shape (n_classes, n_terms+1, n_terms+1)
            each element in ``covar`` will be the covariance matrix of the
            logistic regressor when considering the classes as a one-vs-all
            problem. The last row/column of each ``covar[i]`` corresponds
            to the intercept.
        """

        X_design = np.ones( (X.shape[0], self.n_terms + 1) )
        X_design[:, :-1] = self._eval(X)

        probs = self.predict_proba(X)

        covars = np.zeros(
            (len(self.classes_), self.n_terms + 1, self.n_terms + 1))

        for class_id in range(len(self.classes_)):
            # Estimating as a one-vs-all classification for each class
            prob_class1 = probs[:, class_id]
            prob_class0 = np.sum(
                probs[:, [i for i in range(len(self.classes_))
                          if i != class_id]], axis=1)

            V = np.diagflat(np.prod([prob_class0, prob_class1], axis=0))

            try:
                covars[class_id, :, :] = np.linalg.inv(
                    X_design.T @ V @ X_design)
            except np.linalg.LinAlgError:
                # Singular matrix, let's use the pseudo-inverse instead
                covars[class_id, :, :] = np.linalg.pinv(
                    X_design.T @ V @ X_design)

        return covars
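    # The estimate above is the standard asymptotic covariance of logistic
    # regression coefficients: with the design matrix X (intercept column
    # appended) and V = diag(p_i * (1 - p_i)), where p_i is the predicted
    # probability of the positive class for sample i,
    #
    #     Cov(w) ~ (X^T V X)^(-1),
    #
    # computed once per class under the one-vs-all view taken above.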
    def fit(self, X, y):
        """Fits the logistic regression with the IT expression as the
        linear method.

        This method performs the transformation of the original data in X
        to the IT expression domain, then fits a logistic regressor using
        the IT expression as the decision function. The logistic regressor
        is fitted by means of the saga method without any penalties.

        If the expression fails to fit, its ``_fitness`` is set to
        -np.inf, since the fitness function is the accuracy score and
        greater values are better.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            training data.

        y : array-like of shape (n_samples, )
            target vector. Can be a binary or a multi-class classification
            problem.

        Returns
        -------
        self : ITExpr_classifier
            itexpr after fitting the coefficients and intercept.
            Only after fitting the model will the attributes ``coef_``,
            ``intercept_``, and ``classes_`` be available.

        Notes
        -----
        This fit method does not check if the input is consistent, to
        minimize the overhead, since ``ITEA_classifier`` works with a
        population of ``ITExpr_classifier`` instances. The input is checked
        in the fit method of ``ITEA_classifier``. If you want to use the
        fit method directly from ``ITExpr_classifier``, it is recommended
        that you check the input with the ``check_array`` function that
        scikit-learn provides in ``sklearn.utils.validation``.
        """

        if not self._is_fitted:
            Z = self._eval(X)

            if np.isfinite(Z).all() and np.all(np.abs(Z) < 1e+200):

                # Using the LogisticRegression from scikit-learn, the fit
                # would be as simple as this:
                # from sklearn.linear_model import LogisticRegression
                # fit_model_ = LogisticRegression(
                #     solver='saga', penalty='none').fit(Z, y)
                # self.coef_      = fit_model_.coef_
                # self.classes_   = fit_model_.classes_
                # self.intercept_ = fit_model_.intercept_
                # self._fitness   = self.fitness_f(fit_model_.predict(Z), y)

                self.classes_ = np.unique(y)

                n_classes = len(self.classes_)
                n_terms   = Z.shape[1]

                max_squared_sum = row_norms(Z, squared=True).max()

                # Preparing for the multinomial or log classification
                if n_classes > 2:
                    multi_class = 'multinomial'

                    # from scikit: "SAG multinomial solver needs LabelEncoder"
                    target = LabelEncoder().fit_transform(y) \
                        .astype(Z.dtype, copy=False)

                    w0 = np.zeros((self.classes_.size, n_terms + 1),
                                  order='F', dtype=Z.dtype)

                    coef_init = w0.T
                else:
                    multi_class = 'log'

                    target = np.ones(y.shape, dtype=Z.dtype)
                    target[~(y == self.classes_[1])] = -1.

                    w0 = np.zeros(n_terms + 1, dtype=Z.dtype)

                    coef_init = np.expand_dims(w0, axis=1)

                n_samples, n_features = Z.shape[0], Z.shape[1]

                # As in SGD, the alpha is scaled by n_samples.
                alpha_scaled = float(self.alpha) / n_samples
                beta_scaled  = float(self.beta) / n_samples

                # if multi_class == 'multinomial', y should be label encoded.
                if multi_class == 'multinomial':
                    n_classes = int(target.max()) + 1
                else:
                    n_classes = 1

                # initialization
                sample_weight = np.ones(n_samples, dtype=Z.dtype)

                intercept_init = coef_init[-1, :]
                coef_init      = coef_init[:-1, :]

                # Using Z to make the IT expression the decision function
                dataset, intercept_decay = make_dataset(
                    Z, target, sample_weight, random_state=42)

                # Calculating the step size
                L   = (0.25 * (max_squared_sum + 1) + alpha_scaled)
                mun = min(2 * n_samples * alpha_scaled, L)

                step_size = 1. / (2 * L + mun)

                if step_size * alpha_scaled == 1:
                    raise ZeroDivisionError(
                        "Current sag implementation does not handle "
                        "the case step_size * alpha_scaled == 1")

                # Choosing the c implementation of the sag solver
                sag = sag64 if Z.dtype == np.float64 else sag32

                # This is a c implementation, we need to provide all
                # parameters
                tol = 0.001

                sum_gradient_init = np.zeros((n_features, n_classes),
                                             dtype=Z.dtype, order='C')

                gradient_memory_init = np.zeros((n_samples, n_classes),
                                                dtype=Z.dtype, order='C')

                seen_init = np.zeros(n_samples, dtype=np.int32, order='C')

                num_seen_init = 0
                fit_intercept = True

                intercept_sum_gradient = np.zeros(n_classes, dtype=Z.dtype)

                is_saga = True
                verbose = False

                num_seen, n_iter_ = sag(
                    dataset, coef_init, intercept_init, n_samples,
                    n_features, n_classes, tol, self.max_iter, multi_class,
                    step_size, alpha_scaled, beta_scaled,
                    sum_gradient_init, gradient_memory_init, seen_init,
                    num_seen_init, fit_intercept, intercept_sum_gradient,
                    intercept_decay, is_saga, verbose)

                coef_init = np.vstack((coef_init, intercept_init))

                if multi_class == 'multinomial':
                    coef_ = coef_init.T
                else:
                    coef_ = coef_init[:, 0]

                # Extracting the intercept
                if n_classes <= 2:
                    coef_ = coef_.reshape(n_classes, n_terms + 1)

                intercept_ = coef_[:, -1]
                coef_      = coef_[:, :-1]

                # Saving the fitted parameters
                self.intercept_ = intercept_
                self.coef_      = coef_

                # Setting fitted to True to use the prediction below
                self._is_fitted = True

                # Calculating the fitness using the predictions without
                # doing the input check
                prob = (safe_sparse_dot(
                    self._eval(X), self.coef_.T) + self.intercept_)

                if len(self.classes_) <= 2:
                    prob = np.hstack(
                        (np.ones(X.shape[0]).reshape(-1, 1), prob) )
                    prob[:, 0] -= prob[:, 1]

                pred = self.classes_[np.argmax(softmax(prob), axis=1)]

                if (self.fitness_f == 'accuracy_score'
                        or self.fitness_f is None):
                    self._fitness = accuracy_score(pred, y)
                else:
                    raise ValueError('Unknown fitness function. Passed '
                        f'value for fitness_f is {self.fitness_f}, expected '
                        'one of ["accuracy_score"]')
            else:
                self.classes_   = np.unique(y)
                self.intercept_ = np.zeros( (len(self.classes_)) )
                self._fitness   = -np.inf
                self.coef_      = np.ones( (len(self.classes_), self.n_terms) )

                # Failed to fit. Default values were set and is_fitted is
                # set to True to avoid repeated failing fits.
                self._is_fitted = True

        return self
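    # Note on the step size used in ``fit``: it matches scikit-learn's
    # automatic SAGA step size for the log loss with an intercept column,
    #
    #     L         = 0.25 * (max_i ||z_i||^2 + 1) + alpha_scaled
    #     step_size = 1 / (2 * L + min(2 * n_samples * alpha_scaled, L)),
    #
    # where L upper-bounds the Lipschitz constant of the per-sample
    # gradient and z_i are the rows of the transformed data Z.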
    def predict(self, X):
        """Predict class targets for each sample in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            samples to be predicted. Must be a two-dimensional array.

        Returns
        -------
        p : numpy.array of shape (n_samples, )
            predicted target for each sample.

        Raises
        ------
        NotFittedError
            If the expression was not fitted before calling this method.
        """

        probabilities = self.predict_proba(X)

        return self.classes_[np.argmax(probabilities, axis=1)]
    def predict_proba(self, X):
        """Predict probabilities for each possible class target for each
        sample in X.

        If the expression fails to predict a finite value (nan or
        infinity), the returned value for the corresponding class defaults
        to zero: non-finite evaluations are replaced with zero before the
        softmax is applied.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            samples to be predicted. Must be a two-dimensional array.

        Returns
        -------
        p : numpy.array of shape (n_samples, n_classes)
            prediction probability of each class target for each sample.

        Raises
        ------
        NotFittedError
            If the expression was not fitted before calling this method.
        """

        # scikit check - searches for attributes ending with '_'
        check_is_fitted(self)

        # my check, which indicates if the expression was changed by
        # manipulators or not fitted
        if not self._is_fitted:
            raise NotFittedError(
                "The expression was simplified and has not been refitted.")

        X = check_array(X)

        prob = np.nan_to_num(
            safe_sparse_dot(self._eval(X), self.coef_.T) + self.intercept_,
            nan=0.0, posinf=0.0, neginf=0.0
        )

        # If it is a binary classification, then we need to create the
        # complementary probability for the second class
        if len(self.classes_) <= 2:
            prob = np.hstack(
                (np.ones(X.shape[0]).reshape(-1, 1), prob) )
            prob[:, 0] -= prob[:, 1]

        return softmax(prob)
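# Minimal usage sketch (not part of the original module). It assumes the IT
# term encoding documented above -- one ``(tfunc_name, strengths)`` tuple per
# term -- and fits a made-up two-term expression on a toy dataset.
if __name__ == '__main__':
    X_toy = np.array([[0.1, 0.2], [0.9, 0.8], [0.2, 0.1], [0.8, 0.9]])
    y_toy = np.array([0, 1, 0, 1])

    # Hypothetical expression: the identity of x0 and the identity of x1
    toy_expr   = [('id', [1, 0]), ('id', [0, 1])]
    toy_tfuncs = {'id': lambda x: x}

    clf = ITExpr_classifier(
        expr=toy_expr, tfuncs=toy_tfuncs).fit(X_toy, y_toy)

    print(clf.predict(X_toy))        # predicted classes for the toy data
    print(clf.predict_proba(X_toy))  # per-class probabilities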