# Source code for itea.inspection._ITExpr_inspector

# Author:  Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.1
# Last modified: 06-17-2021 by Guilherme Aldeia


"""ITExpr_inspector class.
"""


import numpy     as np

from scipy import stats

from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.metrics          import mutual_info_score


class ITExpr_inspector():
    """class ITExpr_inspector.

    Based on a more statistical approach, this class implements methods to
    measure the quality of the final expression by calculating information
    between individual terms.

    The inspector must be fitted (see ``fit``) before ``terms_analysis``
    is called.
    """

    def __init__(self, *, itexpr, tfuncs, decimal_places=3):
        """Constructor method.

        Parameters
        ----------
        itexpr : ITExpr_regressor or ITExpr_classifier
            fitted instance of an ``ITExpr`` class to be explained.

        tfuncs : dict
            transformations functions. Should always
            be a dict where the keys are the names of the transformation
            functions and the values are unary vectorized functions.
            It is only stored on the instance; none of the methods of this
            class read it.

        decimal_places : int, default=3
            number of decimal places used when rounding every reported
            metric.
        """

        self.itexpr         = itexpr
        self.tfuncs         = tfuncs
        self.decimal_places = decimal_places

    def fit(self, X, y):
        """Fit method to store the data used in the training of the given
        itexpr instance.

        Besides storing the data (``X_`` and ``y_``), this method stores the
        covariance matrix reported by the itexpr (``varcovar_beta``) and the
        evaluation of every term on ``X`` (``Z``).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            data used to train the itexpr model.

        y : array-like of shape (n_samples, )
            target data used to train the itexpr model. It is stored as
            given, without validation.

        Returns
        -------
        self : ITExpr_inspector
            inspector with the calculated covariance matrix.
        """

        X = check_array(X)

        self.X_ = X
        self.y_ = y

        self.varcovar_beta = self.itexpr.covariance_matrix(X, y)

        # A single 2-D matrix is wrapped into a stack of one matrix, so the
        # remaining methods can always iterate over the leading axis.
        if self.varcovar_beta.ndim == 2:
            self.varcovar_beta = np.array([self.varcovar_beta])

        # Matrix containing each term evaluation on training data. The last
        # column is kept as ones and stands for the intercept.
        self.Z = np.ones((X.shape[0], self.itexpr.n_terms + 1))
        self.Z[:, :-1] = self.itexpr._eval(X)

        return self

    def _coef_stderr(self):
        """Method for estimating the standard error of the coefficients.

        The estimated standard deviations will be calculated by taking the
        square root from the main diagonal of each stored covariance matrix.

        Returns
        -------
        stderrs : list of str
            one string per coefficient with the rounded standard error(s).
            When more than one covariance matrix is stored, each string
            holds the values taken from all the matrices.
        """

        stderrs = [
            np.sqrt(np.diag(varcovar)) for varcovar in self.varcovar_beta]

        # With a single covariance matrix the values are reported as scalars
        # instead of one-element arrays.
        if len(stderrs) == 1:
            stderrs = stderrs[0]

        return [str(stderr.round(self.decimal_places))
                for stderr in np.array(stderrs).T]

    def _mean_pairwise(self, pair_metric):
        """Mean of ``pair_metric`` between each term and all the others.

        Parameters
        ----------
        pair_metric : callable
            function taking two columns of ``Z`` and returning a number.

        Returns
        -------
        means : list
            rounded mean of the metric for each term, in term order. The
            intercept column of ``Z`` is never used.
        """

        n_terms = self.itexpr.n_terms

        means = []
        for col in range(n_terms):
            scores = [
                pair_metric(self.Z[:, col], self.Z[:, col_to_compare])
                for col_to_compare in range(n_terms)
                if col != col_to_compare
            ]

            means.append(np.mean(scores).round(self.decimal_places))

        return means

    def _disentanglement(self):
        """Method for calculating the mean disentanglement for each term.

        The mean disentanglement is the mean squared Pearson's correlation
        between the term of interest and the remaining terms.

        The disentanglement (measured by the collinearity between the generated
        features) was proposed in "Learning feature spaces for regression with
        genetic programming". The idea is that, when creating new features, a
        disentangled representation ideally contains a minimal set of features.
        In this paper, the authors try to minimize collinearity between
        features in order to promote disentanglement.

        This metric is reported to indicate if there is a high degree of
        disentanglement on the expression.

        Returns
        -------
        disentanglements : list
            one rounded value per term. An expression with a single term
            yields ``[0.0]``, since there is nothing to compare it with.

        Notes
        -----
        This calculation was proposed in
        "La Cava, W., Moore, J.H. Learning feature spaces for regression with
        genetic programming. Genet Program Evolvable Mach 21, 433–467 (2020)"
        """

        if self.itexpr.n_terms == 1:
            return [0.0]

        def squared_pearson(term, other_term):
            corr, _ = stats.pearsonr(term, other_term)

            # Pearson's correlation divides by the std, and the existence
            # of a result is not guaranteed. We'll consider a zero
            # correlation in these cases.
            return 0.0 if np.isnan(corr) else corr**2

        return self._mean_pairwise(squared_pearson)

    def _pred_var(self):
        """Method to calculate the variance of the predictions each term
        produces on the training data.

        Returns
        -------
        variances : list of str or ndarray
            when the itexpr has a ``classes_`` attribute, a list with one
            string per column of ``Z`` holding the rounded variances obtained
            with each row of ``coef_`` (paired with the matching entry of
            ``intercept_``). Otherwise, an array with one rounded variance
            per column of ``Z``.
        """

        if hasattr(self.itexpr, 'classes_'):
            variances = [
                np.var(self.Z * np.append(coef, intercept), axis=0)
                for coef, intercept in zip(
                    self.itexpr.coef_, self.itexpr.intercept_)
            ]

            return [str(v.round(self.decimal_places))
                    for v in np.array(variances).T]

        coef_and_intercept = np.append(
            self.itexpr.coef_, self.itexpr.intercept_)

        return np.var(
            self.Z * coef_and_intercept, axis=0).round(self.decimal_places)

    def _continuous_mutual_info(self):
        """Method to calculate the mean continuous mutual information for
        each term. The mutual information is calculated between the term of
        interest and the remaining terms.

        Each pair of terms is discretized with a 2-D histogram using
        ``floor(sqrt(n_samples))`` bins per axis, and the histogram is used
        as the contingency matrix of the mutual information score.

        Returns
        -------
        mutual_informations : list
            one rounded value per term. An expression with a single term
            yields ``[0.0]``.
        """

        if self.itexpr.n_terms == 1:
            return [0.0]

        # The number of bins depends only on the number of samples, so it
        # is computed once for all the pairs.
        bins = int(np.floor(np.sqrt(len(self.X_))))

        def binned_mutual_info(term, other_term):
            # first element is a histogram
            c_xy = np.histogram2d(term, other_term, bins)[0]

            return mutual_info_score(None, None, contingency=c_xy)

        return self._mean_pairwise(binned_mutual_info)

    def terms_analysis(self):
        """Method to calculate different metrics for the terms composing
        the IT expression.

        Returns
        -------
        analysis : dict
            returns a dictionary containing several term information
            and metrics calculated for each term:

            - coef: coefficient of each term (or coefficients, if the
              itexpr is an instance of ``ITExpr_classifier``);
            - func: transformation function of each term;
            - strengths: the exponents of each term;
            - coef stderr.: the standard error of the coefficients;
            - mean pairwise disentanglement: the mean disentanglement between
              each term when compared with the others;
            - mean mutual information: the mean continuous mutual information
              between each term when compared with the others;
            - prediction var.: the variance of the predicted outcomes for
              each term when predicting the training data.

            Every entry has one value per term followed by one value for
            the intercept.
        """

        check_is_fitted(self)

        coefs, funcs, strengths = [], [], []
        for wi, (fi, ti) in zip(self.itexpr.coef_.T, self.itexpr.expr):
            coefs.append(str(wi.round(self.decimal_places)))
            funcs.append(fi)
            strengths.append(str(ti))

        # The intercept is reported as an extra trailing entry. It takes no
        # part in the pairwise metrics, so those are reported as zero.
        coefs.append(
            str(np.round(self.itexpr.intercept_, self.decimal_places)))
        funcs.append('intercept')
        strengths.append('---')

        disentangles = self._disentanglement() + [0.0]
        mutual_info  = self._continuous_mutual_info() + [0.0]

        return {
            'coef'                           : coefs,
            'func'                           : funcs,
            'strengths'                      : strengths,
            'coef\nstderr.'                  : self._coef_stderr(),
            'mean pairwise\ndisentanglement' : disentangles,
            'mean mutual\ninformation'       : mutual_info,
            'prediction\nvar.'               : self._pred_var(),
        }