# Author: Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.1
# Last modified: 06-17-2021 by Guilherme Aldeia
"""ITExpr_inspector class.
"""
import numpy as np
from scipy import stats
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.metrics import mutual_info_score
class ITExpr_inspector:
    """Inspector for the individual terms of a fitted IT expression.

    Based on a more statistical approach, this class implements methods to
    measure the quality of the final expression by calculating information
    between individual terms (coefficient standard errors, pairwise
    disentanglement, pairwise mutual information and per-term prediction
    variance).
    """

    def __init__(self, *, itexpr, tfuncs, decimal_places=3):
        """Constructor method.

        Parameters
        ----------
        itexpr : ITExpr_regressor or ITExpr_classifier
            fitted instance of an ``ITExpr`` class to be explained.

        tfuncs : dict
            transformations functions. Should always
            be a dict where the keys are the names of the transformation
            functions and the values are unary vectorized functions.
            It is only stored by this class; none of the methods defined
            here read it.

        decimal_places : int, default=3
            number of decimal places used when rounding every reported
            metric.
        """

        self.itexpr = itexpr
        self.tfuncs = tfuncs
        self.decimal_places = decimal_places

    def fit(self, X, y):
        """Fit method to store the data used in the training of the given
        itexpr instance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            data used to train the itexpr model.

        y : array-like of shape (n_samples, )
            target data used to train the itexpr model.

        Returns
        -------
        self : ITExpr_inspector
            inspector with the calculated covariance matrix.
        """

        X = check_array(X)

        self.X_ = X
        self.y_ = y

        self.varcovar_beta = self.itexpr.covariance_matrix(X, y)

        # A single 2-D matrix is wrapped so that ``varcovar_beta`` can
        # always be iterated as a stack of matrices.
        if self.varcovar_beta.ndim == 2:
            self.varcovar_beta = np.array([self.varcovar_beta])

        # Matrix containing each term evaluation on training data. The last
        # column is kept as ones so it pairs with the intercept.
        self.Z = np.ones((X.shape[0], self.itexpr.n_terms + 1))
        self.Z[:, :-1] = self.itexpr._eval(X)

        return self

    def _coef_stderr(self):
        """Method for estimating the standard error of the coefficients.

        The estimated standard deviations will be calculated by taking the
        square root from the main diagonal of each stored covariance matrix.

        Returns
        -------
        stderrs : list of str
            one string per coefficient (intercept included). When several
            covariance matrices are stored, each string holds the values of
            that coefficient across all matrices.
        """

        stderrs = [np.sqrt(np.diag(cov)) for cov in self.varcovar_beta]

        # With a single matrix, report plain scalars instead of 1-element
        # arrays.
        if len(stderrs) == 1:
            stderrs = stderrs[0]

        return [str(stderr.round(self.decimal_places))
                for stderr in np.array(stderrs).T]

    def _mean_pairwise(self, pair_score):
        """Average ``pair_score`` between each term and every other term.

        ``pair_score`` receives two columns of ``self.Z`` (the term of
        interest first) and returns a scalar. Callers are expected to handle
        the single-term case themselves, since there is nothing to average.
        """

        n_terms = self.itexpr.n_terms
        means = []

        for col in range(n_terms):
            scores = [
                pair_score(self.Z[:, col], self.Z[:, other])
                for other in range(n_terms) if other != col
            ]
            means.append(np.mean(scores).round(self.decimal_places))

        return means

    def _disentanglement(self):
        """Method for calculating the mean disentanglement for each term.

        The mean disentanglement is the mean *squared* Pearson's correlation
        between the term of interest and the remaining terms. Pairs for
        which the correlation is undefined (NaN) contribute zero.

        The disentanglement (measured by the collinearity between the
        generated features) was proposed in "Learning feature spaces for
        regression with genetic programming". The idea is that, when
        creating new features, a disentangled representation ideally
        contains a minimal set of features. In this paper, the authors try
        to minimize collinearity between features in order to promote
        disentanglement.

        This metric is reported to indicate if there is a high degree of
        disentanglement on the expression.

        Notes
        -----
        This calculation was proposed in
        "La Cava, W., Moore, J.H. Learning feature spaces for regression with
        genetic programming. Genet Program Evolvable Mach 21, 433–467 (2020)"
        """

        if self.itexpr.n_terms == 1:
            return [0.0]

        def squared_corr(a, b):
            corr, _ = stats.pearsonr(a, b)

            # Pearson's correlation divides by the std, and the
            # existence of a result is not guaranteed. We'll consider
            # a zero correlation in these cases.
            return 0.0 if np.isnan(corr) else corr**2

        return self._mean_pairwise(squared_corr)

    def _pred_var(self):
        """Method to calculate the variance of the predictions each term
        produces on the training data.

        Returns
        -------
        variances : ndarray or list of str
            when the inspected model has a ``classes_`` attribute, a list
            with one string per term (intercept included) holding the
            variance for each row of ``coef_``; otherwise an array with one
            variance per term (intercept included).
        """

        if hasattr(self.itexpr, 'classes_'):
            variances = [
                np.var(self.Z * np.append(coef, intercept), axis=0)
                for coef, intercept in zip(
                    self.itexpr.coef_, self.itexpr.intercept_)
            ]

            return [str(v.round(self.decimal_places))
                    for v in np.array(variances).T]

        coef_and_intercept = np.append(
            self.itexpr.coef_, self.itexpr.intercept_)

        return np.var(
            self.Z * coef_and_intercept, axis=0).round(self.decimal_places)

    def _continuous_mutual_info(self):
        """Method to calculate the mean continuous mutual information for
        each term. The mutual information is calculated between the term of
        interest and the remaining terms.

        Each pair of terms is discretized with a 2-D histogram using
        ``floor(sqrt(n_samples))`` bins per axis, and the histogram is used
        as the contingency table of ``mutual_info_score``.
        """

        if self.itexpr.n_terms == 1:
            return [0.0]

        # The number of bins depends only on the sample size, so it is
        # computed once instead of once per pair of terms.
        bins = int(np.floor(np.sqrt(len(self.X_))))

        def binned_mutual_info(a, b):
            # first element is a histogram
            c_xy = np.histogram2d(a, b, bins)[0]
            return mutual_info_score(None, None, contingency=c_xy)

        return self._mean_pairwise(binned_mutual_info)

    def terms_analysis(self):
        """Method to calculate different metrics for the terms composing
        the IT expression.

        Returns
        -------
        analysis : dict
            returns a dictionary containing several term information
            and metrics calculated for each term (the last entry of every
            list refers to the intercept):

            - coef: coefficient of each term (or coefficients, if the
              itexpr is an instance of ``ITExpr_classifier``);
            - func: transformation function of each term;
            - strengths: the exponents of each term;
            - coef stderr.: the standard error of the coefficients;
            - mean pairwise disentanglement: the mean disentanglement between
              each term when compared with the others;
            - mean mutual information: the mean continuous mutual information
              between each term when compared with the others;
            - prediction var.: the variance of the predicted outcomes for
              each term when predicting the training data.

            The multi-word keys contain a newline character in place of one
            of the spaces shown above (e.g. ``'coef\\nstderr.'``).
        """

        check_is_fitted(self)

        coefs, funcs, strengths = [], [], []
        for wi, (fi, ti) in zip(self.itexpr.coef_.T, self.itexpr.expr):
            funcs.append(fi)
            strengths.append(str(ti))
            coefs.append(str(wi.round(self.decimal_places)))

        # Appending the entries related to the intercept. The pairwise
        # metrics are not computed for it and are reported as zero.
        coefs = coefs + [
            str(np.round(self.itexpr.intercept_, self.decimal_places))]
        funcs = funcs + ['intercept']
        strengths = strengths + ['---']

        disentangles = self._disentanglement() + [0.0]
        mutual_info = self._continuous_mutual_info() + [0.0]

        return {
            'coef': coefs,
            'func': funcs,
            'strengths': strengths,
            'coef\nstderr.': self._coef_stderr(),
            'mean pairwise\ndisentanglement': disentangles,
            'mean mutual\ninformation': mutual_info,
            'prediction\nvar.': self._pred_var(),
        }