# Author: Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.3
# Last modified: 24-11-2021 by Guilherme Aldeia
"""ITEA_summarizer class.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylatex import Document, Section, Command, Figure, Table
from pylatex.utils import NoEscape, make_temp_dir, rm_temp_dir
from pylatex.package import Package
from itea.inspection import ITExpr_inspector, ITExpr_explainer, ITExpr_texifier
from matplotlib.gridspec import GridSpecFromSubplotSpec
from sklearn.utils.validation import check_array, check_is_fitted
class ITEA_summarizer:
    """Class to automatically generate a pdf file reporting
    several interpretability plots for the expression.

    The summarizer wraps an already fitted ``ITEA`` instance. Calling
    ``fit`` stores the training data and builds the inspector/explainer
    helpers; ``plot_convergence`` draws the evolution history and
    ``autoreport`` compiles the complete PDF report through PyLaTeX.
    """

    def __init__(self, *, itea):
        """Constructor method.

        Parameters
        ----------
        itea : ITEA_classifier or ITEA_regressor
            fitted instance of an ``ITEA`` class to be summarized.
        """
        self.itea = itea

        # LaTeX packages (name -> options) added to the report preamble
        # by ``_report_frontpage``.
        self.packages = {
            'geometry': {
                "paperwidth": "16cm",
                "paperheight": "12cm",
                "tmargin": "1.75cm",
                "lmargin": "1cm",
                "rmargin": "1cm",
                "bmargin": "1.5cm",
            },
            'fontenc': ['T1'],
            'babel': 'english',
            'datetime': [],
            'grffile': [],
            'booktabs': [],
            'amsfonts': [],
            'amssymb': [],
            'amsmath': [],
            'amsthm': [],
            'breqn': [],
            'fancyhdr': [],
            'indentfirst': [],
            'float': [],
        }

    # ------------------------------------------------------------------
    # Private helpers shared by the report sections
    # ------------------------------------------------------------------
    @staticmethod
    def _append_page_header(doc, stage):
        """Append the fancyhdr header/footer commands for a report stage."""
        header_lines = [
            "",
            r"\lhead{" + stage + r" --- ITEA automatic report}",
            r"\chead{}",
            r"\rhead{\today, \currenttime}",
            r"\lfoot{}",
            r"\cfoot{}",
            r"\rfoot{\thepage\ | \pageref{LastPage}}",
            "",
        ]
        doc.append(NoEscape("\n".join(header_lines)))

    @staticmethod
    def _save_and_append_figure(doc, save_path, filename, width):
        """Save the current matplotlib figure and embed it in the report."""
        figure_path = f"{save_path}/{filename}"

        plt.tight_layout()
        plt.savefig(figure_path)
        plt.close()

        with doc.create(Figure(position='H')) as figure_plot:
            figure_plot.add_image(figure_path, width=NoEscape(width))

    def _rank_features(self, max_features=5):
        """Return (indexes of the top features, total number ranked).

        Features are ordered by decreasing sum (over axis 0) of the best
        solution's ``feature_importances_``.
        """
        importances = self.itea.bestsol_.feature_importances_
        order = np.argsort(-np.sum(importances, axis=0))

        return order[:max_features], len(order)

    # ------------------------------------------------------------------
    # Report sections
    # ------------------------------------------------------------------
    def _report_frontpage(self, doc, save_path):
        """Private method to create the frontpage of the report.

        Registers every package in ``self.packages`` on the preamble, sets
        title/author/date and appends the title page. ``save_path`` is
        unused here and kept for signature parity with the other sections.
        """
        for name, options in self.packages.items():
            doc.preamble.append(Package(name, options))

        doc.preamble.append(Command('title', 'ITEA automatic report'))
        doc.preamble.append(
            Command('author', NoEscape(r'\textit{ITEA\_summarizer}')))
        doc.preamble.append(Command('date', NoEscape(r'\today, \currenttime')))
        doc.preamble.append(Command('pagestyle', 'fancy'))

        doc.append(NoEscape(r"""\maketitle \vfill
            Automatic report created by \textit{ITEA\_summarizer} package.
            This report makes usage of several methods to automatically inspect
            and explain the final expression found in the evolutionary process
            performed by the ITEA algorithm.
            \vfill \pagebreak"""))

    def _report_pre_execution(self, doc, save_path):
        """Creates all pages with information related to pre-execution
        of the algorithm (such as hyperparameters).
        """
        # Header and footer
        self._append_page_header(doc, 'Pre-execution')

        # reporting descriptive statistics for the variables (maximum 5 to
        # avoid overfull tables)
        with doc.create(Section(
            NoEscape('Descriptive statistics of the data'), numbering=False)):

            # get the most relevant features (at most 5)
            selected, n_features = self._rank_features(5)

            doc.append(NoEscape(f"""
                Reporting descriptive statistics for {len(selected)}
                (from a total of {n_features}) features contained on the
                training data. The features were selected based on the absolute
                final importance."""))

            df_summary = pd.DataFrame(
                np.array(self.X_)[:, selected],
                columns=np.array(self.itea.labels)[selected]
            )

            with doc.create(Table(position='H')) as table:
                table.append(Command('centering'))
                table.append(Command('footnotesize'))
                table.append(NoEscape(
                    df_summary.describe().to_latex(escape=True)))

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # reporting the hyper parameters
        with doc.create(Section(
            NoEscape('Algorithm Hyper-parameters'), numbering=False)):

            doc.append(NoEscape(r"""
                The following hyperparameters were used to execute the
                algorithm. If the random\_state parameter was set to an
                integer value (or a numpy randomState instance was given), then
                it is possible to repeat the exact execution by using the same
                training data and the parameters listed below."""))

            # almost all hyperparameters (function dictionaries and labels
            # are left out; only the tfuncs keys are listed afterwards)
            hyperparameters = '\n'.join(
                f" {k} : {v}"
                for (k, v) in self.itea.get_params().items()
                if 'funcs' not in k and k != 'labels'
            )
            tfuncs_names = ', '.join(self.itea.tfuncs.keys())

            doc.append(NoEscape(
                r"{\footnotesize \begin{verbatim}" +
                hyperparameters +
                f"\n tfuncs : [{tfuncs_names}]" +
                r"\end{verbatim} } \vfill \pagebreak"
            ))

    def _report_execution(self, doc, save_path):
        """Creates pages with information about the ITEA and the ITExpr.
        This section of the report does not include the post hoc explanations.
        """

        # Auxiliary function to use in texifier
        def term_wrapper_f(i, term):
            return r'\underbrace{' + term + r'}_{\text{term ' + str(i) + '}}'

        self._append_page_header(doc, 'Execution')

        # Convergence
        with doc.create(Section(
            NoEscape("Evolution convergence"), numbering=False)):

            doc.append(NoEscape(f"""
                The algorithm took {round(self.itea.exectime_, 3)} seconds to
                completely run. Below are the plots for the average fitness
                of the population and the best individual fitness for each
                generation.""" +
                r"\vfill"))

            _, axs = plt.subplots(1, 1, figsize=(8, 3))
            self.plot_convergence(
                data='fitness',
                ax=axs,
                show=False
            )
            self._save_and_append_figure(
                doc, save_path, 'fitness_convergence.pdf', r'0.8\textwidth')

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Final expression descriptions
        with doc.create(Section(
            NoEscape('Best expression'), numbering=False)):

            # only classifiers expose the ``classes_`` attribute
            if hasattr(self.itea, 'classes_'):
                type_itexpr = 'classifier'
            else:
                type_itexpr = 'regressor'

            expression_latex = ITExpr_texifier.to_latex(
                self.itea.bestsol_,
                term_wrapper=term_wrapper_f
            )

            doc.append(NoEscape(r"""
                The best expression corresponds to the expression with
                the best fitness on the last generation before the evolution
                ends. Not necessarily it will be the simpliest or the global
                optimum expression of the evoution. """ +
                f"""The final expression is a {type_itexpr} with a fitness of
                {round(self.itea.fitness_, 5)}, and the number of IT terms is
                {self.itea.bestsol_.n_terms}. Below is an LaTeX representation
                of the expression:
                """ +
                r"\vfill {\small \begin{dmath}" +
                NoEscape("ITExpr = " + expression_latex) +
                r"\end{dmath} } \vfill \pagebreak"))

        # inspector statistics
        with doc.create(Section(
            NoEscape('Best expression metrics'), numbering=False)):

            # NOTE: the underscore in the class name must be escaped, as
            # this text is typeset in LaTeX text mode.
            doc.append(NoEscape(r"""On the next page is reported a table
                containing the coefficients for the previous expression, as well as
                some metrics calculated for each term individually:
                \begin{itemize}
                \item \textbf{coef:} coefficient of each term (or coefficients,
                if the itexpr is an instance of ITExpr\_classifier);
                \item \textbf{coef stderr:} the standard error of the coefficients;
                \item \textbf{disentang.:} mean pairwise disentanglement between
                each term when compared with the others;
                \item \textbf{M.I.:} mean continuous mutual information between
                each term when compared with the others;
                \item \textbf{pred. var.:} variance of the predicted outcomes for
                each term when predicting the training data.
                \end{itemize}
                \vfill \pagebreak"""))

            statistics = pd.DataFrame(self.inspector_.terms_analysis())
            statistics = statistics.drop(columns='strengths')
            statistics.columns = ['coef', 'func', 'coef stderr',
                                  'disentang.', 'M.I.', 'pred. var.']
            statistics = statistics.set_index(
                'term ' + statistics.index.astype(str))

            with doc.create(Table(position='H')) as table:
                table.append(Command('centering'))
                table.append(Command('footnotesize'))
                table.append(NoEscape(statistics.to_latex(escape=True)))

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Partial derivatives
        with doc.create(Section(
            NoEscape('Partial derivatives'), numbering=False)):

            derivatives_latex = ITExpr_texifier.derivatives_to_latex(
                self.itea.bestsol_,
                term_wrapper=term_wrapper_f
            )

            # one dmath environment per (label, derivative) pair
            equations = "".join(
                r"\begin{dmath}" +
                r"\frac{\partial}{\partial " + str(label) + "} ITExpr = " +
                derivative +
                r"\end{dmath}"
                for label, derivative in zip(
                    self.itea.labels, derivatives_latex)
            )

            doc.append(NoEscape(
                r"{\footnotesize" + equations + r"} \vfill \pagebreak"))

    def _report_post_execution(self, doc, save_path, importance_methods):
        """Post hoc interpretations of the ITExpr. Several plots will be
        generated.

        Raises
        ------
        ValueError
            If ``importance_methods`` contains anything other than
            ``'pe'``, ``'ig'`` or ``'shapley'``.
        """
        if importance_methods is None:
            importance_methods = 'pe'

        # accept a single string or any (possibly nested) list of strings;
        # plain ``str`` is used afterwards instead of numpy string scalars
        importance_methods = [
            str(m) for m in np.array([importance_methods]).flatten()]

        if not set(importance_methods).issubset({'pe', 'ig', 'shapley'}):
            raise ValueError(
                "importance_methods must be one of 'pe', 'ig', 'shapley' "
                "(or a list containing one or more of them), "
                f"got {importance_methods}")

        explainer_headers = {
            'pe': r'Global importances with \textit{Average partial Effects}',
            'ig': r'Global importances with \textit{Integrated Gradients}',
            'shapley': r'Global importances with \textit{Shapley Values}',
        }

        explainers_descriptions = {
            'pe': r"""
                Feature importances with Average Partial Effects. This method
                attributes the importance to the i-th variable by calculating
                the average of the partial derivative w.r.t. i, evaluated for
                all data in the training set.
                \vfill""",
            'ig': r"""
                Feature importance using the Average Integrated Gradients
                importances. The idea is to calculate a local
                importance score for a feature $i$ by evaluating the integral of
                the models' gradients $\frac{\partial f}{\partial x_i}$ along a
                straight line between one baseline and the specific point.
                \vfill""",
            'shapley': r"""
                Feature importance with the average approximation of the
                Shapley values. The shapley values are based on coalition game
                theory, where players contribute differently to the team. The
                Shapley value is the total contribution of the player, and
                represents the overall contribution of the player.
                \vfill"""
        }

        explainer_colors = {
            'pe': 'green',
            'ig': 'blue',
            'shapley': 'red'
        }

        # One image for each explainer
        for importance_method in importance_methods:
            self._append_page_header(doc, 'post-execution')

            with doc.create(Section(
                NoEscape(explainer_headers[importance_method]),
                numbering=False)):

                doc.append(
                    NoEscape(explainers_descriptions[importance_method]))

                _, axs = plt.subplots(1, 1, figsize=(8, 4))
                self.explainer_.plot_feature_importances(
                    X=self.X_,
                    ax=axs,
                    importance_method=importance_method,
                    grouping_threshold=0.05,
                    target=None,
                    barh_kw={
                        'edgecolor': 'k',
                        'alpha': 0.8,
                        'facecolor': explainer_colors[importance_method]},
                    show=False
                )
                self._save_and_append_figure(
                    doc, save_path, f"{importance_method}.pdf",
                    r'0.8\textwidth')

                doc.append(NoEscape(r"\vfill \pagebreak"))

        # Normalized partial effects
        with doc.create(Section(
            NoEscape(r'\textit{Normalized partial Effects}'),
            numbering=False)):

            doc.append(NoEscape(r"""
                Feature importances with Normalized Partial Effects.
                To create this plot, first, the output interval is discretized.
                Then, for each interval, the partial effect of all samples
                in the training set that results in a prediction within the
                interval are calculated. Finally, they are normalized in
                order to make the total contribution by 100\%.
                \vfill"""))

            _, axs = plt.subplots(1, 1, figsize=(8, 4))
            self.explainer_.plot_normalized_partial_effects(
                ax=axs,
                grouping_threshold=0.05,
                show=False
            )
            self._save_and_append_figure(
                doc, save_path, 'normalized_partial_effects.pdf',
                r'0.8\textwidth')

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Partial effects at the means
        with doc.create(Section(
            NoEscape(r'\textit{Partial Effects at the Means}'),
            numbering=False
        )):
            doc.append(NoEscape(r"""
                Partial Effects plots created by fixing the co-variables at
                the means and evaluating the model's output when only one
                variable changes. For simplicity, at most 5 variables are
                selected to create the plot (the 5 most important variables
                considering their Average Partial Effects).
                \vfill"""))

            _, axs = plt.subplots(1, 1, figsize=(9, 3))

            # get the most relevant features (at most 5)
            selected, _ = self._rank_features(5)

            self.explainer_.plot_partial_effects_at_means(
                X=self.X_,
                features=selected,
                ax=axs,
                n_cols=5,
                num_points=100,
                share_y=True,
                show_err=True,
                show=False
            )
            self._save_and_append_figure(
                doc, save_path, 'partial_effets_at_means.pdf', r'\textwidth')

            doc.append(NoEscape(r"\vfill \pagebreak"))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def fit(self, X, y):
        """Fit method to store the data used in the training of the given
        itea instance.

        Besides storing the data, an ``ITExpr_inspector`` and an
        ``ITExpr_explainer`` are created for the best solution of the
        given ``itea`` and fitted with the same data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            data used to train the itexpr model.

        y : array-like of shape (n_samples, )
            target data used to train the itexpr model.

        Returns
        -------
        self : ITEA_summarizer
        """
        X = check_array(X)

        self.X_ = X
        self.y_ = y

        self.inspector_ = ITExpr_inspector(
            itexpr=self.itea.bestsol_, tfuncs=self.itea.tfuncs
        ).fit(self.X_, self.y_)

        self.explainer_ = ITExpr_explainer(
            itexpr=self.itea.bestsol_, tfuncs=self.itea.tfuncs,
            tfuncs_dx=self.itea.tfuncs_dx
        ).fit(self.X_, self.y_)

        return self

    def plot_convergence(self,
        *,
        data=None,
        n_cols=1,
        line_kw=None,
        fill_kw=None,
        ax=None,
        show_err=True,
        show=True
    ):
        """Plot of information about the ``itea`` evolutionary process.
        This function is intended to help visualize the information on the
        ``itea.convergence_`` dictionary.

        .. image:: assets/images/plot_convergence_1.png
            :align: center

        Parameters
        ----------
        data : string, list of string, or None, default=None
            the convergence information to generate the plots. It can be a
            single string or a list with strings in
            ``['fitness', 'n_terms', 'complexity']``. If set to none, then
            the whole list of strings will be used.

        n_cols : int, default=1
            number of columns to be used when creating the plot grids if ax
            is None or a single axis.

        line_kw : dict or None, default=None
            dictionary with keywords to be used when generating the plots.
            When set to None, then ``line_kw= {}``.

        fill_kw : dict or None, default=None
            dictionary with keywords to be used when generating the plots.
            When set to None, then ``fill_kw= {'alpha' : 0.15}``.

        ax : matplotlib.axes or list of matplotlib.axes or None, default=None
            axis to generate the plot. If none is given, then a new axis is
            created. If is a single axis, the plot will be drawn within the
            given axis. If ax is a list, then it must have the same number of
            elements in ``data``.

        show_err : bool, default=True
            boolean variable indicating if the standard error should be
            plotted.

        show : bool, default=True
            boolean value indicating if the generated plot should be
            displayed or not.

        Raises
        ------
        ValueError
            If ``ax`` or ``data`` has invalid values.
        """
        check_is_fitted(self)

        valid_data = ['fitness', 'n_terms', 'complexity']

        if data is None:
            data = valid_data

        # a single string becomes a one-element array
        data = np.array([data]).flatten()

        if not set(data).issubset(valid_data):
            raise ValueError("Data must be one string or a list containing "
                             "one or more of the following strings: "
                             "'fitness', 'n_terms', 'complexity'")

        if ax is None:
            _, ax = plt.subplots()
        elif not isinstance(ax, plt.Axes):
            ax = np.asarray(ax, dtype=object)
            if ax.size != len(data):
                raise ValueError(
                    f"Expected ax to have {len(data)} axes, got {ax.size}. "
                    "The number of axes must be equal to the number of "
                    "values in `data` (or 1 if data is a string).")

        # user supplied keywords take precedence over the defaults
        fill_kw = {'alpha': 0.15, **(fill_kw or {})}
        line_kw = dict(line_kw or {})

        if isinstance(ax, plt.Axes):
            # Creating subplots inside the area of the single given axis
            n_cols = min(n_cols, len(data))
            n_rows = int(np.ceil(len(data) / float(n_cols)))

            ax.set_axis_off()

            self.figure_ = ax.figure
            self.axes_ = np.empty((n_rows, n_cols), dtype=object)

            axes_ravel = self.axes_.ravel()
            gs = GridSpecFromSubplotSpec(n_rows, n_cols,
                                         subplot_spec=ax.get_subplotspec())

            for i, spec in zip(range(len(data)), gs):
                axes_ravel[i] = self.figure_.add_subplot(spec)
        else:
            # the caller provided one axis for each requested data
            self.axes_ = ax
            self.figure_ = ax.ravel()[0].figure

        best_of_generation = 'max' if self.itea._greater_is_better else 'min'

        gens = range(len(self.itea.convergence_['fitness']['mean']))

        for (axi, d) in zip(self.axes_.ravel(), data):
            history = self.itea.convergence_[d]

            axi.plot(gens, history['mean'], label='mean', **line_kw)

            if d == 'fitness':
                # only the fitness can have either higher or lower values
                # as the best for each generation. All other data always
                # have smaller values as better values.
                axi.plot(
                    gens,
                    history[best_of_generation],
                    label=f"best ({best_of_generation}imum)",
                    **line_kw
                )
            else:
                axi.plot(
                    gens,
                    history['min'],
                    label="best (minimum)",
                    **line_kw
                )

            if show_err:
                # shaded band of one standard deviation around the mean
                mean = np.asarray(history['mean'])
                std = np.asarray(history['std'])

                axi.fill_between(gens, mean - std, mean + std, **fill_kw)

            axi.set_title(d)
            axi.legend()
            axi.set_xlabel("generation")

        if show:
            plt.show()

    def autoreport(self,
        importance_methods=None, save_path='.', name_suffix='',
        use_temp_folder=True):
        """automatically generate a pdf using the methods implemented in
        ``ITExpr_inspector``, ``ITExpr_explainer``, and ``ITExpr_texifier``.

        The idea is to simplify the generation of the plots and tables,
        removing from the user the need to understand, instantiate the classes
        and call the plots functions.

        All explanations are generated with the training data, and every
        item in the report can be obtained manually by using the
        ``ITExpr_inspector``, ``ITExpr_explainer``, and ``ITExpr_texifier``.

        This method makes usage of the ``PyLaTeX`` package and requires a
        visible latex installation to work properly.

        The .tex file used to generate the pdf will also be saved on the
        designed path.

        You can download one example of report
        :download:`by clicking here </assets/files/Report.pdf>`.

        Parameters
        ----------
        importance_methods : string or list[strings] or None, default=None
            Feature importance method(s) used to generate explanations in
            the report. Must be one of the possible explainers implemented
            (``['pe', 'ig', 'shapley']``) or a list containing one or more
            of the explainers. If None, then ``'pe'`` will be used. The report
            will contain one page for each method specified here.

        save_path : string, default='.'
            path to save the pdf report. The file will be saved as
            "Report.pdf", unless a ``name_suffix`` is provided. The .tex
            file is saved on the same path.

        name_suffix : string, default=""
            suffix to add in the name of the report.

        use_temp_folder : boolean, default=True
            specifies if a temporary folder should be used to save the plots
            during the creation of the report. If false, then the plots will
            be saved on the ``save_path``.

        Raises
        ------
        ValueError
            If ``importance_methods`` contains an unknown method.
        """
        check_is_fitted(self)

        # Creating the doc and title page
        doc = Document('Report')

        temp_path = make_temp_dir() if use_temp_folder else save_path

        try:
            self._report_frontpage(doc, temp_path)
            self._report_pre_execution(doc, temp_path)
            self._report_execution(doc, temp_path)
            self._report_post_execution(doc, temp_path, importance_methods)

            doc.generate_pdf(
                f'{save_path}/Report{name_suffix}', clean_tex=False)
        finally:
            # the temporary folder must not outlive the call, even when a
            # section or the latex compilation fails
            if use_temp_folder:
                rm_temp_dir()