Source code for itea.inspection._ITEA_summarizer

# Author:  Guilherme Aldeia
# Contact: guilherme.aldeia@ufabc.edu.br
# Version: 1.0.3
# Last modified: 24-11-2021 by Guilherme Aldeia


"""ITEA_summarizer class.
"""


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from pylatex         import Document, Section, Command, Figure, Table
from pylatex.utils   import NoEscape, make_temp_dir, rm_temp_dir
from pylatex.package import Package

from itea.inspection import ITExpr_inspector, ITExpr_explainer, ITExpr_texifier
from matplotlib.gridspec      import GridSpecFromSubplotSpec 

from sklearn.utils.validation import check_array, check_is_fitted


class ITEA_summarizer:
    """Class to automatically generate a pdf file reporting several
    interpretability plots for the expression.
    """

    def __init__(self, *, itea):
        """Constructor method.

        Parameters
        ----------
        itea : ITEA_classifier or ITEA_regressor
            fitted instance of an ``ITEA`` class to be summarized.
        """

        self.itea = itea

        self.packages = {
            'geometry' : {
                "paperwidth"  : "16cm",
                "paperheight" : "12cm",
                "tmargin"     : "1.75cm",
                "lmargin"     : "1cm",
                "rmargin"     : "1cm",
                "bmargin"     : "1.5cm",
            },
            'fontenc'     : ['T1'],
            'babel'       : 'english',
            'datetime'    : [],
            'grffile'     : [],
            'booktabs'    : [],
            'amsfonts'    : [],
            'amssymb'     : [],
            'amsmath'     : [],
            'amsthm'      : [],
            'breqn'       : [],
            'fancyhdr'    : [],
            'indentfirst' : [],
            'float'       : [],
        }

    def _report_frontpage(self, doc, save_path):
        """Private method to create the front page of the report."""

        for k, v in self.packages.items():
            doc.preamble.append(Package(k, v))

        doc.preamble.append(Command('title', 'ITEA automatic report'))
        doc.preamble.append(
            Command('author', NoEscape(r'\textit{ITEA\_summarizer}')))
        doc.preamble.append(Command('date', NoEscape(r'\today, \currenttime')))
        doc.preamble.append(Command('pagestyle', 'fancy'))

        doc.append(NoEscape(r"""\maketitle

        \vfill

        Automatic report created by the \textit{ITEA\_summarizer} package.
        This report makes use of several methods to automatically inspect
        and explain the final expression found in the evolutionary process
        performed by the ITEA algorithm.

        \vfill

        \pagebreak"""))

    def _report_pre_execution(self, doc, save_path):
        """Creates all pages with information related to the pre-execution
        of the algorithm (such as the hyperparameters).
        """

        # Header and footer
        doc.append(NoEscape(r"""
        \lhead{Pre-execution --- ITEA automatic report}
        \chead{}
        \rhead{\today, \currenttime}
        \lfoot{}
        \cfoot{}
        \rfoot{\thepage\ | \pageref{LastPage}}"""))

        # Reporting descriptive statistics for the variables (at most 5, to
        # avoid overfull tables)
        with doc.create(Section(
            NoEscape('Descriptive statistics of the data'), numbering=False)):

            # Retrieving the feature importances and selecting at most 5
            feature_importances = self.itea.bestsol_.feature_importances_

            order = np.argsort(-np.sum(feature_importances, axis=0))

            # get the most relevant features
            selected = order[:np.minimum(5, len(order))]

            doc.append(NoEscape(f"""
            Reporting descriptive statistics for {np.minimum(5, len(order))}
            (out of a total of {len(order)}) features contained in the
            training data. The features were selected based on their
            absolute final importances."""))

            df_summary = pd.DataFrame(
                np.array(self.X_)[:, selected],
                columns=np.array(self.itea.labels)[selected]
            )

            with doc.create(Table(position='H')) as table:
                table.append(Command('centering'))
                table.append(Command('footnotesize'))
                table.append(NoEscape(
                    df_summary.describe().to_latex(escape=True)))

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Reporting the hyperparameters
        with doc.create(Section(
            NoEscape('Algorithm Hyper-parameters'), numbering=False)):

            doc.append(NoEscape(r"""
            The following hyperparameters were used to execute the
            algorithm. If the random\_state parameter was set to an integer
            value (or a numpy RandomState instance was given), then it is
            possible to repeat the exact execution by using the same
            training data and the parameters listed below."""))

            tfuncs_names = self.itea.tfuncs.keys()

            doc.append(NoEscape(
                r"{\footnotesize \begin{verbatim}" +

                # almost all hyperparameters
                '\n'.join([f" {k} : {v}"
                           for (k, v) in self.itea.get_params().items()
                           if 'funcs' not in k and k != 'labels']) +

                # displaying the transformation function keys
                f"\n tfuncs : [{', '.join([k for k in tfuncs_names])}]" +

                r"\end{verbatim} } \vfill \pagebreak"
            ))

    def _report_execution(self, doc, save_path):
        """Creates pages with information about the ITEA and the ITExpr.

        This section of the report does not include the post hoc
        explanations.
        """

        # Auxiliary function to use in the texifier
        def term_wrapper_f(i, term):
            return r'\underbrace{' + term + r'}_{\text{term ' + str(i) + '}}'

        doc.append(NoEscape(r"""
        \lhead{Execution --- ITEA automatic report}
        \chead{}
        \rhead{\today, \currenttime}
        \lfoot{}
        \cfoot{}
        \rfoot{\thepage\ | \pageref{LastPage}}
        """))

        # Convergence
        with doc.create(Section(
            NoEscape("Evolution convergence"), numbering=False)):

            doc.append(NoEscape(f"""
            The algorithm took {round(self.itea.exectime_, 3)} seconds to
            completely run. Below are the plots of the average fitness of
            the population and the fitness of the best individual for each
            generation.""" + r"\vfill"))

            fig, axs = plt.subplots(1, 1, figsize=(8, 3))

            self.plot_convergence(
                data='fitness', ax=axs, show=False
            )

            plt.tight_layout()
            plt.savefig(f"{save_path}/fitness_convergence.pdf")
            plt.close()

            with doc.create(Figure(position='H')) as figure_plot:
                figure_plot.add_image(
                    f"{save_path}/fitness_convergence.pdf",
                    width=NoEscape(r'0.8\textwidth')
                )

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Final expression descriptions
        with doc.create(Section(
            NoEscape('Best expression'), numbering=False)):

            if hasattr(self.itea, 'classes_'):
                type_itexpr = 'classifier'
            else:
                type_itexpr = 'regressor'

            doc.append(NoEscape(r"""
            The best expression corresponds to the expression with the best
            fitness in the last generation before the evolution ends. It is
            not necessarily the simplest expression or the global optimum of
            the evolution.
            """ +
            f"""The final expression is a {type_itexpr} with a fitness of
            {round(self.itea.fitness_, 5)}, and the number of IT terms is
            {self.itea.bestsol_.n_terms}. Below is a LaTeX representation of
            the expression:
            """ +
            r"\vfill {\small \begin{dmath}" +
            NoEscape("ITExpr = " + ITExpr_texifier.to_latex(
                self.itea.bestsol_,
                term_wrapper = term_wrapper_f
            )) +
            r"\end{dmath} } \vfill \pagebreak"))

        # Inspector statistics
        with doc.create(Section(
            NoEscape('Best expression metrics'), numbering=False)):

            doc.append(NoEscape(r"""On the next page, a table is reported
            containing the coefficients of the previous expression, as well
            as some metrics calculated for each term individually:
            \begin{itemize}
                \item \textbf{coef:} coefficient of each term (or
                coefficients, if the itexpr is an instance of
                ITExpr\_classifier);
                \item \textbf{coef stderr:} the standard error of the
                coefficients;
                \item \textbf{disentang.:} mean pairwise disentanglement
                between each term when compared with the others;
                \item \textbf{M.I.:} mean continuous mutual information
                between each term when compared with the others;
                \item \textbf{pred. var.:} variance of the predicted
                outcomes of each term when predicting the training data.
            \end{itemize}
            \vfill \pagebreak"""))

            statistics = pd.DataFrame(self.inspector_.terms_analysis())
            statistics = statistics.drop(columns='strengths')

            statistics.columns = ['coef', 'func', 'coef stderr',
                                  'disentang.', 'M.I.', 'pred. var.']

            statistics = statistics.set_index(
                'term ' + statistics.index.astype(str))

            with doc.create(Table(position='H')) as table:
                table.append(Command('centering'))
                table.append(Command('footnotesize'))
                table.append(NoEscape(statistics.to_latex(escape=True)))

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Partial derivatives
        with doc.create(Section(
            NoEscape('Partial derivatives'), numbering=False)):

            derivatives_latex = ITExpr_texifier.derivatives_to_latex(
                self.itea.bestsol_,
                term_wrapper = term_wrapper_f
            )

            out = r"{\footnotesize"
            for l, d in zip(self.itea.labels, derivatives_latex):
                out += (
                    r"\begin{dmath}" +
                    r"\frac{\partial}{\partial " + str(l) + "} ITExpr = " +
                    d +
                    r"\end{dmath}"
                )

            doc.append(NoEscape(out + r"} \vfill \pagebreak"))

    def _report_post_execution(self, doc, save_path, importance_methods):
        """Post hoc interpretations of the ITExpr. Several plots will be
        generated.
        """

        if importance_methods is None:
            importance_methods = 'pe'

        importance_methods = np.array([importance_methods]).flatten()

        if not set(importance_methods).issubset(set(['pe', 'ig', 'shapley'])):
            raise ValueError(
                "importance_methods must contain only values in "
                f"['pe', 'ig', 'shapley'], got {importance_methods}")

        explainer_headers = {
            'pe': r'Global importances with \textit{Average Partial Effects}',
            'ig': r'Global importances with \textit{Integrated Gradients}',
            'shapley': r'Global importances with \textit{Shapley Values}',
        }

        explainers_descriptions = {
            'pe' : r"""
            Feature importances with Average Partial Effects. This method
            attributes the importance to the $i$-th variable by calculating
            the average of the partial derivative w.r.t. $i$, evaluated for
            all data in the training set.
            \vfill""",

            'ig' : r"""
            Feature importances with Average Integrated Gradients. The idea
            is to calculate a local importance score for a feature $i$ by
            evaluating the integral of the model's gradients
            $\frac{\partial f}{\partial x_i}$ along a straight line between
            a baseline and the specific point.
            \vfill""",

            'shapley' : r"""
            Feature importances with the average approximation of the
            Shapley values. Shapley values come from coalitional game
            theory, where players contribute differently to the team; the
            Shapley value of a player represents its overall contribution
            to the result.
            \vfill"""
        }

        explainer_colors = {
            'pe'      : 'green',
            'ig'      : 'blue',
            'shapley' : 'red'
        }

        # One image for each explainer
        for importance_method in importance_methods:
            doc.append(NoEscape(r"""
            \lhead{Post-execution --- ITEA automatic report}
            \chead{}
            \rhead{\today, \currenttime}
            \lfoot{}
            \cfoot{}
            \rfoot{\thepage\ | \pageref{LastPage}}
            """))

            # Global importances for the current explainer
            with doc.create(Section(
                NoEscape(explainer_headers[importance_method]),
                numbering=False)):

                doc.append(NoEscape(
                    explainers_descriptions[importance_method]))

                fig, axs = plt.subplots(1, 1, figsize=(8, 4))

                self.explainer_.plot_feature_importances(
                    X                  = self.X_,
                    ax                 = axs,
                    importance_method  = importance_method,
                    grouping_threshold = 0.05,
                    target             = None,
                    barh_kw            = {
                        'edgecolor' : 'k',
                        'alpha'     : 0.8,
                        'facecolor' : explainer_colors[importance_method]},
                    show               = False
                )

                plt.tight_layout()
                plt.savefig(f"{save_path}/{importance_method}.pdf")
                plt.close()

                with doc.create(Figure(position='H')) as figure_plot:
                    figure_plot.add_image(
                        f"{save_path}/{importance_method}.pdf",
                        width=NoEscape(r'0.8\textwidth')
                    )

                doc.append(NoEscape(r"\vfill \pagebreak"))

        # Normalized partial effects
        with doc.create(Section(
            NoEscape(r'\textit{Normalized Partial Effects}'),
            numbering=False)):

            doc.append(NoEscape(r"""
            Feature importances with Normalized Partial Effects. To create
            this plot, first, the output interval is discretized. Then, for
            each interval, the partial effects of all samples in the
            training set whose predictions fall within that interval are
            calculated. Finally, the effects are normalized so that the
            total contribution in each interval sums to 100\%.
            \vfill"""))

            fig, axs = plt.subplots(1, 1, figsize=(8, 4))

            self.explainer_.plot_normalized_partial_effects(
                ax = axs, grouping_threshold = 0.05, show = False
            )

            plt.tight_layout()
            plt.savefig(f"{save_path}/normalized_partial_effects.pdf")
            plt.close()

            with doc.create(Figure(position='H')) as figure_plot:
                figure_plot.add_image(
                    f"{save_path}/normalized_partial_effects.pdf",
                    width=NoEscape(r'0.8\textwidth')
                )

            doc.append(NoEscape(r"\vfill \pagebreak"))

        # Partial effects at the means
        with doc.create(Section(
            NoEscape(r'\textit{Partial Effects at the Means}'),
            numbering=False
        )):
            doc.append(NoEscape(r"""
            Partial Effects plots created by fixing the co-variables at
            their means and evaluating the model's output when only one
            variable changes at a time. For simplicity, at most 5 variables
            are selected to create the plot (the 5 most important variables
            considering their Average Partial Effects).
            \vfill"""))

            fig, axs = plt.subplots(1, 1, figsize=(9, 3))

            feature_importances = self.itea.bestsol_.feature_importances_

            order = np.argsort(-np.sum(feature_importances, axis=0))

            # get the most relevant features
            selected = order[:np.minimum(5, len(order))]

            self.explainer_.plot_partial_effects_at_means(
                X=self.X_,
                features=selected,
                ax=axs,
                n_cols=5,
                num_points=100,
                share_y=True,
                show_err=True,
                show=False
            )

            plt.tight_layout()
            plt.savefig(f"{save_path}/partial_effects_at_means.pdf")
            plt.close()

            with doc.create(Figure(position='H')) as figure_plot:
                figure_plot.add_image(
                    f"{save_path}/partial_effects_at_means.pdf",
                    width=NoEscape(r'\textwidth')
                )

            doc.append(NoEscape(r"\vfill \pagebreak"))
    def fit(self, X, y):
        """Fit method to store the data used in the training of the given
        itea instance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            data used to train the itexpr model.

        y : array-like of shape (n_samples, )
            target data used to train the itexpr model.

        Returns
        -------
        self : ITEA_summarizer
        """

        X = check_array(X)

        self.X_ = X
        self.y_ = y

        self.inspector_ = ITExpr_inspector(
            itexpr=self.itea.bestsol_, tfuncs=self.itea.tfuncs
        ).fit(self.X_, self.y_)

        self.explainer_ = ITExpr_explainer(
            itexpr=self.itea.bestsol_,
            tfuncs=self.itea.tfuncs,
            tfuncs_dx=self.itea.tfuncs_dx
        ).fit(X, y)

        return self
    def plot_convergence(self,
        *,
        data     = None,
        n_cols   = 1,
        line_kw  = None,
        fill_kw  = None,
        ax       = None,
        show_err = True,
        show     = True
    ):
        """Plot of information about the ``itea`` evolutionary process.

        This function is intended to help visualize the information in the
        ``itea.convergence_`` dictionary.

        .. image:: assets/images/plot_convergence_1.png
            :align: center

        Parameters
        ----------
        data : string, list of strings, or None, default=None
            the convergence information to generate the plots. It can be a
            single string or a list with strings in
            ``['fitness', 'n_terms', 'complexity']``. If set to None, then
            the whole list of strings will be used.

        n_cols : int, default=1
            number of columns to be used when creating the plot grid if
            ax is None.

        line_kw : dict or None, default=None
            dictionary with keywords to be used when generating the plots.
            When set to None, then ``line_kw = {}``.

        fill_kw : dict or None, default=None
            dictionary with keywords to be used when generating the plots.
            When set to None, then ``fill_kw = {'alpha' : 0.15}``.

        ax : matplotlib.axes, list of matplotlib.axes, or None, default=None
            axis to generate the plot. If None is given, then a new axis is
            created. If it is a single axis, the plot will be drawn within
            the given axis. If ax is a list, then it must have the same
            number of elements as ``data``.

        show_err : bool, default=True
            boolean variable indicating if the standard error should be
            plotted.

        show : bool, default=True
            boolean value indicating if the generated plot should be
            displayed or not.

        Raises
        ------
        ValueError
            If ``ax`` or ``data`` has invalid values.
        """

        check_is_fitted(self)

        if data is None:
            data = ['fitness', 'n_terms', 'complexity']

        data = np.array([data]).flatten()

        if not set(data).issubset(['fitness', 'n_terms', 'complexity']):
            raise ValueError("data must be one string or a list containing "
                             "one or more of the following strings: "
                             "'fitness', 'n_terms', 'complexity'")

        if ax is None:
            fig, ax = plt.subplots()
        elif not isinstance(ax, plt.Axes):
            ax = np.asarray(ax, dtype=object)

            if ax.size != len(data):
                raise ValueError(
                    f"Expected ax to have {len(data)} axes, got {ax.size}. "
                    "The number of axes must be equal to the number of "
                    "values in `data` (or 1 if data is a string).")

        default_fill_kw = {'alpha' : 0.15}
        if fill_kw is None:
            fill_kw = default_fill_kw
        else:
            fill_kw = {**default_fill_kw, **fill_kw}

        default_line_kw = {}
        if line_kw is None:
            line_kw = default_line_kw
        else:
            line_kw = {**default_line_kw, **line_kw}

        # Creating subplots if ax is a single axis
        if isinstance(ax, plt.Axes):
            n_cols = min(n_cols, len(data))
            n_rows = int(np.ceil(len(data) / float(n_cols)))

            ax.set_axis_off()

            self.figure_ = ax.figure
            self.axes_   = np.empty((n_rows, n_cols), dtype=object)

            axes_ravel = self.axes_.ravel()

            gs = GridSpecFromSubplotSpec(n_rows, n_cols,
                                         subplot_spec=ax.get_subplotspec())

            for i, spec in zip(range(len(data)), gs):
                axes_ravel[i] = self.figure_.add_subplot(spec)
        else:
            if ax.ndim == 2:
                n_cols = ax.shape[1]
            else:
                n_cols = None

            self.axes_   = ax
            self.figure_ = ax.ravel()[0].figure

        best_of_generation = 'max' if self.itea._greater_is_better else 'min'

        gens = range(len(self.itea.convergence_['fitness']['mean']))

        for (axi, d) in zip(self.axes_.ravel(), data):
            axi.plot(
                gens,
                self.itea.convergence_[d]['mean'],
                label='mean',
                **line_kw
            )

            if d == 'fitness':
                # only the fitness can have either higher or lower values
                # as the best of each generation. All other data always
                # have smaller values as better values.
                axi.plot(
                    gens,
                    self.itea.convergence_[d][best_of_generation],
                    label=f"best ({best_of_generation}imum)",
                    **line_kw
                )
            else:
                axi.plot(
                    gens,
                    self.itea.convergence_[d]['min'],
                    label="best (minimum)",
                    **line_kw
                )

            if show_err:
                low_bound = [y - std for y, std in zip(
                    self.itea.convergence_[d]['mean'],
                    self.itea.convergence_[d]['std'])]

                upper_bound = [y + std for y, std in zip(
                    self.itea.convergence_[d]['mean'],
                    self.itea.convergence_[d]['std'])]

                axi.fill_between(gens, low_bound, upper_bound, **fill_kw)

            axi.set_title(d)
            axi.legend()
            axi.set_xlabel("generation")

        if show:
            plt.show()
    def autoreport(self,
        importance_methods=None,
        save_path='.',
        name_suffix='',
        use_temp_folder=True
    ):
        """Automatically generate a pdf report using the methods implemented
        in ``ITExpr_inspector``, ``ITExpr_explainer``, and
        ``ITExpr_texifier``.

        The idea is to simplify the generation of the plots and tables,
        removing from the user the need to understand and instantiate the
        classes and to call their plot functions.

        All explanations are generated with the training data, and every
        item in the report can be obtained manually by using
        ``ITExpr_inspector``, ``ITExpr_explainer``, and ``ITExpr_texifier``.

        This method makes use of the ``PyLaTeX`` package and requires a
        working LaTeX installation to run properly. The .tex file used to
        generate the pdf will also be saved in the designated path.

        You can download one example of report
        :download:`by clicking here </assets/files/Report.pdf>`.

        Parameters
        ----------
        importance_methods : string, list[string], or None, default=None
            feature importance method(s) used to generate the explanations
            in the report. Must be one of the implemented explainers
            (``['pe', 'ig', 'shapley']``) or a list containing one or more
            of them. If None, then ``'pe'`` will be used. The report will
            contain one page for each method specified here.

        save_path : string, default='.'
            path to save the pdf report. The file will be saved as
            "Report.pdf", unless a ``name_suffix`` is provided.

        name_suffix : string, default=""
            suffix to add to the name of the report.

        use_temp_folder : boolean, default=True
            specifies if a temporary folder should be used to save the
            plots during the creation of the report. If False, then the
            plots will be saved in the ``save_path``.
        """

        check_is_fitted(self)

        # Creating the doc and title page
        doc = Document('Report')

        if use_temp_folder:
            temp_path = make_temp_dir()
        else:
            temp_path = save_path

        self._report_frontpage(doc, temp_path)
        self._report_pre_execution(doc, temp_path)
        self._report_execution(doc, temp_path)
        self._report_post_execution(doc, temp_path, importance_methods)

        doc.generate_pdf(f'{save_path}/Report{name_suffix}', clean_tex=False)

        if use_temp_folder:
            rm_temp_dir()
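
The listing above covers the whole report pipeline. Below is a minimal usage sketch (not part of the module source), assuming a previously fitted ``ITEA_regressor`` or ``ITEA_classifier`` named ``reg`` and the arrays ``X`` and ``y`` used to train it; those names, the three-panel convergence layout, and the chosen importance methods are illustrative rather than part of the API.

import matplotlib.pyplot as plt

from itea.inspection import ITEA_summarizer

# `reg`, `X` and `y` are assumed to already exist, e.g.
# reg = ITEA_regressor(...).fit(X, y)
summarizer = ITEA_summarizer(itea=reg).fit(X, y)

# Convergence curves for fitness, number of terms, and complexity,
# drawn on a user-provided grid of three axes.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
summarizer.plot_convergence(ax=axs, show=False)
plt.tight_layout()
plt.show()

# Full pdf report (requires a LaTeX installation);
# saved as ./Report_example.pdf, with one explanation page per method.
summarizer.autoreport(
    importance_methods=['pe', 'shapley'],
    save_path='.',
    name_suffix='_example'
)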