Source code for smefit.analyze.chi2_utils

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from .latex_tools import chi2table_header, latex_packages



[docs]
class Chi2tableCalculator:
    r"""Compute the :math:`\chi^2` for each replica and produce:

        * Tables with :math:`\chi^2` for each dataset and datagroup.
        * Plot of :math:`\chi^2` for each dataset.
        * Plot of :math:`\chi^2` for each replica

    Parameters
    ----------
    data_info: pandas.DataFrame
        datasets information (references and data groups)

    """

    def __init__(self, data_info):
        self.data_info = data_info
        self.chi2_df_sm = pd.DataFrame()
        self.chi2_df_sm_grouped = pd.DataFrame()


[docs]
    @staticmethod
    def compute(datasets, smeft_predictions):
        r"""Compute the :math:`\chi^2` for each replica and dataset.

        Parameters
        ----------
            datasets: smefit.loader.DataTuple
                loaded datasets
            smeft_predictions: np.ndarray
                array with all the predictions for each replica

        Returns
        -------
        pd.DataFrame:
            :math:`\chi^2` for each dataset
        np.ndarray:
            :math:`\chi^2/n_{pts}` for each replica

        """
        chi2 = []
        chi2_sm = []
        chi2_rep = []

        diff = datasets.Commondata - np.mean(smeft_predictions, axis=0)
        covmat_diff = datasets.InvCovMat @ diff

        diff_sm = datasets.Commondata - datasets.SMTheory
        covmat_diff_sm = datasets.InvCovMat @ diff_sm

        # do the difference replica by replica and multiply by cov inverse
        diff_rep = (
            np.tile(datasets.Commondata, (smeft_predictions.shape[0], 1))
            - smeft_predictions
        )
        covmat_diff_rep = datasets.InvCovMat @ diff_rep.T

        # Compute per experiment
        cnt = 0
        for ndat_exp in datasets.NdataExp:
            chi2.append(
                np.dot(
                    diff[cnt : cnt + ndat_exp],
                    covmat_diff[cnt : cnt + ndat_exp],
                )
            )
            chi2_sm.append(
                np.dot(
                    diff_sm[cnt : cnt + ndat_exp],
                    covmat_diff_sm[cnt : cnt + ndat_exp],
                )
            )

            # Compute chi2 by replica
            # multiply the second term of the chi2 and
            # take the diagonal (replica left = replica right).
            # here np.einsum is faster than np.diag(a @ b), since
            # a and b are large usually
            chi2_rep.append(
                np.einsum(
                    "ij,ji->i",
                    diff_rep[:, cnt : cnt + ndat_exp],
                    covmat_diff_rep[cnt : cnt + ndat_exp],
                )
            )
            cnt += ndat_exp

        # compute chi2 for the whole dataset
        total_chi2_rep = np.einsum("ij,ji->i", diff_rep, covmat_diff_rep)

        return (
            pd.DataFrame(
                {
                    "ndat": datasets.NdataExp,
                    "chi2": np.array(chi2),
                    "chi2_std": np.std(chi2_rep, axis=1),
                    "chi2_sm": np.array(chi2_sm),
                },
                index=datasets.ExpNames,
            ),
            total_chi2_rep / datasets.Commondata.size,
        )



[docs]
    @staticmethod
    def add_normalized_chi2(chi2_df):
        r"""Add the normalized :math:`\chi^2` to the table.

        Parameters
        ----------
        chi2_df : pd.DataFrame
            :math:`\chi^2` table for each dataset

        Returns
        -------
        pd.DataFrame:
            :math:`\chi^2` table for each dataset with normalization
        """
        # reduced chi2
        chi2_df["chi2/ndat"] = chi2_df["chi2"] / chi2_df["ndat"]
        chi2_df["chi2_sm/ndat"] = chi2_df["chi2_sm"] / chi2_df["ndat"]
        return chi2_df


    @staticmethod
    def _add_chi2_df_colors(chi2_df):
        r"""Values higer than one std are labelled with blue.
        Values lowe  than one std are labelled with red.
        """
        chi2_upper = chi2_df["chi2"] + chi2_df["chi2_std"]
        chi2_lower = chi2_df["chi2"] - chi2_df["chi2_std"]
        chi2_df["color"] = "black"
        chi2_df.loc[chi2_df["chi2_sm"] > chi2_upper, "color"] = "blue"
        chi2_df.loc[chi2_df["chi2_sm"] < chi2_lower, "color"] = "red"
        return chi2_df


[docs]
    def group_chi2_df(self, chi2_df):
        r"""Group the :math:`\chi^2` according to the data type.

        Parameters
        ----------
        chi2_df : pd.DataFrame
            :math:`\chi^2` table for each dataset

        Returns
        -------
        pd.DataFrame:
            :math:`\chi^2` table with deviation info
        """
        chi2_df_grouped = pd.merge(
            self.data_info.reset_index(), chi2_df, left_on="level_1", right_index=True
        ).drop([0, "chi2_std"], axis=1)
        chi2_df_grouped = chi2_df_grouped.groupby("level_0").sum(numeric_only=True)
        chi2_df_grouped.index.name = "data_group"

        # add total values
        chi2_df_grouped.loc["Total"] = chi2_df_grouped.sum()
        chi2_df_grouped["chi2/ndat"] = chi2_df_grouped["chi2"] / chi2_df_grouped["ndat"]
        chi2_df_grouped["chi2_sm/ndat"] = (
            chi2_df_grouped["chi2_sm"] / chi2_df_grouped["ndat"]
        )
        return chi2_df_grouped


    def _split_table_entries_sm(self, chi2_dict, chi2_dict_group):
        """Update the chi2_df dict for all included datasets."""
        labels_to_include = ["ndat", "chi2_sm/ndat"]
        for chi2_df, chi2_df_grouped in zip(
            chi2_dict.values(), chi2_dict_group.values()
        ):
            self.chi2_df_sm = pd.concat(
                [self.chi2_df_sm, chi2_df[labels_to_include]],
                axis=0,
            )
            self.chi2_df_sm_grouped = pd.concat(
                [self.chi2_df_sm_grouped, chi2_df_grouped[labels_to_include]],
                axis=0,
            )
        # TODO: is this woking with different element in each group?
        self.chi2_df_sm = self.chi2_df_sm[
            ~self.chi2_df_sm.index.duplicated(keep="first")
        ]
        self.chi2_df_sm_grouped = self.chi2_df_sm_grouped.drop_duplicates()


[docs]
    def write(self, chi2_dict, chi2_dict_group):
        r"""Write the :math:`\chi^2` latex tables.

        Parameters
        ----------
        chi2_dict : dict
            tables computed with compute() method for each fit
        chi2_dict_group: dict
            tables obtained with group_chi2_df() method for each fit

        Returns
        -------
        list(str)
            list with the latex commands
        """
        self._split_table_entries_sm(chi2_dict, chi2_dict_group)
        L = latex_packages()
        L.extend([r"\usepackage{underscore}", r"\begin{document}"])
        L.extend(self.write_chi2_grouped(chi2_dict, chi2_dict_group))
        L.extend(["\n", "\n"])
        L.extend(self.write_chi2_summary(chi2_dict_group))
        return L



[docs]
    def write_chi2_grouped(self, chi2_dict, chi2_dict_group):
        r"""Write the :math:`\chi^2` latex tables for each data group.

        Parameters
        ----------
        chi2_dict : dict
            tables computed with compute() method per each fit

        Returns
        -------
        list(str)
            list with the latex commands
        """
        L = [
            r"$\chi^2$ table. Blue color text represents a value that is lower than the SM $\chi^2$ \
            by more than one standard deviation of the $\chi^2$ distribution.\
            Similarly, red color text represents values that are higher than the SM $\chi^2$ by more than one standard deviation.\
            In parenthesis is the total SM $\chi^2$ for the dataset included in the fit. \\"
        ]
        for group, datasets in self.data_info.groupby(level=0):
            L.extend(
                [
                    r"\begin{table}[H]",
                    r"\centering",
                    r"\begin{tabular}{|l|c|c|" + "c|" * len(chi2_dict) + "}",
                ]
            )
            L = chi2table_header(L, chi2_dict.keys())

            # loop over datasets
            for dataset, link in datasets.droplevel(0).items():
                temp = (
                    f"\\href{{{link}}}{{{dataset}}}"
                    + f" & {int(self.chi2_df_sm.loc[dataset,'ndat'])}"
                    + f" & {self.chi2_df_sm.loc[dataset,'chi2_sm/ndat'].round(3)}"
                )
                for chi2_df in chi2_dict.values():
                    temp += " & "
                    chi2_df = self._add_chi2_df_colors(chi2_df)
                    if dataset in chi2_df.index:
                        temp += f"\\textcolor{{{chi2_df.loc[dataset, 'color']}}}\
                            {{{chi2_df.loc[dataset, 'chi2/ndat']:.3f}}}"
                temp += r" \\ \hline"
                L.append(temp)

            closing_line = r"\hline Total & & "
            for chi2_df_grouped in chi2_dict_group.values():
                closing_line += " & "
                if group in chi2_df_grouped.index:
                    closing_line += f"{chi2_df_grouped.loc[group, 'chi2/ndat']:.3f} ({chi2_df_grouped.loc[group, 'chi2_sm/ndat']:.3f})"
            closing_line += r" \\ \hline"
            L.extend(
                [
                    closing_line,
                    r"\end{tabular}",
                    f"\\caption{{$\\chi^2$ table for {group} data}}",
                    r"\end{table}",
                ]
            )
        return L



[docs]
    def write_chi2_summary(self, chi2_dict_group):
        r"""Write the summary :math:`\chi^2` table for grouped data.

        Parameters
        ----------
        chi2_dict_group : dict
            tables obtained with group_chi2_df() method for each fit

        Returns
        -------
        list(str)
           list with the latex commands

        """
        L = [
            r"\begin{table}[H]",
            r"\centering",
            r"\begin{tabular}{|l|" + "c|c|" * len(chi2_dict_group) + "}",
            r"\hline",
        ]
        temp = r""
        for label in chi2_dict_group:
            temp += f"& \\multicolumn{{2}}{{c|}}{{{label}}}"
        temp += r"\\ \hline"
        L.append(temp)
        L.append(
            r"Process "
            + r" & $N_{\rm data}$ & $\chi^2/N_{\rm data}$" * len(chi2_dict_group)
            + r"\\ \hline",
        )

        for group in self.data_info.index.levels[0]:
            temp = f"{group}"
            for chi2_df_grouped in chi2_dict_group.values():
                if group in chi2_df_grouped.index:
                    temp += f" & {chi2_df_grouped.loc[group, 'ndat']} \
                        & {chi2_df_grouped.loc[group, 'chi2/ndat']:.3f} \
                            ({chi2_df_grouped.loc[group, 'chi2_sm/ndat']:.3f})"
                else:
                    temp += " & & "
            temp += r" \\ \hline"
            L.append(temp)

        temp = r" \hline Total"
        for chi2_df_grouped in chi2_dict_group.values():
            temp += f" & {chi2_df_grouped.loc['Total', 'ndat']} \
                & {chi2_df_grouped.loc['Total', 'chi2/ndat']:.3f} \
                    ({chi2_df_grouped.loc['Total', 'chi2_sm/ndat']:.3f})"
        temp += r" \\ \hline"
        L.extend(
            [
                temp,
                r"\end{tabular}",
                r"\caption{$\chi^2$ table for grouped data. In parenthesis is the total SM $\chi^2$ for the dataset included in the fit.\
                    The SM column refers to all the datasets available in the group}",
                r"\end{table}",
            ]
        )
        return L



[docs]
    def plot_exp(
        self,
        chi2_dict,
        fig_name,
        figsize=(10, 15),
    ):
        r"""Plots a bar plot of the :math:`\chi^2` values per experiment"""

        chi2_bar = pd.DataFrame()
        chi2_bar[r"${\rm SM}$"] = self.chi2_df_sm["chi2_sm/ndat"]
        for name, chi2_df in chi2_dict.items():
            chi2_bar[name] = chi2_df["chi2/ndat"]
        chi2_bar.index = [
            f"\\rm{{{name}}}".replace("_", r"\_") for name in chi2_bar.index
        ]
        chi2_bar.plot(kind="barh", width=0.7, figsize=figsize)

        plt.vlines(1, -1, chi2_bar.shape[0] * 10, ls="dashed", color="black", alpha=0.5)
        x_max = chi2_bar.max().max()
        plt.vlines(
            np.arange(2, int(x_max) + 1),
            -1,
            chi2_bar.shape[0] * 10,
            ls="dashed",
            color="grey",
            lw=0.5,
        )
        plt.tick_params(axis="x", direction="in", labelsize=15)
        plt.xlabel(r"$\chi^2$", fontsize=20)
        plt.xlim(-0.1, chi2_bar.max().max() + 0.2)
        plt.legend(loc="upper right", frameon=False, prop={"size": 11})
        plt.tight_layout()
        plt.savefig(f"{fig_name}.pdf")
        plt.savefig(f"{fig_name}.png")



[docs]
    def plot_dist(self, chi2_hist, fig_name, figsize=(7, 5)):
        r"""Plots the :math:`\chi^2` distribution."""
        plt.figure(figsize=figsize)
        ax = plt.subplot(111)

        for label, chi2_list in chi2_hist.items():
            ax.hist(
                chi2_list,
                bins="fd",
                density=True,
                edgecolor="k",
                alpha=0.3,
                label=label,
            )

        plt.tick_params(axis="x", direction="in", labelsize=15)
        plt.tick_params(axis="y", direction="in", labelsize=15)
        plt.xlabel(r"\rm $\chi^2$\ distribution", fontsize=20)
        plt.legend(loc="best", frameon=False, prop={"size": 11})
        plt.tight_layout()
        plt.savefig(f"{fig_name}.pdf")
        plt.savefig(f"{fig_name}.png")