Source code for smefit.analyze.chi2_utils

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from .latex_tools import chi2table_header, latex_packages


[docs] class Chi2tableCalculator: r"""Compute the :math:`\chi^2` for each replica and produce: * Tables with :math:`\chi^2` for each dataset and datagroup. * Plot of :math:`\chi^2` for each dataset. * Plot of :math:`\chi^2` for each replica Parameters ---------- data_info: pandas.DataFrame datasets information (references and data groups) """ def __init__(self, data_info): self.data_info = data_info self.chi2_df_sm = pd.DataFrame() self.chi2_df_sm_grouped = pd.DataFrame()
[docs] @staticmethod def compute(datasets, smeft_predictions): r"""Compute the :math:`\chi^2` for each replica and dataset. Parameters ---------- datasets: smefit.loader.DataTuple loaded datasets smeft_predictions: np.ndarray array with all the predictions for each replica Returns ------- pd.DataFrame: :math:`\chi^2` for each dataset np.ndarray: :math:`\chi^2/n_{pts}` for each replica """ chi2 = [] chi2_sm = [] chi2_rep = [] diff = datasets.Commondata - np.mean(smeft_predictions, axis=0) covmat_diff = datasets.InvCovMat @ diff diff_sm = datasets.Commondata - datasets.SMTheory covmat_diff_sm = datasets.InvCovMat @ diff_sm # do the difference replica by replica and multiply by cov inverse diff_rep = ( np.tile(datasets.Commondata, (smeft_predictions.shape[0], 1)) - smeft_predictions ) covmat_diff_rep = datasets.InvCovMat @ diff_rep.T # Compute per experiment cnt = 0 for ndat_exp in datasets.NdataExp: chi2.append( np.dot( diff[cnt : cnt + ndat_exp], covmat_diff[cnt : cnt + ndat_exp], ) ) chi2_sm.append( np.dot( diff_sm[cnt : cnt + ndat_exp], covmat_diff_sm[cnt : cnt + ndat_exp], ) ) # Compute chi2 by replica # multiply the second term of the chi2 and # take the diagonal (replica left = replica right). # here np.einsum is faster than np.diag(a @ b), since # a and b are large usually chi2_rep.append( np.einsum( "ij,ji->i", diff_rep[:, cnt : cnt + ndat_exp], covmat_diff_rep[cnt : cnt + ndat_exp], ) ) cnt += ndat_exp # compute chi2 for the whole dataset total_chi2_rep = np.einsum("ij,ji->i", diff_rep, covmat_diff_rep) return ( pd.DataFrame( { "ndat": datasets.NdataExp, "chi2": np.array(chi2), "chi2_std": np.std(chi2_rep, axis=1), "chi2_sm": np.array(chi2_sm), }, index=datasets.ExpNames, ), total_chi2_rep / datasets.Commondata.size, )
[docs] @staticmethod def add_normalized_chi2(chi2_df): r"""Add the normalized :math:`\chi^2` to the table. Parameters ---------- chi2_df : pd.DataFrame :math:`\chi^2` table for each dataset Returns ------- pd.DataFrame: :math:`\chi^2` table for each dataset with normalization """ # reduced chi2 chi2_df["chi2/ndat"] = chi2_df["chi2"] / chi2_df["ndat"] chi2_df["chi2_sm/ndat"] = chi2_df["chi2_sm"] / chi2_df["ndat"] return chi2_df
@staticmethod def _add_chi2_df_colors(chi2_df): r"""Values higer than one std are labelled with blue. Values lowe than one std are labelled with red. """ chi2_upper = chi2_df["chi2"] + chi2_df["chi2_std"] chi2_lower = chi2_df["chi2"] - chi2_df["chi2_std"] chi2_df["color"] = "black" chi2_df.loc[chi2_df["chi2_sm"] > chi2_upper, "color"] = "blue" chi2_df.loc[chi2_df["chi2_sm"] < chi2_lower, "color"] = "red" return chi2_df
[docs] def group_chi2_df(self, chi2_df): r"""Group the :math:`\chi^2` according to the data type. Parameters ---------- chi2_df : pd.DataFrame :math:`\chi^2` table for each dataset Returns ------- pd.DataFrame: :math:`\chi^2` table with deviation info """ chi2_df_grouped = pd.merge( self.data_info.reset_index(), chi2_df, left_on="level_1", right_index=True ).drop([0, "chi2_std"], axis=1) chi2_df_grouped = chi2_df_grouped.groupby("level_0").sum(numeric_only=True) chi2_df_grouped.index.name = "data_group" # add total values chi2_df_grouped.loc["Total"] = chi2_df_grouped.sum() chi2_df_grouped["chi2/ndat"] = chi2_df_grouped["chi2"] / chi2_df_grouped["ndat"] chi2_df_grouped["chi2_sm/ndat"] = ( chi2_df_grouped["chi2_sm"] / chi2_df_grouped["ndat"] ) return chi2_df_grouped
def _split_table_entries_sm(self, chi2_dict, chi2_dict_group): """Update the chi2_df dict for all included datasets.""" labels_to_include = ["ndat", "chi2_sm/ndat"] for chi2_df, chi2_df_grouped in zip( chi2_dict.values(), chi2_dict_group.values() ): self.chi2_df_sm = pd.concat( [self.chi2_df_sm, chi2_df[labels_to_include]], axis=0, ) self.chi2_df_sm_grouped = pd.concat( [self.chi2_df_sm_grouped, chi2_df_grouped[labels_to_include]], axis=0, ) # TODO: is this woking with different element in each group? self.chi2_df_sm = self.chi2_df_sm[ ~self.chi2_df_sm.index.duplicated(keep="first") ] self.chi2_df_sm_grouped = self.chi2_df_sm_grouped.drop_duplicates()
[docs] def write(self, chi2_dict, chi2_dict_group): r"""Write the :math:`\chi^2` latex tables. Parameters ---------- chi2_dict : dict tables computed with compute() method for each fit chi2_dict_group: dict tables obtained with group_chi2_df() method for each fit Returns ------- list(str) list with the latex commands """ self._split_table_entries_sm(chi2_dict, chi2_dict_group) L = latex_packages() L.extend([r"\usepackage{underscore}", r"\begin{document}"]) L.extend(self.write_chi2_grouped(chi2_dict, chi2_dict_group)) L.extend(["\n", "\n"]) L.extend(self.write_chi2_summary(chi2_dict_group)) return L
[docs] def write_chi2_grouped(self, chi2_dict, chi2_dict_group): r"""Write the :math:`\chi^2` latex tables for each data group. Parameters ---------- chi2_dict : dict tables computed with compute() method per each fit Returns ------- list(str) list with the latex commands """ L = [ r"$\chi^2$ table. Blue color text represents a value that is lower than the SM $\chi^2$ \ by more than one standard deviation of the $\chi^2$ distribution.\ Similarly, red color text represents values that are higher than the SM $\chi^2$ by more than one standard deviation.\ In parenthesis is the total SM $\chi^2$ for the dataset included in the fit. \\" ] for group, datasets in self.data_info.groupby(level=0): L.extend( [ r"\begin{table}[H]", r"\centering", r"\begin{tabular}{|l|c|c|" + "c|" * len(chi2_dict) + "}", ] ) L = chi2table_header(L, chi2_dict.keys()) # loop over datasets for dataset, link in datasets.droplevel(0).items(): temp = ( f"\\href{{{link}}}{{{dataset}}}" + f" & {int(self.chi2_df_sm.loc[dataset,'ndat'])}" + f" & {self.chi2_df_sm.loc[dataset,'chi2_sm/ndat'].round(3)}" ) for chi2_df in chi2_dict.values(): temp += " & " chi2_df = self._add_chi2_df_colors(chi2_df) if dataset in chi2_df.index: temp += f"\\textcolor{{{chi2_df.loc[dataset, 'color']}}}\ {{{chi2_df.loc[dataset, 'chi2/ndat']:.3f}}}" temp += r" \\ \hline" L.append(temp) closing_line = r"\hline Total & & " for chi2_df_grouped in chi2_dict_group.values(): closing_line += " & " if group in chi2_df_grouped.index: closing_line += f"{chi2_df_grouped.loc[group, 'chi2/ndat']:.3f} ({chi2_df_grouped.loc[group, 'chi2_sm/ndat']:.3f})" closing_line += r" \\ \hline" L.extend( [ closing_line, r"\end{tabular}", f"\\caption{{$\\chi^2$ table for {group} data}}", r"\end{table}", ] ) return L
[docs] def write_chi2_summary(self, chi2_dict_group): r"""Write the summary :math:`\chi^2` table for grouped data. Parameters ---------- chi2_dict_group : dict tables obtained with group_chi2_df() method for each fit Returns ------- list(str) list with the latex commands """ L = [ r"\begin{table}[H]", r"\centering", r"\begin{tabular}{|l|" + "c|c|" * len(chi2_dict_group) + "}", r"\hline", ] temp = r"" for label in chi2_dict_group: temp += f"& \\multicolumn{{2}}{{c|}}{{{label}}}" temp += r"\\ \hline" L.append(temp) L.append( r"Process " + r" & $N_{\rm data}$ & $\chi^2/N_{\rm data}$" * len(chi2_dict_group) + r"\\ \hline", ) for group in self.data_info.index.levels[0]: temp = f"{group}" for chi2_df_grouped in chi2_dict_group.values(): if group in chi2_df_grouped.index: temp += f" & {chi2_df_grouped.loc[group, 'ndat']} \ & {chi2_df_grouped.loc[group, 'chi2/ndat']:.3f} \ ({chi2_df_grouped.loc[group, 'chi2_sm/ndat']:.3f})" else: temp += " & & " temp += r" \\ \hline" L.append(temp) temp = r" \hline Total" for chi2_df_grouped in chi2_dict_group.values(): temp += f" & {chi2_df_grouped.loc['Total', 'ndat']} \ & {chi2_df_grouped.loc['Total', 'chi2/ndat']:.3f} \ ({chi2_df_grouped.loc['Total', 'chi2_sm/ndat']:.3f})" temp += r" \\ \hline" L.extend( [ temp, r"\end{tabular}", r"\caption{$\chi^2$ table for grouped data. In parenthesis is the total SM $\chi^2$ for the dataset included in the fit.\ The SM column refers to all the datasets available in the group}", r"\end{table}", ] ) return L
[docs] def plot_exp( self, chi2_dict, fig_name, figsize=(10, 15), ): r"""Plots a bar plot of the :math:`\chi^2` values per experiment""" chi2_bar = pd.DataFrame() chi2_bar[r"${\rm SM}$"] = self.chi2_df_sm["chi2_sm/ndat"] for name, chi2_df in chi2_dict.items(): chi2_bar[name] = chi2_df["chi2/ndat"] chi2_bar.index = [ f"\\rm{{{name}}}".replace("_", r"\_") for name in chi2_bar.index ] chi2_bar.plot(kind="barh", width=0.7, figsize=figsize) plt.vlines(1, -1, chi2_bar.shape[0] * 10, ls="dashed", color="black", alpha=0.5) x_max = chi2_bar.max().max() plt.vlines( np.arange(2, int(x_max) + 1), -1, chi2_bar.shape[0] * 10, ls="dashed", color="grey", lw=0.5, ) plt.tick_params(axis="x", direction="in", labelsize=15) plt.xlabel(r"$\chi^2$", fontsize=20) plt.xlim(-0.1, chi2_bar.max().max() + 0.2) plt.legend(loc="upper right", frameon=False, prop={"size": 11}) plt.tight_layout() plt.savefig(f"{fig_name}.pdf") plt.savefig(f"{fig_name}.png")
[docs] def plot_dist(self, chi2_hist, fig_name, figsize=(7, 5)): r"""Plots the :math:`\chi^2` distribution.""" plt.figure(figsize=figsize) ax = plt.subplot(111) for label, chi2_list in chi2_hist.items(): ax.hist( chi2_list, bins="fd", density=True, edgecolor="k", alpha=0.3, label=label, ) plt.tick_params(axis="x", direction="in", labelsize=15) plt.tick_params(axis="y", direction="in", labelsize=15) plt.xlabel(r"\rm $\chi^2$\ distribution", fontsize=20) plt.legend(loc="best", frameon=False, prop={"size": 11}) plt.tight_layout() plt.savefig(f"{fig_name}.pdf") plt.savefig(f"{fig_name}.png")