Source code for smefit.core.chisquared

"""
Module for the computation of chi-squared values
"""
import numpy as np
from . import compute_theory as pr
from . import load_data as ld

# TODO: remove this dependency
# from NNPDF import CommonData


[docs]def compute_chi2(config, dataset, coeffs, labels):
    """ Compute the components for the chi2

    Here we also perform the cross-validation splitting at the level of the residuals,
    so as to prevent singular covariances matrices. 

    A mask is applied to each experiment and if the dataset has only 1 datapoint
    it is placed in the training set

    Returns the theory - exp vector and the inverse cov mat * (theory - exp)
    """

    corrected_theory = pr.make_predictions(config, dataset, coeffs, labels)

    dat = dataset.Commondata

    # If we are plotting chi2 we just want the experimental ones
    diff = dat + dataset.Noise - corrected_theory

    # The chi2 computation
    covmat_inv = np.linalg.inv(dataset.CovMat)

    # Multiply cov^-1 * diff but don't sum
    covmatdiff = np.einsum("ij,j->i", covmat_inv, diff)

    # Now we want to mask the diff and the cov mat experiment-by-experiment
    if config["cv"]:

        covmat_diff_mask = []
        diff_mask = []

        for mask in [dataset.TrainingMask, dataset.ValidationMask]:

            cov_mask = dataset.CovMat[:, mask]
            cov_mask = cov_mask[mask, :]
            cov_inv_mask = np.linalg.inv(cov_mask)

            # Get data, noise, and theory for split data
            split_cd = np.take(dat, mask)
            masked_noise = np.take(dataset.Noise, mask)
            theory_mask = np.take(corrected_theory, mask)

            # Compute theory - data for split data
            diff = split_cd + masked_noise - theory_mask

            # Take just the components we want for each of the sets
            diff_mask.append(diff)
            covmat_diff_mask.append(np.einsum("ij,j->i", cov_inv_mask, diff))

        # Return the tr and val masked arrays
        return diff_mask[0], diff_mask[1], covmat_diff_mask[0], covmat_diff_mask[1]

    # If cross-val is off, return everything with dummy lists for the validation sets
    return diff, [], covmatdiff, []


[docs]def compute_total_chi2(config, datasets, coefficients, labels):
    """Function to compute total central chi2 for all datasets
    assuming no cross-correlations. 

    Returns the chi2 for the tr/val sets (or total chi2 if cross-validation is off)
    """
    tr_diff_total = []
    val_diff_total = []
    tr_covmatdiff_total = []
    val_covmatdiff_total = []

    Ndat = []

    # Read in the results from the chi2 function
    trdiff, valdiff, trcovmatdiff, valcovmatdiff = compute_chi2(
        config, datasets, coefficients, labels
    )

    tr_diff_total.extend(trdiff)
    val_diff_total.extend(valdiff)

    tr_covmatdiff_total.extend(trcovmatdiff)
    val_covmatdiff_total.extend(valcovmatdiff)

    Ndat.append(len(datasets.Commondata))

    # If no cross-val just return the total chi2 (which is in the training set)
    # and total number of points
    if config["cv"]:
        tr_chi2_total = np.einsum("j,j->", tr_diff_total, tr_covmatdiff_total)
        val_chi2_total = np.einsum("j,j->", val_diff_total, val_covmatdiff_total)

        # Return the tr and val chi2 and number of points in each
        return (
            [tr_chi2_total, val_chi2_total],
            [len(tr_diff_total), len(val_diff_total)],
        )
    
    chi2_total = np.einsum("j,j->", tr_diff_total, tr_covmatdiff_total)
    return chi2_total, np.sum(Ndat)