Source code for utilities.utils

"""
Module containing general-use functions
"""
import traceback
import os
import sys
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from utilities.exceptions import IncorrectEfficiencyError, \
                                 IncorrectIterableError, IncoherentRocPlotError

warnings.formatwarning = lambda msg, *args, **kwargs: f'\n{msg}\n'


[docs]def default_rootpaths():
    """
    Returns the default root file paths of the package, where background is
    pions and signal is kaons.

    :return: Three element tuple containing the paths of the pion MC, the kaon MC and the mixed data root files, respectively.
    :rtype: tuple[str]

    """
    current_path = os.path.dirname(__file__)
    rel_path = '../data/root_files'
    rootnames = ['B0PiPi_MC.root', 'B0sKK_MC.root', 'Bhh_data.root']
    rootpaths = tuple([os.path.join(current_path, rel_path, file)
                       for file in rootnames])
    return rootpaths


[docs]def default_txtpaths():
    """
    Returns the .txt file paths containing the training MC array and the data
    array, to be used in DNN or DTC analyses.

    :return: Tuple containing the paths of the MC training array (50/50 signal/background for unbiased training) and the path of the data array, respectively.
    :rtype: tuple[str]

    """
    current_path = os.path.dirname(__file__)
    rel_path = '../data/txt'
    txtnames = ['train_array.txt', 'data_array.txt']
    txtpaths = tuple([os.path.join(current_path, rel_path, file)
                      for file in txtnames])
    return txtpaths


[docs]def default_vars():
    """
    Returns default variables used by the package in the pi-K analysis.

    :return: 13 element tuple containing the names of the default variables to use.
    :rtype: tuple[str]

    """
    return ('M0_Mpipi', 'M0_MKK', 'M0_MKpi', 'M0_MpiK', 'M0_p', 'M0_pt',
            'M0_eta', 'h1_thetaC0', 'h1_thetaC1', 'h1_thetaC2',
            'h2_thetaC0', 'h2_thetaC1', 'h2_thetaC2')


[docs]def default_figpath(figname, dir='fig', figtype='pdf'):
    """
    Returns the path to the figure folder with respect to the cwd in which to
    then save figures.

    :param figname: Name with which to save the figure.
    :type figname: str
    :param dir: Directory relative to cwd where to save the figure.
    :type dir: str
    :param figtype: Type of figure save file.
    :type figtype: str
    :return: Path where to save the figure.
    :rtype: str

    """
    wd_path = os.getcwd()
    figpath = os.path.join(wd_path, dir, figname+'.'+figtype)
    return figpath


[docs]def default_resultsdir(dir='outputs-PiKclassifier'):
    """
    Returns the path where to store the outputs of the package.

    :param dir: Directory where to save the outputs.
    :type dir: str
    :return: The figure path.
    :rtype: str

    """
    if os.path.exists(os.getcwd()+'/'+dir):
        pass
    else:
        os.mkdir(dir)
    figpath = os.path.join(os.getcwd(), dir)
    return figpath


[docs]def find_cut(pi_array, k_array, efficiency,
             specificity_mode=False, inverse_mode=False):
    """
    Finds where to cut a certain variable to obtain a certain
    sensitivity/specificity in a hypothesis test between two given species' arrays.

    :param pi_array: Array containing the background species.
    :type pi_array: numpy.array[float]
    :param k_array: Array containing the signal species.
    :type k_array: numpy.array[float]
    :param efficiency: Sensitivity required from the test (specificity if ``specificity_mode = True``).
    :type efficiency: float
    :param specificity_mode: If set to ``True`` the efficiency given is taken to be the intended specificity.
    :type specificity_mode: bool
    :param inverse_mode: Set to ``True`` if the signal events tend to have lower values.
    :type inverse_mode: bool
    :return: Two element tuple containing cut value and misidentification probability for the negative species (or sensitivity if ``specificity_mode = True``)
    :rtype: tuple[double]
"""
    

    if inverse_mode:
        efficiency = 1 - efficiency
        cut = - np.sort(-k_array)[int(efficiency*(len(k_array)-1))] \
            if not specificity_mode else np.sort(pi_array)[int(efficiency*(len(k_array)-1))]
        misid = (pi_array < cut).sum()/pi_array.size \
            if not specificity_mode else (k_array < cut).sum()/k_array.size
    else:
        cut = - np.sort(-k_array)[int(efficiency*(len(k_array)-1))] \
            if not specificity_mode else np.sort(pi_array)[int(efficiency*(len(k_array)-1))]
        misid = (pi_array > cut).sum()/pi_array.size \
            if not specificity_mode else (k_array > cut).sum()/k_array.size

    return cut, misid


[docs]def plot_rocs(rocx_arrays, rocy_arrays, roc_labels, roc_linestyles, roc_colors,
              x_pnts=(), y_pnts=(), point_labels=(''), eff=0,
              figtitle='ROC', figname=''):
    """
    Draws superimposed roc curves and/or points

    :param rocx_arrays: List or tuple of numpy arrays, each containing the respective x points of different roc curves to be plotted.
    :type rocx_arrays: list[numpy.array[float]] or tuple[numpy.array[float]]
    :param rocy_arrays: List or tuple of numpy arrays, each containing the respective y points of different roc curves to be plotted.
    :type rocy_arrays: list[numpy.array[float]] or tuple[numpy.array[float]]
    :param roc_labels: Names of the respective species whose roc coordinates were given.
    :type roc_labels: list[str] or tuple[str]
    :param roc_linestyles: Linestyles of the respective species whose roc coordinates were given.
    :type roc_linestyles: list[str] or tuple[str]
    :param roc_colors: Colors of the respective species whose roc coordinates were given.
    :type roc_colors: list[str] or tuple[str]
    :param x_pnts: List or tuple of the respective x coordinates of points to be plotted.
    :type x_pnts: list[double] or tuple[double]
    :param y_pnts: List or tuple of the respective y coordinates of points to be plotted.
    :type y_pnts: list[double] or tuple[double]
    :param point_labels: List or tuple of names of the respective species whose point coordinates were given.
    :type point_labels: list[str] or tuple[str]
    :param eff: If different than 0., draws a green dashed line at y = eff on the plot.
    :type eff: double
    :param figtitle: Title to be given to the figure.
    :type figtitle: str
    :param figname: If different than '', saves the figure as a pdf with name figname.
    :type figname: str

    """
    plt.figure(figtitle)
    plt.title(figtitle)
    plt.xlabel('False Positive Probability')
    plt.xlim(0, 1)
    plt.ylabel('True Positive Probability')
    plt.ylim(0, 1)

    try:
        if (type(rocx_arrays) == list or type(rocx_arrays) == tuple) is not True:
            raise IncorrectIterableError(rocx_arrays, 3, 'rocx_arrays')
    except IncorrectIterableError:
        print(traceback.format_exc())
        sys.exit()
    try:
        if (type(rocy_arrays) == list or type(rocy_arrays) == tuple) is not True:
            raise IncorrectIterableError(rocy_arrays, 3, 'rocy_arrays')
    except IncorrectIterableError:
        print(traceback.format_exc())
        sys.exit()
    try:
        if (type(roc_labels) == list or type(roc_labels) == tuple) is not True:
            raise IncorrectIterableError(roc_labels, 3, 'roc_labels')
    except IncorrectIterableError:
        print(traceback.format_exc())
        sys.exit()
    try:
        if (type(roc_linestyles) == list or type(roc_linestyles) == tuple) is not True:
            raise IncorrectIterableError(roc_linestyles, 3, 'roc_linestyles')
    except IncorrectIterableError:
        print(traceback.format_exc())
        sys.exit()
    try:
        if (type(roc_colors) == list or type(roc_colors) == tuple) is not True:
            raise IncorrectIterableError(roc_colors, 3, 'roc_colors')
    except IncorrectIterableError:
        print(traceback.format_exc())
        sys.exit()

    # Check if all the lists/tuples have same lengths
    try:
        if len(set([len(i) for i in [rocx_arrays, rocy_arrays, roc_labels, roc_linestyles, roc_colors]])) != 1:
            raise IncoherentRocPlotError
    except IncoherentRocPlotError:
        print(traceback.format_exc())
        sys.exit()

    for idx in range(len(rocx_arrays)):
        plt.plot(rocx_arrays[idx], rocy_arrays[idx], label=roc_labels[idx],
                 color=roc_colors[idx], linestyle=roc_linestyles[idx])
    for idx in range(len(x_pnts)):
        plt.plot((x_pnts[idx]), (y_pnts[idx]),
                 label=point_labels[idx], marker='o')

    if eff != 0:
        plt.axhline(y=eff, color='green', linestyle='--',
                    label='Efficiency chosen at ' + str(eff))
    plt.axline((0, 0), (1, 1), linestyle='--', label='AUC = 0.5')
    plt.legend()
    plt.draw()
    if figname == '':
        plt.savefig(default_figpath(figtitle))
    else:
        plt.savefig(figname)


[docs]def roc(pi_array, k_array, inverse_mode=False, makefig=False, eff=0, name="ROC"):
    """
    Returns the roc curve's x and y coordinates given two arrays of values for
    two different species. optionally draws the roc curve using plot_rocs().

    :param pi_array: Array containing the "negative" species.
    :type pi_array: numpy.array[float]
    :param k_array: Array containing the "positive" species.
    :type k_array: numpy.array[float]
    :param inverse_mode: To activate if the "positive" events tend to have lower values
    :type inverse_mode: bool
    :param makefig: If set to True draws the roc curve
    :type makefig: bool
    :param eff: If different than 0. and makefig = True , draws a green dashed line at y = eff on the plot.
    :type eff: double
    :param name: If makefig = True , name of the saved figure.
    :type name: str
    :return: Three element tuple containing: numpy array of floats of x coordinates of the roc curve, numpy array of floats of y coordinates of the roc curve, AUC of the ROC curve.
    :rtype: tuple[numpy.array[float], numpy.array[float], float]

    """
    true_array = np.concatenate(
        (np.zeros(pi_array.size), np.ones(k_array.size)))
    y_array = np.concatenate((pi_array, k_array))
    rocx, rocy, _ = metrics.roc_curve(true_array, y_array)
    auc = metrics.roc_auc_score(true_array, y_array)

    # need to invert the roc to make sense when in inverse mode
    if inverse_mode:
        rocx, rocy = np.ones(rocx.size)-rocx, np.ones(rocy.size)-rocy
        auc = 1 - auc

    if makefig:
        plot_rocs((rocx,), (rocy,), ("ROC",), ("-",), ("blue",), eff=eff,
                  figname=name)

    return rocx, rocy, auc


[docs]def stat_error(fraction, data_size, eff, misid):
    """
    Evaluates the statistical error on fraction estimate due to the finite
    sample of the data set, using the variance of sum of two binomials (of
    signal and background events respectively).

    :param fraction: Estimated fraction by the algorithm.
    :type fraction: float
    :param data_size: Size of the data set.
    :type template_sizes: int
    :param eff: Estimated efficiency of the algorithm.
    :type eff: float
    :param misid: Estimated misidentification probability (false positive) of the algorithm.
    :type misid: float
    :return: The statistical error associated to the fraction.
    :rtype: float

    """

    d_Nk = data_size*fraction*eff*(1-eff)
    d_Npi = data_size*(1-fraction)*misid*(1-misid)

    d_frac = np.sqrt(d_Nk+d_Npi)/(data_size*(eff-misid))

    return d_frac


[docs]def syst_error(fraction, template_sizes, eff, misid):
    """
    Evaluates the systematic error on fraction estimate due to the finite
    sample used to evaluate the "efficiency" and "misid" parameters.

    :param fraction: Estimated fraction by the algorithm.
    :type fraction: float
    :param template_sizes: Two element list or tuple of sizes of the evaluation arrays (background and signal dataset, in this order).
    :type template_sizes: list[int] tuple[int]
    :param eff: Estimated efficiency of the algorithm.
    :type eff: float
    :param misid: Estimated misidentification probability (false positive) of the algorithm
    :type misid: float
    :return: The systematic error associated to the fraction.
    :rtype: float

    """
    d_eff = np.sqrt(eff*(1-eff)/template_sizes[1])
    d_misid = np.sqrt(misid*(1-misid)/template_sizes[0])

    d_frac = np.sqrt((d_misid*(1-fraction))**2
                     + (d_eff*fraction)**2)/(eff-misid)

    return d_frac


if __name__ == '__main__':
    print('Running this module as main module is not supported. Feel free to \
add a custom main or run the package as a whole (see README.md)')
Source code for utilities.utils

cmepda-PiKclassifier

Navigation

Related Topics