"""
Generates the datasets needed for the analysis, starting from two ROOT files of
toy events.
"""
import traceback
import sys
import warnings
import ROOT
import uproot
import numpy as np
from utilities.utils import default_vars, default_rootpaths
from utilities.exceptions import IncorrectFractionError, IncorrectNumGenError, IncorrectIterableError
def _format_warning(msg, *args, **kwargs):
    """Render a warning as its bare message framed by two newlines.

    Every other argument handed over by the ``warnings`` machinery is
    accepted and ignored.
    """
    return f'\n{msg}\n'


# Installed module-wide: from the moment this module is imported, warnings are
# formatted through the function above.
warnings.formatwarning = _format_warning
def _exit_with_traceback(error):
    """Print the traceback of ``error`` and terminate with a failure status."""
    try:
        raise error
    except type(error):
        print(traceback.format_exc())
    # Non-zero status, so that a calling script can detect the failure.
    sys.exit(1)


def _require_paths(paths, min_len, name):
    """Abort unless ``paths`` is a list/tuple with at least ``min_len`` items."""
    if not isinstance(paths, (list, tuple)) or len(paths) < min_len:
        _exit_with_traceback(IncorrectIterableError(paths, min_len, name))


def gen_from_toy(filepaths_in=('../data/root_files/toyMC_B0PiPi.root',
                               '../data/root_files/toyMC_B0sKK.root'),
                 filepaths_out=default_rootpaths(), tree='t_M0pipi;1',
                 num_mc=0, num_data=0, fraction=0.42, vars=default_vars()):
    """
    Generates mixed signal+background datasets to be analysed, starting from two root files of
    toy events of background only and signal only processes respectively.

    Three files are written: the first ``num_mc`` events of each toy become the
    two MC samples; the events that follow are merged, shuffled and written as
    the "data" sample. Invalid arguments are reported by printing the traceback
    of the corresponding exception and exiting the program (``SystemExit``
    with status 1).

    :param filepaths_in: 2 element tuple with path to the two toys, first being background, the second being signal species. Extra elements are ignored with a warning.
    :type filepaths_in: list[str] or tuple[str]
    :param filepaths_out: Three element tuple of .root file paths. The first should indicate the root file containing the "background" species (flag=0), the second the "signal" species (flag=1), the third the data mix to be generated. Extra elements are ignored with a warning.
    :type filepaths_out: list[str] or tuple[str]
    :param tree: Name of the tree in which the desired variables are stored in the toy files (must be the same for both files). A trailing ``;1`` is stripped.
    :type tree: str
    :param num_mc: Number of events generated for each output MC file. If both ``num_mc`` and ``num_data`` are set to zero the numbers of events are derived from the sizes of the toys.
    :type num_mc: int
    :param num_data: Number of events generated for the output data file (mixed). If both ``num_mc`` and ``num_data`` are set to zero the numbers of events are derived from the sizes of the toys.
    :type num_data: int
    :param fraction: Ideal fraction of signal events in the generated mixed sample, strictly between 0 and 1. Actual fraction will be different if fraction*num_data is not an integer.
    :type fraction: double
    :param vars: List or tuple of variables to export from the toy files.
    :type vars: list[str] or tuple[str]
    """
    # Strip only a trailing cycle suffix, never a ';1' in the middle of the name
    if tree.endswith(';1'):
        tree = tree[:-len(';1')]

    # The chained comparison is False for NaN, so NaN is rejected as well
    if not isinstance(fraction, float) or not 0 < fraction < 1:
        _exit_with_traceback(IncorrectFractionError(fraction))

    _require_paths(filepaths_in, 2, 'filepaths_in')
    if len(filepaths_in) > 2:
        msg = ('***WARNING*** \nInput filepaths given are more than two. '
               'Using only the first two...\n*************\n')
        warnings.warn(msg, stacklevel=2)

    _require_paths(filepaths_out, 3, 'filepaths_out')
    if len(filepaths_out) > 3:
        msg = ('***WARNING*** \nOutput filepaths given are more than three. '
               'Using only the first three...\n*************\n')
        warnings.warn(msg, stacklevel=2)

    # Only the first two inputs are used, so only those are opened
    dataframes = [ROOT.RDataFrame(tree, filepath)
                  for filepath in filepaths_in[:2]]

    # Number of events in the dataframes
    n_evts_toymc_pi = dataframes[0].Count().GetValue()
    n_evts_toymc_k = dataframes[1].Count().GetValue()

    alpha = 0.2
    if int(num_mc) == 0 and int(num_data) == 0:
        # Automatic mode. num_mc is chosen such that
        #   num_mc * (1 + 2*alpha*w) == size of the reference toy,
        # where the reference toy is the background one (w = 1-fraction) when
        # fraction < 0.5 and the signal one (w = fraction) otherwise.
        # The data sample is then made of about alpha*num_mc events.
        # NOTE(review): the data sample uses the factor alpha while the
        # denominators use 2*alpha, so num_mc+num_pions (num_mc+num_kaons)
        # stays below the toy size; the previous comment stated
        # num_data/(2*num_mc) = alpha. Values are kept as they were - confirm
        # which one is intended.
        if fraction < 0.5:
            num_mc = n_evts_toymc_pi/(1 + (2*alpha*(1-fraction)))
        else:
            num_mc = n_evts_toymc_k/(1 + (2*alpha*fraction))
        num_pions = int(alpha*(1-fraction)*num_mc)
        num_kaons = int(alpha*fraction*num_mc)
        num_data = num_pions + num_kaons
        num_mc = int(num_mc)
    else:
        # Explicit mode: the requested numbers must fit in the toys
        num_mc = int(num_mc)
        num_pions = int((1-fraction)*num_data)
        num_kaons = int(fraction*num_data)
        if (num_pions+num_mc > n_evts_toymc_pi) or \
                (num_kaons+num_mc > n_evts_toymc_k):
            _exit_with_traceback(IncorrectNumGenError(
                num_mc, num_pions+num_kaons, n_evts_toymc_pi, n_evts_toymc_k))

    # Truncation to int may drop events, so the fraction is computed on the
    # events actually selected (and skipped if there are none).
    num_mixed = num_pions + num_kaons
    if num_mixed > 0:
        print(f'Actual fraction of signal events = {num_kaons/num_mixed}')

    # Takes the first num_mc events of the input toy files and creates
    # .root files with the chosen vars as branches
    dataframes[0].Range(num_mc).Snapshot(tree, filepaths_out[0], vars)
    dataframes[1].Range(num_mc).Snapshot(tree, filepaths_out[1], vars)

    # Takes the events following the MC ones to be used as data
    df_data_pi = dataframes[0].Range(num_mc, num_mc+num_pions)
    df_data_k = dataframes[1].Range(num_mc, num_mc+num_kaons)

    # Since data set needs to be shuffled, passing through numpy arrays.
    # Each sample is converted once, reading only the requested columns.
    columns = list(vars)
    arrays_pi = df_data_pi.AsNumpy(columns=columns)
    arrays_k = df_data_k.AsNumpy(columns=columns)
    var_array = np.stack(
        [np.concatenate((arrays_pi[var], arrays_k[var]), axis=0)
         for var in columns],
        axis=1)
    # Shuffles the rows (events), keeping the variables of each event together
    np.random.shuffle(var_array)

    # Dictionary of vars to be saved in the data outfile
    var_dictionary = {var: var_array[:, idx]
                      for idx, var in enumerate(columns)}

    # The context manager closes the output file even if show() fails
    with uproot.recreate(filepaths_out[2]) as file:
        file[tree] = var_dictionary
        file[tree].show()
if __name__ == '__main__':
    # This module has no script entry point: when executed directly it only
    # tells the user how to run the code.
    print('Running this module as main module is not supported. Feel free to '
          'add a custom main or run the package as a whole (see README.md)')