umami.preprocessing_tools package#

Submodules#

umami.preprocessing_tools.configuration module#

Configuration module for preprocessing.

class umami.preprocessing_tools.configuration.GeneralSettings(outfile_name: str | None = None, outfile_name_validation: str | None = None, plot_name: str | None = None, plot_type: str = 'pdf', apply_atlas_style: bool = True, use_atlas_tag: bool = True, atlas_first_tag: str = 'Simulation Internal', atlas_second_tag: str | None = None, legend_sample_category: bool = True, var_file: str | None = None, dict_file: str | None = None, compression: str | None = None, precision: str | None = None, concat_jet_tracks: bool = False, convert_to_tfrecord: dict | None = None)#

Bases: object

Class handling general preprocessing options.

apply_atlas_style: bool = True#
as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

atlas_first_tag: str = 'Simulation Internal'#
atlas_second_tag: str = None#
compression: str = None#
concat_jet_tracks: bool = False#
convert_to_tfrecord: dict = None#
dict_file: str = None#
legend_sample_category: bool = True#
outfile_name: str = None#
outfile_name_validation: str = None#
plot_name: str = None#
plot_options_as_dict()#

Return the plotting related class attributes as dict. These values are the ones which can be passed to PUMA.

Returns:

Plotting related class attributes

Return type:

dict

plot_type: str = 'pdf'#
precision: str = None#
use_atlas_tag: bool = True#
var_file: str = None#
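Example:

A minimal sketch showing how the general settings can be constructed and exported; all file names and option values below are placeholders.

    from umami.preprocessing_tools.configuration import GeneralSettings

    settings = GeneralSettings(
        outfile_name="PFlow-hybrid.h5",  # placeholder output file
        var_file="variables.yaml",       # placeholder variable config
        dict_file="scale_dict.json",     # placeholder scale dict
        precision="float16",
    )

    # All options as a plain dict
    all_options = settings.as_dict()

    # Only the plotting-related options, e.g. for forwarding to PUMA
    plot_options = settings.plot_options_as_dict()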
class umami.preprocessing_tools.configuration.Preparation(settings: dict)#

Bases: object

Class handling preprocessing options in preparation block.

get_input_files(sample_type: str)#

Provide the list of h5 input files for a given sample type.

Parameters:

sample_type (str) – Sample type, e.g. ttbar

Returns:

List of h5 input files

Return type:

list

get_sample(sample_name: str)#

Retrieve information about sample.

Parameters:

sample_name (str) – Name of sample

Returns:

sample class of specified sample

Return type:

Sample

Raises:

KeyError – if specified sample not in config file

class umami.preprocessing_tools.configuration.PreprocessConfiguration(yaml_config: str)#

Bases: Configuration

Preprocessing Configuration class.

check_resampling_options()#

Check that the n_jets* options are defined correctly for the given resampling method.

Raises:

ValueError – If the value is smaller than 1 for any method besides pdf

copy_to_out_dir(suffix: str, out_dir: str | None = None) None#

Write the current config object to a new file in the output directory of the current preprocessing job.

Parameters:
  • suffix (str) – Append this string to the copied config file name

  • out_dir (str) – Output directory to which the files are copied.

get_configuration() None#

Assign configuration from file to class variables.

get_file_name(option: str | None = None, extension: str = '.h5', custom_path: str | None = None, use_val: bool = False) str#

Get the file name for different preprocessing steps.

Parameters:
  • option (str, optional) – Option name for file, by default None

  • extension (str, optional) – File extension, by default “.h5”

  • custom_path (str, optional) – Custom path to file, by default None

  • use_val (bool, optional) – Decide whether the outfile name from the training or from the validation is used. If True, the validation file name is used. By default False.

Returns:

Path of the output file.

Return type:

str
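Example:

A hedged sketch of loading a preprocessing config and resolving file names; the config path and the option string are illustrative, as the valid option names depend on the preprocessing setup.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration

    config = PreprocessConfiguration("preprocess_config.yaml")

    # File name for a given preprocessing step (option name is illustrative)
    train_file = config.get_file_name(option="resampled", extension=".h5")

    # Same, but based on the validation outfile name
    val_file = config.get_file_name(option="resampled", use_val=True)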

class umami.preprocessing_tools.configuration.Sample(name: str | None = None, type: str | None = None, category: str | None = None, n_jets: int | None = None, cuts: dict | None = None, output_name: str | None = None)#

Bases: object

Class storing sample info.

Parameters:
  • name (str) – Name of sample

  • type (str) – Sample type

  • category (str) – Sample category, e.g. bjets

  • n_jets (int) – Number of jets to load from sample

  • cuts (dict) – Dictionary containing cuts which will be applied on sample

  • output_name (str) – Name of output file

category: str = None#
cuts: dict = None#
n_jets: int = None#
name: str = None#
output_name: str = None#
type: str = None#
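Example:

A minimal sketch of constructing a Sample by hand (normally these are built from the samples block of the config); all values are illustrative.

    from umami.preprocessing_tools.configuration import Sample

    ttbar_bjets = Sample(
        name="training_ttbar_bjets",
        type="ttbar",
        category="bjets",
        n_jets=10_000_000,
        cuts=None,  # cut definitions normally come from the config file
        output_name="bjets_training_ttbar.h5",
    )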
class umami.preprocessing_tools.configuration.Sampling(class_labels: list | None = None, method: str | None = None, options: object | None = None, use_validation_samples: bool = False)#

Bases: object

Class handling preprocessing options in sampling block.

as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

class_labels: list = None#
method: str = None#
options: object = None#
use_validation_samples: bool = False#
class umami.preprocessing_tools.configuration.SamplingOptions(sampling_variables: list | None = None, samples_training: dict | None = None, samples_validation: dict | None = None, custom_n_jets_initial: dict | None = None, fractions: dict | None = None, max_upsampling_ratio: dict | None = None, sampling_fraction: dict | None = None, n_jets: int | None = None, n_jets_validation: int | None = None, n_jets_scaling: int | None = None, save_tracks: bool = False, tracks_names: list | None = None, save_track_labels: bool = False, intermediate_index_file: str | None = None, intermediate_index_file_validation: str | None = None, weighting_target_flavour: str | None = None, bool_attach_sample_weights: bool | None = None, n_jets_to_plot: int | None = None, target_distribution: str | None = None)#

Bases: object

Class handling preprocessing options in sampling block.

as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

bool_attach_sample_weights: bool = None#
custom_n_jets_initial: dict = None#
fractions: dict = None#
intermediate_index_file: str = None#
intermediate_index_file_validation: str = None#
max_upsampling_ratio: dict = None#
n_jets: int = None#
n_jets_scaling: int = None#
n_jets_to_plot: int = None#
n_jets_validation: int = None#
samples_training: dict = None#
samples_validation: dict = None#
sampling_fraction: dict = None#
sampling_variables: list = None#
save_track_labels: bool = False#
save_tracks: bool = False#
target_distribution: str = None#
tracks_names: list = None#
weighting_target_flavour: str = None#
umami.preprocessing_tools.configuration.check_key(location, old_key: str, new_key: str) None#

Helper function to check whether a deprecated key/option is used in the config.

Parameters:
  • location (object) – location in which to check the keys

  • old_key (str) – name of old key/option

  • new_key (str) – name of new key/option

Raises:

KeyError – If deprecated keys are being used

umami.preprocessing_tools.merging module#

Helper functions to merge hdf5 (big) files

umami.preprocessing_tools.merging.add_data(source, output, data_range)#

Add content of “source” to “output” hdf5 file.

Parameters:
  • source (str, dict) – input hdf5 file path / input hdf5 file / dictionary

  • output (h5py File) – output hdf5 file

  • data_range (list) – where to save data in output arrays

umami.preprocessing_tools.merging.check_keys(data1, data2) bool#

Check if both files have the same datasets. Return True if they do, or raise a ValueError otherwise.

Parameters:
  • data1 (dict) – current data dictionary

  • data2 (dict) – data dictionary to be added

Returns:

True if everything works.

Return type:

bool

Raises:

ValueError – If the files have different datasets.

umami.preprocessing_tools.merging.check_shapes(data1, data2)#

Check if shapes of datasets are the same. Return True if both datasets have the same shapes or raise a ValueError otherwise.

Parameters:
  • data1 (dict) – current data dictionary

  • data2 (dict) – data dictionary to be added

Returns:

True if the shapes are the same.

Return type:

bool

Raises:

ValueError – If shapes are different.

umami.preprocessing_tools.merging.check_size(data) int#

Check that the number of entries is the same for all keys and return it.

Parameters:

data (dict) – Dictionary with the arrays to check.

Returns:

Number of entries

Return type:

int

Raises:

ValueError – If not all datasets have the same number of entries.

umami.preprocessing_tools.merging.create_datasets(output, source: dict, size)#

Prepare datasets for merged file based on dictionary.

Parameters:
  • output (h5py File) – output merged hdf5 file

  • source (dict) – Dict with one array to save per key, a path to an input hdf5 file, or an open input hdf5 file

  • size (int) – total number of entries per dataset

umami.preprocessing_tools.merging.get_size(filelist: list)#

Get total size of datasets; return size and ranges per file.

Parameters:

filelist (list) – List of input files

Returns:

  • total_size (list) – Size of the files.

  • ranges (list) – Ranges of the files.
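Example:

A sketch of a merge using the helpers above; the file names are placeholders, and it assumes the ranges returned by get_size can be looked up per input file (check the return value of get_size in your version).

    import h5py

    from umami.preprocessing_tools.merging import add_data, create_datasets, get_size

    input_files = ["split_1.h5", "split_2.h5"]  # placeholder input paths
    total_size, ranges = get_size(input_files)

    with h5py.File("merged.h5", "w") as output:
        # Lay out the output datasets using the first file as a template
        create_datasets(output, input_files[0], total_size)
        # Copy each input into its slot of the output arrays
        for input_file in input_files:
            # assumes ranges maps file -> [start, end]; adapt if it is a list
            add_data(input_file, output, ranges[input_file])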

umami.preprocessing_tools.preparation module#

Helper functions for creating hybrid hdf5 samples from ttbar and Zprime ntuples.

class umami.preprocessing_tools.preparation.PrepareSamples(args, config)#

Bases: object

This class prepares the samples for further processing as defined in the configuration file:

  • extracts the selected jets (applying cuts: flavour, pT etc.)

  • writes these iteratively to h5 output files

This class will take the information provided in the samples block in the preprocessing config.

get_batches_per_file(filename: str)#

Split the file into batches to avoid loading too much data at once.

Parameters:

filename (str) – name of file to be split in batches

Returns:

  • str – The filename.

  • list – List of (start, end) index tuples, one per batch.

jets_generator(files_in_batches: list) tuple#

Helper function to extract jet and track information from an h5 ntuple.

Parameters:

files_in_batches (list) – Tuples of filename and (start, end) batch indices

Yields:
  • numpy.ndarray – jets

  • numpy.ndarray – tracks if self.save_tracks is set to True

run()#

Run over Ntuples to extract jets (and potentially also tracks).
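Example:

A hedged sketch of running the preparation step; the config path is a placeholder and the Namespace field is an assumption standing in for the parsed command-line arguments.

    from argparse import Namespace

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.preparation import PrepareSamples

    config = PreprocessConfiguration("preprocess_config.yaml")
    args = Namespace(sample="training_ttbar_bjets")  # field name is an assumption

    prep = PrepareSamples(args, config)
    prep.run()  # extract the selected jets (and tracks) and write them to h5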

umami.preprocessing_tools.scaling module#

Scaling module to perform variable scaling and shifting.

class umami.preprocessing_tools.scaling.CalculateScaling(config: object)#

Bases: object

Scaling class. Calculates the scaling and shifting for the training dataset and can apply it.

get_scale_dict(input_file: str | None = None, chunk_size: int = 100000)#

Calculates the scaling, shifting and default values and saves them to json.

Parameters:
  • input_file (str, optional) – File which is used to calculate scaling/shifting, by default None

  • chunk_size (int, optional) – Number of jets which are loaded per step, by default 100_000

Raises:

ValueError – If one of the scaling/shifting values is inf
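Example:

A minimal sketch of calculating the scale dict; the config and input file paths are placeholders.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.scaling import CalculateScaling

    config = PreprocessConfiguration("preprocess_config.yaml")
    scaler = CalculateScaling(config)

    # Compute scaling/shifting/default values and write them to the
    # json scale dict configured in the preprocessing config
    scaler.get_scale_dict(input_file="resampled.h5", chunk_size=100_000)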

get_scaling(vec: ndarray, varname: str, custom_defaults_vars: dict)#

Calculates the weighted average and std for vector vec.

Parameters:
  • vec (np.ndarray) – Array with variable values for the jets

  • varname (str) – Name of the variable which is to be scaled

  • custom_defaults_vars (dict) – Dict with custom default variable values

Returns:

  • varname (str) – Name of the variable

  • average (float) – Average of the variable

  • std (float) – Std of the variable

  • default (float) – Default value of the variable

get_scaling_dict_generator(input_file: str, n_jets: int, chunk_size: int = 100000)#

Set up a generator that loads the jets in chunks and calculates the mean/std.

Parameters:
  • input_file (str) – File which is to be scaled.

  • n_jets (int) – Number of jets which are to be scaled.

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • scale_dict_trk (dict) – Dict with the scale/shift values for each variable.

  • n_jets (int) – Number of jets used for scaling/shifting.

get_scaling_tracks(data: ndarray, var_names: list, track_mask: ndarray, tracks_name: str)#

Calculate the scale dict for the tracks and return the dict.

Parameters:
  • data (np.ndarray) – Structured tracks array with shape (n_jets, n_trks, n_trk_features)

  • var_names (list) – List of variables which are to be scaled

  • track_mask (np.ndarray) – Boolean array where False denotes padded tracks, with shape (n_jets, n_trks)

  • tracks_name (str) – Name of used tracks collection

Returns:

  • scale_dict (dict) – Scale dict with scaling/shifting values for each variable

  • n_trks (int) – Number of tracks used to calculate the scaling/shifting

get_scaling_tracks_generator(input_file: str, n_jets: int, tracks_name: str, chunk_size: int = 100000)#

Set up a generator that loads the tracks in chunks and calculates the mean/std.

Parameters:
  • input_file (str) – File which is to be scaled.

  • n_jets (int) – Number of jets which are to be scaled.

  • tracks_name (str) – Name of the tracks

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • scale_dict_trk (dict) – Dict with the scale/shift values for each variable.

  • nTrks (int) – Number of tracks used for scaling/shifting.

join_scale_dicts(first_scale_dict: dict, second_scale_dict: dict, first_n: int, second_n: int)#

Combine the scale dicts of two track chunks.

Parameters:
  • first_scale_dict (dict) – First scale dict to join.

  • second_scale_dict (dict) – Second scale dict to join.

  • first_n (int) – Number for the first scale dict.

  • second_n (int) – Number for the second scale dict.

Returns:

  • combined_scale_dict (dict) – The combined scale dict.

  • combined_n (int) – The combined number of objects.

umami.preprocessing_tools.scaling.apply_scaling_jets(jets: DataFrame, variables_list: dict, scale_dict: dict) DataFrame#

Apply the jet scaling and shifting for the given jets.

Parameters:
  • jets (pd.DataFrame) – Loaded jets which are to be scaled/shifted.

  • variables_list (dict) – Train variables which will be scaled/shifted. For all variables, the scaling/shifting values must be in the scaling dict.

  • scale_dict (dict) – Loaded scaling dict with the scaling/shifting values for the variables defined in variables_list.

Returns:

Scaled/Shifted jets with the variables defined in variables_list

Return type:

pd.DataFrame

Raises:
  • ValueError – When jets is neither a pandas DataFrame nor a structured numpy ndarray

  • KeyError – When for the variable which is to be scaled no shift/scale values are available in the scale dict.

  • ValueError – If the scale parameter for the variable is either 0 or inf.

  • ValueError – If the scaled/shifted variable has infs or NaNs.

umami.preprocessing_tools.scaling.apply_scaling_trks(trks: ndarray, variable_config: dict, scale_dict: dict, tracks_name: str, save_track_labels: bool = False, track_label_variables: list | None = None)#

Apply the scaling/shifting to the tracks.

Parameters:
  • trks (np.ndarray) – Loaded tracks as numpy array.

  • variable_config (dict) – Loaded variable config.

  • scale_dict (dict) – Loaded scale dict.

  • tracks_name (str) – Name of the tracks.

  • save_track_labels (bool) – Save the track labels

  • track_label_variables (list) – List of the track label variables which are to be saved.

Returns:

  • scaled_trks (np.ndarray) – The tracks scaled and shifted.

  • valid (np.ndarray) – Bool array specifying which tracks are valid vs padding.

  • trk_labels (np.ndarray) – The track labels, if defined in the variable config.

Raises:
  • ValueError – If a value of a variable which is to be used in log form is zero/negative.

  • ValueError – If scale is found to be 0 or inf for any track variable.

  • ValueError – If the scaled/shifted variable has infs or NaNs.

umami.preprocessing_tools.scaling.as_full(data_type: dtype)#

Convert float type to full precision

Parameters:

data_type (np.dtype) – type to check for float

Returns:

The dtype as a full-precision float type if half precision was stored, otherwise unchanged.

Return type:

np.dtype

umami.preprocessing_tools.scaling.get_track_mask(tracks: ndarray) ndarray#

Return the mask for the tracks

Parameters:

tracks (np.ndarray) – Loaded tracks with shape (n_jets, n_trks, n_trk_features). Note that the input tracks should not already be converted with np.nan_to_num, as this function relies on a np.isnan check when the valid flag is not present.

Returns:

A bool array (n_jets, nTrks), True for tracks that are present.

Return type:

np.ndarray

Raises:

ValueError – If the input tracks have neither a 'valid' flag nor at least one float variable.
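Example:

A small sketch of the fallback behaviour without a 'valid' flag: NaN entries in a float variable mark padded tracks. The variable name "d0" is illustrative.

    import numpy as np

    from umami.preprocessing_tools.scaling import get_track_mask

    # Two jets with up to three tracks; NaN marks padded tracks
    tracks = np.zeros((2, 3), dtype=[("d0", "f4")])
    tracks["d0"] = [[0.1, np.nan, np.nan], [0.2, 0.3, np.nan]]

    mask = get_track_mask(tracks)
    # mask -> [[True, False, False], [True, True, False]]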

umami.preprocessing_tools.ttbar_merge module#

Helper functions for merging single leptonic and dileptonic ttbar samples.

class umami.preprocessing_tools.ttbar_merge.MergeConfig(yaml_config: str)#

Bases: Configuration

Merge config class.

class umami.preprocessing_tools.ttbar_merge.TTbarMerge(config: object)#

Bases: object

This class merges the single and dilepton ttbar samples in the required ratio to match the Run-2 MC non-all-hadronic ttbar sample.

get_indices() None#

Get indices for dilepton sample to match ratio to single lepton sample. Indices are saved to file for later use.

Raises:

ValueError – If no dilepton sample found.

get_input_files(channel: str) list#

Get input files for a given channel.

Parameters:

channel (str) – Channel to get input files for, either single or dilepton.

Returns:

List of input files.

Return type:

list

Raises:
  • ValueError – If no input files found for given channel.

  • KeyError – If the specified channel does not exist in the config.

load_jets_generator(input_file: str, chunk_size: int = 100000, indices: list | None = None, save_tracks: bool = False) ndarray#

Yield jets (and tracks) from the input file in batches, with the option to select events by index.

Parameters:
  • input_file (str) – Path to input file.

  • chunk_size (int) – Number of jets or tracks to load at a time.

  • indices (list) – Indices of jets or tracks to load.

  • save_tracks (bool) – Whether to load tracks as well as jets, by default False.

Yields:

np.ndarray – Numpy arrays of jets, and of tracks if save_tracks is True.

merge(file_range: list, index_dir: str) None#

Merge single and dilepton ttbar samples.

Parameters:
  • file_range (list) – List of output files to merge, allows splitting across multiple jobs.

  • index_dir (str) – Directory containing index file dictionaries.

Raises:

ValueError – If the file range passed via --file_range does not consist of two arguments.

write_file(index_dict: dict, output_file: str) None#

Write merged output file from passed index dictionary.

Parameters:
  • index_dict (dict) – Dictionary of indices to be used from each input file.

  • output_file (str) – Name of output file.
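Example:

A hedged sketch of the two-step merge; the config path, file range, and index directory are illustrative values only.

    from umami.preprocessing_tools.ttbar_merge import MergeConfig, TTbarMerge

    config = MergeConfig("merge_config.yaml")
    merger = TTbarMerge(config)

    # Step 1: select dilepton indices matching the single-lepton ratio
    merger.get_indices()

    # Step 2: merge a slice of the output files (exactly two values)
    merger.merge(file_range=[0, 2], index_dir="indices")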

umami.preprocessing_tools.ttbar_merge.event_indices(input_file_list: list, event_numbers: ndarray) list#

Get indices for each input file for jets from selected subsample of dilepton events.

Parameters:
  • input_file_list (list) – List of input files.

  • event_numbers (np.ndarray) – Array of event numbers.

Returns:

List of numpy arrays of indices for each input file.

Return type:

list

umami.preprocessing_tools.ttbar_merge.event_list(input_file_list: list) tuple#

Get list of unique event numbers from input files and number of jets in sample.

Parameters:

input_file_list (list) – List of input files.

Returns:

  • np.ndarray – Array of unique event numbers.

  • int – Number of jets in sample.

umami.preprocessing_tools.utils module#

Collection of utility functions for preprocessing tools.

umami.preprocessing_tools.utils.binarise_jet_labels(labels: DataFrame, internal_labels: list, column: str = 'label') ndarray#

Transform labels to binary (one-hot) labels.

Parameters:
  • labels (pd.DataFrame or np.ndarray) – Dataframe or array with the labels inside.

  • internal_labels (list) – List with the used labels.

  • column (str, optional) – Column name of the labels if pd.DataFrame is given, by default “label”

Returns:

Array containing the binary labels, with shape (len(labels), n_classes)

Return type:

np.ndarray

Raises:
  • TypeError – If given labels are neither pd.DataFrame nor np.ndarray

  • ValueError – If the given labels are empty
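Example:

A minimal sketch with made-up labels; it assumes each output column corresponds to one entry of internal_labels, in order.

    import pandas as pd

    from umami.preprocessing_tools.utils import binarise_jet_labels

    labels = pd.DataFrame({"label": [5, 4, 0, 5]})
    one_hot = binarise_jet_labels(labels, internal_labels=[0, 4, 5])
    # one_hot has shape (4, 3): one row per jet, one column per class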

umami.preprocessing_tools.utils.get_scale_dict(file_path: str, dict_key: str) list#

Load the scale dict from file, or return it directly if the given input is an already loaded scale dict.

Parameters:
  • file_path (str or dict) – Input path to the scale dict or the already loaded scale dict

  • dict_key (str) – Dict key of the tracks/jets name.

Returns:

Loaded list with the scaling variables. Each variable is a dict in the list.

Return type:

list

Raises:

ValueError – If given input is neither a string with the correct path nor the already loaded scale dict.

umami.preprocessing_tools.utils.get_variable_dict(file_path: str) dict#

Read the yaml file containing the variables and export them to a dict.

Parameters:

file_path (str or dict) – Input yaml file containing the training variables, or the already loaded variable dict

Returns:

out_dict – Dictionary containing training variables

Return type:

dict

Raises:

ValueError – If given input is neither a string with a path nor the already loaded dict.

umami.preprocessing_tools.utils.join_structured_arrays(arrays: list)#

Join a list of structured numpy arrays.

See https://github.com/umami-hep/atlas-ftag-tools/blob/main/ftag/hdf5/h5utils.py

Parameters:

arrays (list) – List of structured numpy arrays to join

Returns:

A merged structured array

Return type:

np.array
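Example:

A small self-contained sketch; the field names are illustrative.

    import numpy as np

    from umami.preprocessing_tools.utils import join_structured_arrays

    jets_kin = np.zeros(5, dtype=[("pt", "f4"), ("eta", "f4")])
    jets_tag = np.zeros(5, dtype=[("rnnip_pb", "f4")])

    jets = join_structured_arrays([jets_kin, jets_tag])
    # jets.dtype.names -> ("pt", "eta", "rnnip_pb")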

umami.preprocessing_tools.writing_train_file module#

Module handling training file writing to disk.

class umami.preprocessing_tools.writing_train_file.TrainSampleWriter(config: object, compression: str | None = None, shuffling: bool = True)#

Bases: object

Class to write training files to disk.

better_shuffling(thearray: ndarray, n_jets: int, slice_size: int = 10000) ndarray#

Shuffles the index list with fixed slices.

Parameters:
  • thearray (np.ndarray) – Input array with the values to shuffle.

  • n_jets (int) – Number of jets in the array

  • slice_size (int, optional) – How many values are shuffled at once, by default 10_000

Returns:

Shuffled input array.

Return type:

np.ndarray

calculate_weights(weights_dict: dict, jets: ndarray, labels: ndarray)#

Find the corresponding weight for each jet, using the weights calculated by the GetFlavorWeights method, and write it to jets["weight"].

Parameters:
  • weights_dict (dict) –

    weights_dict per flavor and some additional info written into a pickle file at /hybrids/flavour_weights

    • 'bjets', etc.

    • 'bins_x' : pt bins

    • 'bins_y' : eta bins

    • 'bin_indices_flat' : flattened indices of the bins in the histogram

    • 'label_map' : {0: 'ujets', 1: 'cjets', 2: 'bjets'}

  • jets (np.ndarray) – Containing values of jet variables

  • labels (np.ndarray) – Binarised truth values of the jet flavour, with shape (n_jets, n_flavours)

init_jet_datasets(n_jets: int, jets: ndarray, labels_one_hot: ndarray, additional_var_labels: list)#

Create jet datasets

Parameters:
  • n_jets (int) – total number of jets that will be written

  • jets (np.ndarray) – jet input feature array

  • labels_one_hot (np.ndarray) – jet label array, one hot

  • additional_var_labels (list) – list of additional variables to be saved

Returns:

h5 group containing jet datasets

Return type:

h5py.Group

init_track_datasets(n_jets: int, tracks: list, valid: list, labels: list)#

Create track-like datasets

Parameters:
  • n_jets (int) – total number of jets that will be written

  • tracks (list) – list of track arrays

  • valid (list) – list of track valid arrays

  • labels (list) – list of track label arrays

Returns:

list of track h5 groups

Return type:

list

load_scaled_generator(input_file: str, index: list, n_jets: int, scale_dict: dict, jet_add_labels: list | None = None, chunk_size: int = 100000)#

Set up a generator that loads the scaled file and saves it in the format for training.

Parameters:
  • input_file (str) – File which is to be scaled.

  • index (list) – List with the indices.

  • n_jets (int) – Number of jets used.

  • jet_add_labels (list, optional) – List of additional per-jet labels to include in the output file, by default None

  • scale_dict (dict) – Scale dict of the jet and track variables.

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • jets (np.ndarray) – Yielded jets

  • tracks (np.ndarray) – Yielded tracks

  • labels (np.ndarray) – Yielded labels

  • tracks_labels (np.ndarray) – Yielded track labels

  • valid (np.ndarray) – Yielded valid flag

  • flavour (np.ndarray) – Yielded flavours

  • jet_additional_labels (np.ndarray) – Yielded additional jet labels

save_chunk(load_generator, chunk_counter: int, jet_idx: int, n_jets: int, weights_dict: dict)#

Save a single chunk of ready-to-train data to file.

Parameters:
  • load_generator (Generator) – Yields data

  • chunk_counter (int) – Index of the current chunk being written

  • jet_idx (int) – Start index for the current chunk

  • n_jets (int) – Total number of jets to write

  • weights_dict (dict) – Jet weight dictionary

Returns:

Stop index for the current chunk

Return type:

int

write_train_sample(input_file: str | None = None, out_file: str | None = None, chunk_size: int = 100000) None#

Write the training file.

Parameters:
  • input_file (str, optional) – File with scaled/shifted jets. By default None, in which case the name from the config plus "resampled_scaled" is used.

  • out_file (str, optional) – Name of the output file. By default None, in which case the name from the config plus "resampled_scaled_shuffled" is used.

  • chunk_size (int, optional) – The number of jets which are loaded and written per step, by default 100_000
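Example:

A hedged sketch of writing the training file with the defaults from the config; the config path is a placeholder and "gzip" is simply one compression option h5py accepts.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.writing_train_file import TrainSampleWriter

    config = PreprocessConfiguration("preprocess_config.yaml")
    writer = TrainSampleWriter(config, compression="gzip", shuffling=True)

    # With no file names given, the defaults from the config are used
    writer.write_train_sample()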

Module contents#