umami.preprocessing_tools package#

Submodules#

umami.preprocessing_tools.configuration module#

Configuration module for preprocessing.

class umami.preprocessing_tools.configuration.GeneralSettings(outfile_name: str | None = None, outfile_name_validation: str | None = None, plot_name: str | None = None, plot_type: str = 'pdf', apply_atlas_style: bool = True, use_atlas_tag: bool = True, atlas_first_tag: str = 'Simulation Internal', atlas_second_tag: str | None = None, legend_sample_category: bool = True, var_file: str | None = None, dict_file: str | None = None, compression: str | None = None, precision: str | None = None, concat_jet_tracks: bool = False, convert_to_tfrecord: dict | None = None)#

Bases: object

Class handling general preprocessing options.

apply_atlas_style: bool = True#
as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

atlas_first_tag: str = 'Simulation Internal'#
atlas_second_tag: str = None#
compression: str = None#
concat_jet_tracks: bool = False#
convert_to_tfrecord: dict = None#
dict_file: str = None#
legend_sample_category: bool = True#
outfile_name: str = None#
outfile_name_validation: str = None#
plot_name: str = None#
plot_options_as_dict()#

Return the plotting related class attributes as dict. These values are the ones which can be passed to PUMA.

Returns:

Plotting related class attributes

Return type:

dict

plot_type: str = 'pdf'#
precision: str = None#
use_atlas_tag: bool = True#
var_file: str = None#
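Example:

A minimal sketch showing how the general settings can be constructed and exported; all file names and option values below are placeholders.

    from umami.preprocessing_tools.configuration import GeneralSettings

    settings = GeneralSettings(
        outfile_name="PFlow-hybrid.h5",  # placeholder output file
        var_file="variables.yaml",       # placeholder variable config
        dict_file="scale_dict.json",     # placeholder scale dict
        precision="float16",
    )

    # All options as a plain dict
    all_options = settings.as_dict()

    # Only the plotting-related options, e.g. for forwarding to PUMA
    plot_options = settings.plot_options_as_dict()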
class umami.preprocessing_tools.configuration.Preparation(settings: dict)#

Bases: object

Class handling preprocessing options in preparation block.

get_input_files(sample_type: str)#

Provide the list of h5 input files for a given sample type.

Parameters:

sample_type (str) – Sample type, e.g. ttbar

Returns:

List of h5 input files

Return type:

list

get_sample(sample_name: str)#

Retrieve information about sample.

Parameters:

sample_name (str) – Name of sample

Returns:

sample class of specified sample

Return type:

Sample

Raises:

KeyError – if specified sample not in config file

class umami.preprocessing_tools.configuration.PreprocessConfiguration(yaml_config: str)#

Bases: Configuration

Preprocessing Configuration class.

check_resampling_options()#

Check that the n_jets* options are defined correctly for the given resampling method.

Raises:

ValueError – If the value is smaller than 1 for any method besides pdf

copy_to_out_dir(suffix: str, out_dir: str | None = None) None#

Write the current config object to a new file in the output directory of the current preprocessing job.

Parameters:
  • suffix (str) – Append this string to the copied config file name

  • out_dir (str) – Output directory to which the files are copied.

get_configuration() None#

Assign configuration from file to class variables.

get_file_name(option: str | None = None, extension: str = '.h5', custom_path: str | None = None, use_val: bool = False) str#

Get the file name for different preprocessing steps.

Parameters:
  • option (str, optional) – Option name for file, by default None

  • extension (str, optional) – File extension, by default “.h5”

  • custom_path (str, optional) – Custom path to file, by default None

  • use_val (bool, optional) – Decide whether the outfile name from the training or from the validation is used. If True, the validation file name is used. By default False.

Returns:

Path of the output file.

Return type:

str
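Example:

A hedged sketch of loading a preprocessing config and resolving file names; the config path and the option string are illustrative, as the valid option names depend on the preprocessing setup.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration

    config = PreprocessConfiguration("preprocess_config.yaml")

    # File name for a given preprocessing step (option name is illustrative)
    train_file = config.get_file_name(option="resampled", extension=".h5")

    # Same, but based on the validation outfile name
    val_file = config.get_file_name(option="resampled", use_val=True)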

class umami.preprocessing_tools.configuration.Sample(name: str | None = None, type: str | None = None, category: str | None = None, n_jets: int | None = None, cuts: dict | None = None, output_name: str | None = None)#

Bases: object

Class storing sample info.

Parameters:
  • name (str) – Name of sample

  • type (str) – Sample type

  • category (str) – Sample category, e.g. bjets

  • n_jets (int) – Number of jets to load from sample

  • cuts (dict) – Dictionary containing cuts which will be applied on sample

  • output_name (str) – Name of output file

category: str = None#
cuts: dict = None#
n_jets: int = None#
name: str = None#
output_name: str = None#
type: str = None#
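Example:

A minimal sketch of constructing a Sample by hand (normally these are built from the samples block of the config); all values are illustrative.

    from umami.preprocessing_tools.configuration import Sample

    ttbar_bjets = Sample(
        name="training_ttbar_bjets",
        type="ttbar",
        category="bjets",
        n_jets=10_000_000,
        cuts=None,  # cut definitions normally come from the config file
        output_name="bjets_training_ttbar.h5",
    )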
class umami.preprocessing_tools.configuration.Sampling(class_labels: list | None = None, method: str | None = None, options: object | None = None, use_validation_samples: bool = False)#

Bases: object

Class handling preprocessing options in sampling block.

as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

class_labels: list = None#
method: str = None#
options: object = None#
use_validation_samples: bool = False#
class umami.preprocessing_tools.configuration.SamplingOptions(sampling_variables: list | None = None, samples_training: dict | None = None, samples_validation: dict | None = None, custom_n_jets_initial: dict | None = None, fractions: dict | None = None, max_upsampling_ratio: dict | None = None, sampling_fraction: dict | None = None, n_jets: int | None = None, n_jets_validation: int | None = None, n_jets_scaling: int | None = None, save_tracks: bool = False, tracks_names: list | None = None, save_track_labels: bool = False, intermediate_index_file: str | None = None, intermediate_index_file_validation: str | None = None, weighting_target_flavour: str | None = None, bool_attach_sample_weights: bool | None = None, n_jets_to_plot: int | None = None, target_distribution: str | None = None)#

Bases: object

Class handling preprocessing options in sampling block.

as_dict()#

Return the class attributes as dict

Returns:

Class attributes as dict

Return type:

dict

bool_attach_sample_weights: bool = None#
custom_n_jets_initial: dict = None#
fractions: dict = None#
intermediate_index_file: str = None#
intermediate_index_file_validation: str = None#
max_upsampling_ratio: dict = None#
n_jets: int = None#
n_jets_scaling: int = None#
n_jets_to_plot: int = None#
n_jets_validation: int = None#
samples_training: dict = None#
samples_validation: dict = None#
sampling_fraction: dict = None#
sampling_variables: list = None#
save_track_labels: bool = False#
save_tracks: bool = False#
target_distribution: str = None#
tracks_names: list = None#
weighting_target_flavour: str = None#
umami.preprocessing_tools.configuration.check_key(location, old_key: str, new_key: str) None#

Helper function to check whether a deprecated key/option is used in the config.

Parameters:
  • location (object) – location in which to check the keys

  • old_key (str) – name of old key/option

  • new_key (str) – name of new key/option

Raises:

KeyError – If deprecated keys are being used

umami.preprocessing_tools.merging module#

Helper functions to merge hdf5 (big) files

umami.preprocessing_tools.merging.add_data(source, output, data_range)#

Add content of “source” to “output” hdf5 file.

Parameters:
  • source (str, dict) – input hdf5 file path / input hdf5 file / dictionary

  • output (h5py File) – output hdf5 file

  • data_range (list) – where to save data in output arrays

umami.preprocessing_tools.merging.check_keys(data1, data2) bool#

Check if both files have the same datasets. Return True if they do, or raise a ValueError otherwise.

Parameters:
  • data1 (dict) – current data dictionary

  • data2 (dict) – data dictionary to be added

Returns:

True if everything works.

Return type:

bool

Raises:

ValueError – If the files have different datasets.

umami.preprocessing_tools.merging.check_shapes(data1, data2)#

Check if shapes of datasets are the same. Return True if both datasets have the same shapes or raise a ValueError otherwise.

Parameters:
  • data1 (dict) – current data dictionary

  • data2 (dict) – data dictionary to be added

Returns:

True if the shapes are the same.

Return type:

bool

Raises:

ValueError – If shapes are different.

umami.preprocessing_tools.merging.check_size(data) int#

Check that the number of entries is the same for all keys and return it.

Parameters:

data (dict) – Dictionary with the arrays to check.

Returns:

Number of entries

Return type:

int

Raises:

ValueError – If not all datasets have the same number of entries.

umami.preprocessing_tools.merging.create_datasets(output, source: dict, size)#

Prepare datasets for merged file based on dictionary.

Parameters:
  • output (h5py File) – output merged hdf5 file

  • source (dict) – Dict with one array to save per key, a path to an input hdf5 file, or an open input hdf5 file

  • size (int) – total number of entries per dataset

umami.preprocessing_tools.merging.get_size(filelist: list)#

Get total size of datasets; return size and ranges per file.

Parameters:

filelist (list) – List of input files

Returns:

  • total_size (list) – Size of the files.

  • ranges (list) – Ranges of the files.
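Example:

A sketch of a merge using the helpers above; the file names are placeholders, and it assumes the ranges returned by get_size can be looked up per input file (check the return value of get_size in your version).

    import h5py

    from umami.preprocessing_tools.merging import add_data, create_datasets, get_size

    input_files = ["split_1.h5", "split_2.h5"]  # placeholder input paths
    total_size, ranges = get_size(input_files)

    with h5py.File("merged.h5", "w") as output:
        # Lay out the output datasets using the first file as a template
        create_datasets(output, input_files[0], total_size)
        # Copy each input into its slot of the output arrays
        for input_file in input_files:
            # assumes ranges maps file -> [start, end]; adapt if it is a list
            add_data(input_file, output, ranges[input_file])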

umami.preprocessing_tools.preparation module#

Helper functions for creating hybrid hdf5 samples from ttbar and Zprime ntuples.

class umami.preprocessing_tools.preparation.PrepareSamples(args, config)#

Bases: object

This class prepares the samples for further processing as defined in the configuration file:

  • extracts the selected jets (applying cuts: flavour, pT etc.)

  • writes these iteratively to h5 output files

This class will take the information provided in the samples block in the preprocessing config.

get_batches_per_file(filename: str)#

Split the file into batches to avoid loading too much data at once.

Parameters:

filename (str) – name of file to be split in batches

Returns:

  • str – The filename.

  • list – List of (start, end) index tuples, one per batch.

jets_generator(files_in_batches: list) tuple#

Helper function to extract jet and track information from an h5 ntuple.

Parameters:

files_in_batches (list) – Tuples of filename and (start, end) batch indices

Yields:
  • numpy.ndarray – jets

  • numpy.ndarray – tracks if self.save_tracks is set to True

run()#

Run over Ntuples to extract jets (and potentially also tracks).
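Example:

A hedged sketch of running the preparation step; the config path is a placeholder and the Namespace field is an assumption standing in for the parsed command-line arguments.

    from argparse import Namespace

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.preparation import PrepareSamples

    config = PreprocessConfiguration("preprocess_config.yaml")
    args = Namespace(sample="training_ttbar_bjets")  # field name is an assumption

    prep = PrepareSamples(args, config)
    prep.run()  # extract the selected jets (and tracks) and write them to h5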

umami.preprocessing_tools.scaling module#

Scaling module to perform variable scaling and shifting.

class umami.preprocessing_tools.scaling.CalculateScaling(config: object)#

Bases: object

Scaling class. Calculates the scaling and shifting for the training dataset and can apply it.

get_scale_dict(input_file: str | None = None, chunk_size: int = 100000)#

Calculates the scaling, shifting and default values and saves them to json.

Parameters:
  • input_file (str, optional) – File which is used to calculate scaling/shifting, by default None

  • chunk_size (int, optional) – Number of jets which are loaded per step, by default 100_000

Raises:

ValueError – If one of the scaling/shifting values is inf
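Example:

A minimal sketch of calculating the scale dict; the config and input file paths are placeholders.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.scaling import CalculateScaling

    config = PreprocessConfiguration("preprocess_config.yaml")
    scaler = CalculateScaling(config)

    # Compute scaling/shifting/default values and write them to the
    # json scale dict configured in the preprocessing config
    scaler.get_scale_dict(input_file="resampled.h5", chunk_size=100_000)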

get_scaling(vec: ndarray, varname: str, custom_defaults_vars: dict)#

Calculates the weighted average and std for vector vec.

Parameters:
  • vec (np.ndarray) – Array with variable values for the jets

  • varname (str) – Name of the variable which is to be scaled

  • custom_defaults_vars (dict) – Dict with custom default variable values

Returns:

  • varname (str) – Name of the variable

  • average (float) – Average of the variable

  • std (float) – Std of the variable

  • default (float) – Default value of the variable

get_scaling_dict_generator(input_file: str, n_jets: int, chunk_size: int = 100000)#

Set up a generator that loads the jets in chunks and calculates the mean/std.

Parameters:
  • input_file (str) – File which is to be scaled.

  • n_jets (int) – Number of jets which are to be scaled.

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • scale_dict_trk (dict) – Dict with the scale/shift values for each variable.

  • n_jets (int) – Number of jets used for scaling/shifting.

get_scaling_tracks(data: ndarray, var_names: list, track_mask: ndarray, tracks_name: str)#

Calculate the scale dict for the tracks and return the dict.

Parameters:
  • data (np.ndarray) – Structured tracks array with shape (n_jets, n_trks, n_trk_features)

  • var_names (list) – List of variables which are to be scaled

  • track_mask (np.ndarray) – Boolean array where False denotes padded tracks, with shape (n_jets, n_trks)

  • tracks_name (str) – Name of used tracks collection

Returns:

  • scale_dict (dict) – Scale dict with scaling/shifting values for each variable

  • n_trks (int) – Number of tracks used to calculate the scaling/shifting

get_scaling_tracks_generator(input_file: str, n_jets: int, tracks_name: str, chunk_size: int = 100000)#

Set up a generator that loads the tracks in chunks and calculates the mean/std.

Parameters:
  • input_file (str) – File which is to be scaled.

  • n_jets (int) – Number of jets which are to be scaled.

  • tracks_name (str) – Name of the tracks

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • scale_dict_trk (dict) – Dict with the scale/shift values for each variable.

  • nTrks (int) – Number of tracks used for scaling/shifting.

join_scale_dicts(first_scale_dict: dict, second_scale_dict: dict, first_n: int, second_n: int)#

Combine the scale dicts of two track chunks.

Parameters:
  • first_scale_dict (dict) – First scale dict to join.

  • second_scale_dict (dict) – Second scale dict to join.

  • first_n (int) – Number for the first scale dict.

  • second_n (int) – Number for the second scale dict.

Returns:

  • combined_scale_dict (dict) – The combined scale dict.

  • combined_n (int) – The combined number of objects.

umami.preprocessing_tools.scaling.apply_scaling_jets(jets: DataFrame, variables_list: dict, scale_dict: dict) DataFrame#

Apply the jet scaling and shifting for the given jets.

Parameters:
  • jets (pd.DataFrame) – Loaded jets which are to be scaled/shifted.

  • variables_list (dict) – Train variables which will be scaled/shifted. For all variables, the scaling/shifting values must be in the scaling dict.

  • scale_dict (dict) – Loaded scaling dict with the scaling/shifting values for the variables defined in variables_list.

Returns:

Scaled/Shifted jets with the variables defined in variables_list

Return type:

pd.DataFrame

Raises:
  • ValueError – When jets is neither a pandas DataFrame nor a structured numpy ndarray

  • KeyError – When for the variable which is to be scaled no shift/scale values are available in the scale dict.

  • ValueError – If the scale parameter for the variable is either 0 or inf.

  • ValueError – If the scaled/shifted variable has infs or NaNs.

umami.preprocessing_tools.scaling.apply_scaling_trks(trks: ndarray, variable_config: dict, scale_dict: dict, tracks_name: str, save_track_labels: bool = False, track_label_variables: list | None = None)#

Apply the scaling/shifting to the tracks.

Parameters:
  • trks (np.ndarray) – Loaded tracks as numpy array.

  • variable_config (dict) – Loaded variable config.

  • scale_dict (dict) – Loaded scale dict.

  • tracks_name (str) – Name of the tracks.

  • save_track_labels (bool) – Save the track labels

  • track_label_variables (list) – List of the track label variables which are to be saved.

Returns:

  • scaled_trks (np.ndarray) – The tracks scaled and shifted.

  • valid (np.ndarray) – Bool array specifying which tracks are valid vs padding.

  • trk_labels (np.ndarray) – The track labels, if defined in the variable config.

Raises:
  • ValueError – If a value of a variable which is to be used in log form is zero/negative.

  • ValueError – If scale is found to be 0 or inf for any track variable.

  • ValueError – If the scaled/shifted variable has infs or NaNs.

umami.preprocessing_tools.scaling.as_full(data_type: dtype)#

Convert float type to full precision

Parameters:

data_type (np.dtype) – type to check for float

Returns:

The dtype as a full-precision float type if half precision was stored, otherwise unchanged.

Return type:

np.dtype

umami.preprocessing_tools.scaling.get_track_mask(tracks: ndarray) ndarray#

Return the mask for the tracks

Parameters:

tracks (np.ndarray) – Loaded tracks with shape (n_jets, n_trks, n_trk_features). Note that the input tracks should not already be converted with np.nan_to_num, as this function relies on a np.isnan check when the valid flag is not present.

Returns:

A bool array (n_jets, nTrks), True for tracks that are present.

Return type:

np.ndarray

Raises:

ValueError – If the input tracks have neither a 'valid' flag nor at least one float variable.
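Example:

A small sketch of the fallback behaviour without a 'valid' flag: NaN entries in a float variable mark padded tracks. The variable name "d0" is illustrative.

    import numpy as np

    from umami.preprocessing_tools.scaling import get_track_mask

    # Two jets with up to three tracks; NaN marks padded tracks
    tracks = np.zeros((2, 3), dtype=[("d0", "f4")])
    tracks["d0"] = [[0.1, np.nan, np.nan], [0.2, 0.3, np.nan]]

    mask = get_track_mask(tracks)
    # mask -> [[True, False, False], [True, True, False]]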

umami.preprocessing_tools.ttbar_merge module#

Helper functions for merging single leptonic and dileptonic ttbar samples.

class umami.preprocessing_tools.ttbar_merge.MergeConfig(yaml_config: str)#

Bases: Configuration

Merge config class.

class umami.preprocessing_tools.ttbar_merge.TTbarMerge(config: object)#

Bases: object

This class merges the single and dilepton ttbar samples in the required ratio to match the Run-2 MC non-all-hadronic ttbar sample.

get_indices() None#

Get indices for dilepton sample to match ratio to single lepton sample. Indices are saved to file for later use.

Raises:

ValueError – If no dilepton sample found.

get_input_files(channel: str) list#

Get input files for a given channel.

Parameters:

channel (str) – Channel to get input files for, either single or dilepton.

Returns:

List of input files.

Return type:

list

Raises:
  • ValueError – If no input files found for given channel.

  • KeyError – If the specified channel does not exist in the config.

load_jets_generator(input_file: str, chunk_size: int = 100000, indices: list | None = None, save_tracks: bool = False) ndarray#

Yield jets (and tracks) from the input file in batches, with the option to select events by index.

Parameters:
  • input_file (str) – Path to input file.

  • chunk_size (int) – Number of jets or tracks to load at a time.

  • indices (list) – Indices of jets or tracks to load.

  • save_tracks (bool) – Whether to load tracks as well as jets, by default False.

Yields:

np.ndarray – Numpy arrays of jets, and of tracks if save_tracks is True.

merge(file_range: list, index_dir: str) None#

Merge single and dilepton ttbar samples.

Parameters:
  • file_range (list) – List of output files to merge, allows splitting across multiple jobs.

  • index_dir (str) – Directory containing index file dictionaries.

Raises:

ValueError – If the file range passed via --file_range does not consist of two arguments.

write_file(index_dict: dict, output_file: str) None#

Write merged output file from passed index dictionary.

Parameters:
  • index_dict (dict) – Dictionary of indices to be used from each input file.

  • output_file (str) – Name of output file.
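Example:

A hedged sketch of the two-step merge; the config path, file range, and index directory are illustrative values only.

    from umami.preprocessing_tools.ttbar_merge import MergeConfig, TTbarMerge

    config = MergeConfig("merge_config.yaml")
    merger = TTbarMerge(config)

    # Step 1: select dilepton indices matching the single-lepton ratio
    merger.get_indices()

    # Step 2: merge a slice of the output files (exactly two values)
    merger.merge(file_range=[0, 2], index_dir="indices")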

umami.preprocessing_tools.ttbar_merge.event_indices(input_file_list: list, event_numbers: ndarray) list#

Get indices for each input file for jets from selected subsample of dilepton events.

Parameters:
  • input_file_list (list) – List of input files.

  • event_numbers (np.ndarray) – Array of event numbers.

Returns:

List of numpy arrays of indices for each input file.

Return type:

list

umami.preprocessing_tools.ttbar_merge.event_list(input_file_list: list) tuple#

Get list of unique event numbers from input files and number of jets in sample.

Parameters:

input_file_list (list) – List of input files.

Returns:

  • np.ndarray – Array of unique event numbers.

  • int – Number of jets in sample.

umami.preprocessing_tools.utils module#

Collection of utility functions for preprocessing tools.

umami.preprocessing_tools.utils.binarise_jet_labels(labels: DataFrame, internal_labels: list, column: str = 'label') ndarray#

Transform labels to binary (one-hot) labels.

Parameters:
  • labels (pd.DataFrame or np.ndarray) – Dataframe or array with the labels inside.

  • internal_labels (list) – List with the used labels.

  • column (str, optional) – Column name of the labels if pd.DataFrame is given, by default “label”

Returns:

Array containing the binary labels, with shape (len(labels), n_classes)

Return type:

np.ndarray

Raises:
  • TypeError – If given labels are neither pd.DataFrame nor np.ndarray

  • ValueError – If the given labels are empty
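Example:

A minimal sketch with made-up labels; it assumes each output column corresponds to one entry of internal_labels, in order.

    import pandas as pd

    from umami.preprocessing_tools.utils import binarise_jet_labels

    labels = pd.DataFrame({"label": [5, 4, 0, 5]})
    one_hot = binarise_jet_labels(labels, internal_labels=[0, 4, 5])
    # one_hot has shape (4, 3): one row per jet, one column per class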

umami.preprocessing_tools.utils.get_scale_dict(file_path: str, dict_key: str) list#

Load the scale dict from file, or return it directly if the given input is an already loaded scale dict.

Parameters:
  • file_path (str or dict) – Input path to the scale dict or the already loaded scale dict

  • dict_key (str) – Dict key of the tracks/jets name.

Returns:

Loaded list with the scaling variables. Each variable is a dict in the list.

Return type:

list

Raises:

ValueError – If given input is neither a string with the correct path nor the already loaded scale dict.

umami.preprocessing_tools.utils.get_variable_dict(file_path: str) dict#

Read the yaml file containing the variables and export them to a dict.

Parameters:

file_path (str or dict) – Input yaml file containing the training variables, or the already loaded variable dict

Returns:

out_dict – Dictionary containing training variables

Return type:

dict

Raises:

ValueError – If given input is neither a string with a path nor the already loaded dict.

umami.preprocessing_tools.utils.join_structured_arrays(arrays: list)#

Join a list of structured numpy arrays.

See https://github.com/umami-hep/atlas-ftag-tools/blob/main/ftag/hdf5/h5utils.py

Parameters:

arrays (list) – List of structured numpy arrays to join

Returns:

A merged structured array

Return type:

np.array
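Example:

A small self-contained sketch; the field names are illustrative.

    import numpy as np

    from umami.preprocessing_tools.utils import join_structured_arrays

    jets_kin = np.zeros(5, dtype=[("pt", "f4"), ("eta", "f4")])
    jets_tag = np.zeros(5, dtype=[("rnnip_pb", "f4")])

    jets = join_structured_arrays([jets_kin, jets_tag])
    # jets.dtype.names -> ("pt", "eta", "rnnip_pb")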

umami.preprocessing_tools.writing_train_file module#

Module handling training file writing to disk.

class umami.preprocessing_tools.writing_train_file.TrainSampleWriter(config: object, compression: str | None = None, shuffling: bool = True)#

Bases: object

Class to write training files to disk.

better_shuffling(thearray: ndarray, n_jets: int, slice_size: int = 10000) ndarray#

Shuffles the index list with fixed slices.

Parameters:
  • thearray (np.ndarray) – Input array with the values to shuffle.

  • n_jets (int) – Number of jets in the array

  • slice_size (int, optional) – How many values are shuffled at once, by default 10_000

Returns:

Shuffled input array.

Return type:

np.ndarray

calculate_weights(weights_dict: dict, jets: ndarray, labels: ndarray)#

Find the corresponding weight for each jet, using the weights calculated by the GetFlavorWeights method, and write it to jets["weight"].

Parameters:
  • weights_dict (dict) –

    weights_dict per flavor and some additional info written into a pickle file at /hybrids/flavour_weights

    • 'bjets', etc.

    • 'bins_x' : pt bins

    • 'bins_y' : eta bins

    • 'bin_indices_flat' : flattened indices of the bins in the histogram

    • 'label_map' : {0: 'ujets', 1: 'cjets', 2: 'bjets'}

  • jets (np.ndarray) – Containing values of jet variables

  • labels (np.ndarray) – Binarised truth values of the jet flavour, with shape (n_jets, n_flavours)

init_jet_datasets(n_jets: int, jets: ndarray, labels_one_hot: ndarray, additional_var_labels: list)#

Create jet datasets

Parameters:
  • n_jets (int) – total number of jets that will be written

  • jets (np.ndarray) – jet input feature array

  • labels_one_hot (np.ndarray) – jet label array, one hot

  • additional_var_labels (list) – list of additional variables to be saved

Returns:

h5 group containing jet datasets

Return type:

h5py.Group

init_track_datasets(n_jets: int, tracks: list, valid: list, labels: list)#

Create track-like datasets

Parameters:
  • n_jets (int) – total number of jets that will be written

  • tracks (list) – list of track arrays

  • valid (list) – list of track valid arrays

  • labels (list) – list of track label arrays

Returns:

list of track h5 groups

Return type:

list

load_scaled_generator(input_file: str, index: list, n_jets: int, scale_dict: dict, jet_add_labels: list | None = None, chunk_size: int = 100000)#

Set up a generator that loads the scaled file and saves it in the format for training.

Parameters:
  • input_file (str) – File which is to be scaled.

  • index (list) – List with the indices.

  • n_jets (int) – Number of jets used.

  • jet_add_labels (list, optional) – List of additional per-jet labels to include in the output file, by default None

  • scale_dict (dict) – Scale dict of the jet and track variables.

  • chunk_size (int, optional) – The number of jets which are loaded and scaled/shifted per step, by default 100_000

Yields:
  • jets (np.ndarray) – Yielded jets

  • tracks (np.ndarray) – Yielded tracks

  • labels (np.ndarray) – Yielded labels

  • tracks_labels (np.ndarray) – Yielded track labels

  • valid (np.ndarray) – Yielded valid flag

  • flavour (np.ndarray) – Yielded flavours

  • jet_additional_labels (np.ndarray) – Yielded additional jet labels

save_chunk(load_generator, chunk_counter: int, jet_idx: int, n_jets: int, weights_dict: dict)#

Save a single chunk of ready-to-train data to file.

Parameters:
  • load_generator (Generator) – Yields data

  • chunk_counter (int) – Index of the current chunk being written

  • jet_idx (int) – Start index for the current chunk

  • n_jets (int) – Total number of jets to write

  • weights_dict (dict) – Jet weight dictionary

Returns:

Stop index for the current chunk

Return type:

int

write_train_sample(input_file: str | None = None, out_file: str | None = None, chunk_size: int = 100000) None#

Write the training file.

Parameters:
  • input_file (str, optional) – File with scaled/shifted jets. By default None, in which case the name from the config plus "resampled_scaled" is used.

  • out_file (str, optional) – Name of the output file. By default None, in which case the name from the config plus "resampled_scaled_shuffled" is used.

  • chunk_size (int, optional) – The number of jets which are loaded and written per step, by default 100_000
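Example:

A hedged sketch of writing the training file with the defaults from the config; the config path is a placeholder and "gzip" is simply one compression option h5py accepts.

    from umami.preprocessing_tools.configuration import PreprocessConfiguration
    from umami.preprocessing_tools.writing_train_file import TrainSampleWriter

    config = PreprocessConfiguration("preprocess_config.yaml")
    writer = TrainSampleWriter(config, compression="gzip", shuffling=True)

    # With no file names given, the defaults from the config are used
    writer.write_train_sample()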

Module contents#