diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py index 0cdb476dc366f..b42bbec3c9773 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_batchgenerator.py @@ -14,13 +14,14 @@ from __future__ import annotations -from typing import Any, Callable, Tuple, TYPE_CHECKING import atexit +from typing import TYPE_CHECKING, Any, Callable, Tuple if TYPE_CHECKING: import numpy as np import tensorflow as tf import torch + import ROOT @@ -82,10 +83,10 @@ def get_template( def __init__( self, - rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), batch_size: int = 0, chunk_size: int = 0, - block_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -103,76 +104,66 @@ def __init__( ): """Wrapper around the Cpp RBatchGenerator - Args: - rdataframe (RNode): Name of RNode object. - batch_size (int): Size of the returned chunks. - chunk_size (int): - The size of the chunks loaded from the ROOT file. Higher chunk size - results in better randomization, but also higher memory usage. - block_size (int): - The size of the blocks of consecutive entries from the dataframe. - A chunk is build up from multiple blocks. Lower block size results in - a better randomization, but also higher memory usage. - columns (list[str], optional): - Columns to be returned. If not given, all columns are used. - max_vec_sizes (dict[std, int], optional): - Size of each column that consists of vectors. - Required when using vector based columns. - vec_padding (int): - Value to pad vectors with if the vector is smaller - than the given max vector length. 
Defaults is 0 - target (str|list[str], optional): - Column(s) used as target. - weights (str, optional): - Column used to weight events. - Can only be used when a target is given. - validation_split (float, optional): - The ratio of batches being kept for validation. - Value has to be between 0 and 1. Defaults to 0.0. - max_chunks (int, optional): - The number of chunks that should be loaded for an epoch. - If not given, the whole file is used. - shuffle (bool): - Batches consist of random events and are shuffled every epoch. - Defaults to True. - drop_remainder (bool): - Drop the remainder of data that is too small to compose full batch. - Defaults to True. - set_seed (int): - For reproducibility: Set the seed for the random number generator used - to split the dataset into training and validation and shuffling of the chunks - Defaults to 0 which means that the seed is set to the random device. - load_eager (bool): - Load the full dataframe(s) into memory (True) or - load chunks from the dataframe into memory (False). - Defuaults to False. - sampling_type (str): - Describes the mode of sampling from the minority and majority dataframes. - Options: 'undersampling' and 'oversampling'. Requires load_eager = True. Defaults to ''. - For 'undersampling' and 'oversampling' it requires a list of exactly two dataframes as input, - where the dataframe with the most entries is the majority dataframe - and the dataframe with the fewest entries is the minority dataframe. - sampling_ratio (float): - Ratio of minority and majority entries in the resampled dataset. - Requires load_eager = True and sampling_type = 'undersampling' or 'oversampling'. Defaults to 1.0. - replacement (bool): - Whether the sampling is with (True) or without (False) replacement. - Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. + Args: + rdataframe (RNode): Name of RNode object. + batch_size (int): Size of the returned chunks. 
+ chunk_size (int): + The size of the chunks loaded from the ROOT file. Higher chunk size + results in better randomization, but also higher memory usage. + block_size (int): + The size of the blocks of consecutive entries from the dataframe. + A chunk is built up from multiple blocks. Lower block size results in + a better randomization, but also higher memory usage. + columns (list[str], optional): + Columns to be returned. If not given, all columns are used. + max_vec_sizes (dict[str, int], optional): + Size of each column that consists of vectors. + Required when using vector based columns. + vec_padding (int): + Value to pad vectors with if the vector is smaller + than the given max vector length. Default is 0 + target (str|list[str], optional): + Column(s) used as target. + weights (str, optional): + Column used to weight events. + Can only be used when a target is given. + validation_split (float, optional): + The ratio of batches being kept for validation. + Value has to be between 0 and 1. Defaults to 0.0. + max_chunks (int, optional): + The number of chunks that should be loaded for an epoch. + If not given, the whole file is used. + shuffle (bool): + Batches consist of random events and are shuffled every epoch. + Defaults to True. + drop_remainder (bool): + Drop the remainder of data that is too small to compose full batch. + Defaults to True. + set_seed (int): + For reproducibility: Set the seed for the random number generator used + to split the dataset into training and validation and shuffling of the chunks + Defaults to 0 which means that the seed is set to the random device. + load_eager (bool): + Load the full dataframe(s) into memory (True) or + load chunks from the dataframe into memory (False). + Defaults to False. + sampling_type (str): + Describes the mode of sampling from the minority and majority dataframes. + Options: 'undersampling' and 'oversampling'. Requires load_eager = True. Defaults to ''. 
+ For 'undersampling' and 'oversampling' it requires a list of exactly two dataframes as input, + where the dataframe with the most entries is the majority dataframe + and the dataframe with the fewest entries is the minority dataframe. + sampling_ratio (float): + Ratio of minority and majority entries in the resampled dataset. + Requires load_eager = True and sampling_type = 'undersampling' or 'oversampling'. Defaults to 1.0. + replacement (bool): + Whether the sampling is with (True) or without (False) replacement. + Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. """ - import ROOT from ROOT import RDF - try: - import numpy as np - - except ImportError: - raise ImportError( - "Failed to import NumPy during init. NumPy is required when \ - using RBatchGenerator" - ) - - if load_eager == False and chunk_size < batch_size: + if not load_eager and chunk_size < batch_size: raise ValueError( f"chunk_size cannot be smaller than batch_size: chunk_size: \ {chunk_size}, batch_size: {batch_size}" @@ -186,18 +177,18 @@ def __init__( if load_eager: # TODO: overhead, check if we can improve the following lines - if sampling_type == "undersampling" and replacement == False: + if sampling_type == "undersampling" and not replacement: rdf_0 = rdataframes[0].Count().GetValue() - rdf_1 = rdataframes[1].Count().GetValue() + rdf_1 = rdataframes[1].Count().GetValue() rdf_minor = min(rdf_0, rdf_1) rdf_major = max(rdf_0, rdf_1) if rdf_major < rdf_minor / sampling_ratio: raise ValueError( f"The sampling_ratio is too low: not enough entries in the majority class to sample from. \n \ - Choose sampling_ratio > {round(rdf_minor/rdf_major, 3)} or set replacement to False." + Choose sampling_ratio > {round(rdf_minor / rdf_major, 3)} or set replacement to False." 
) - - if not hasattr(rdataframes, "__iter__"): + + if not hasattr(rdataframes, "__iter__"): rdataframes = [rdataframes] self.noded_rdfs = [RDF.AsRNode(rdf) for rdf in rdataframes] @@ -207,9 +198,7 @@ def __init__( self.target_columns = target self.weights_column = weights - template, max_vec_sizes_list = self.get_template( - rdataframes[0], columns, max_vec_sizes - ) + template, max_vec_sizes_list = self.get_template(rdataframes[0], columns, max_vec_sizes) self.num_columns = len(self.all_columns) self.batch_size = batch_size @@ -222,35 +211,32 @@ def __init__( if target not in self.all_columns: raise ValueError( f"Provided target not in given columns: \ntarget => \ - {target}\ncolumns => {self.all_columns}") + {target}\ncolumns => {self.all_columns}" + ) - self.target_indices = [self.all_columns.index( - target) for target in self.target_columns] + self.target_indices = [self.all_columns.index(target) for target in self.target_columns] # Handle weights if self.weights_given: if weights in self.all_columns: - self.weights_index = self.all_columns.index( - self.weights_column) - self.train_indices = [c for c in range( - len(self.all_columns)) if c not in self.target_indices+[self.weights_index]] + self.weights_index = self.all_columns.index(self.weights_column) + self.train_indices = [ + c for c in range(len(self.all_columns)) if c not in self.target_indices + [self.weights_index] + ] else: raise ValueError( f"Provided weights not in given columns: \nweights => \ {weights}\ncolumns => {self.all_columns}" ) else: - self.train_indices = [c for c in range( - len(self.all_columns)) if c not in self.target_indices] + self.train_indices = [c for c in range(len(self.all_columns)) if c not in self.target_indices] elif self.weights_given: - raise ValueError( - "Weights can only be used when a target is provided") + raise ValueError("Weights can only be used when a target is provided") else: self.train_indices = [c for c in range(len(self.all_columns))] - self.train_columns = 
[ - c for c in self.all_columns if c not in self.target_columns+[self.weights_column]] + self.train_columns = [c for c in self.all_columns if c not in self.target_columns + [self.weights_column]] from ROOT import TMVA, EnableThreadSafety @@ -263,7 +249,7 @@ def __init__( self.generator = TMVA.Experimental.Internal.RBatchGenerator(template)( self.noded_rdfs, chunk_size, - block_size, + block_size, batch_size, self.given_columns, max_vec_sizes_list, @@ -287,7 +273,7 @@ def is_active(self): def is_training_active(self): return self.generator.TrainingIsActive() - + def Activate(self): """Initialize the generator to be used for a loop""" self.generator.Activate() @@ -303,7 +289,7 @@ def ActivateTrainingEpoch(self): def ActivateValidationEpoch(self): """Activate the generator""" self.generator.ActivateValidationEpoch() - + def DeActivateTrainingEpoch(self): """Deactivate the generator""" self.generator.DeActivateTrainingEpoch() @@ -340,8 +326,7 @@ def GetSample(self): if not self.weights_given: if len(self.target_indices) == 1: - return np.zeros((self.batch_size, self.num_columns - 1)), np.zeros( - (self.batch_size)).reshape(-1, 1) + return np.zeros((self.batch_size, self.num_columns - 1)), np.zeros((self.batch_size)).reshape(-1, 1) return np.zeros((self.batch_size, self.num_columns - 1)), np.zeros( (self.batch_size, len(self.target_indices)) @@ -360,7 +345,7 @@ def GetSample(self): np.zeros((self.batch_size)).reshape(-1, 1), ) - def ConvertBatchToNumpy(self, batch: "RTensor") -> np.ndarray: + def ConvertBatchToNumpy(self, batch) -> np.ndarray: """Convert a RTensor into a NumPy array Args: @@ -411,16 +396,15 @@ def ConvertBatchToPyTorch(self, batch: Any) -> torch.Tensor: Returns: torch.Tensor: converted batch """ - import torch import numpy as np + import torch data = batch.GetData() batch_size, num_columns = tuple(batch.GetShape()) data.reshape((batch_size * num_columns,)) - return_data = torch.as_tensor(np.asarray(data)).reshape( - batch_size, num_columns) + 
return_data = torch.as_tensor(np.asarray(data)).reshape(batch_size, num_columns) # Splice target column from the data if target is given if self.target_given: @@ -463,20 +447,16 @@ def ConvertBatchToTF(self, batch: Any) -> Any: return_data = tf.constant(data, shape=(batch_size, num_columns)) if batch_size != self.batch_size: - return_data = tf.pad(return_data, tf.constant( - [[0, self.batch_size - batch_size], [0, 0]])) + return_data = tf.pad(return_data, tf.constant([[0, self.batch_size - batch_size], [0, 0]])) # Splice target column from the data if weight is given if self.target_given: - train_data = tf.gather( - return_data, indices=self.train_indices, axis=1) - target_data = tf.gather( - return_data, indices=self.target_indices, axis=1) + train_data = tf.gather(return_data, indices=self.train_indices, axis=1) + target_data = tf.gather(return_data, indices=self.target_indices, axis=1) # Splice weight column from the data if weight is given if self.weights_given: - weights_data = tf.gather(return_data, indices=[ - self.weights_index], axis=1) + weights_data = tf.gather(return_data, indices=[self.weights_index], axis=1) return train_data, target_data, weights_data @@ -520,8 +500,8 @@ class LoadingThreadContext: def __init__(self, base_generator: BaseGenerator): self.base_generator = base_generator # create training batches from the first chunk - self.base_generator.CreateTrainBatches(); - + self.base_generator.CreateTrainBatches() + def __enter__(self): self.base_generator.ActivateTrainingEpoch() @@ -545,7 +525,6 @@ def __init__(self, base_generator: BaseGenerator, conversion_function: Callable) self.base_generator = base_generator self.conversion_function = conversion_function - def Activate(self): """Start the loading of training batches""" self.base_generator.Activate() @@ -580,7 +559,6 @@ def last_batch_no_of_rows(self) -> int: return self.base_generator.generator.TrainRemainderRows() def __iter__(self): - self._callable = self.__call__() return self @@ 
-600,20 +578,21 @@ def __call__(self) -> Any: Union[np.NDArray, torch.Tensor]: A batch of data """ - with LoadingThreadContext(self.base_generator): + with LoadingThreadContext(self.base_generator): while True: batch = self.base_generator.GetTrainBatch() if batch is None: break yield self.conversion_function(batch) - - return None - + + return None + + class LoadingThreadContextVal: def __init__(self, base_generator: BaseGenerator): self.base_generator = base_generator # create validation batches from the first chunk - self.base_generator.CreateValidationBatches() + self.base_generator.CreateValidationBatches() def __enter__(self): self.base_generator.ActivateValidationEpoch() @@ -621,7 +600,6 @@ def __enter__(self): def __exit__(self, type, value, traceback): self.base_generator.DeActivateValidationEpoch() return True - class ValidationRBatchGenerator: @@ -683,22 +661,23 @@ def __call__(self) -> Any: Yields: Union[np.NDArray, torch.Tensor]: A batch of data """ - - with LoadingThreadContextVal(self.base_generator): + + with LoadingThreadContextVal(self.base_generator): while True: batch = self.base_generator.GetValidationBatch() if batch is None: - self.base_generator.DeActivateValidationEpoch() + self.base_generator.DeActivateValidationEpoch() break yield self.conversion_function(batch) - - return None - + + return None + + def CreateNumPyGenerators( - rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), batch_size: int = 0, chunk_size: int = 0, - block_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -776,7 +755,7 @@ def CreateNumPyGenerators( Requires load_eager = True and sampling_type = 'undersampling' or 'oversampling'. Defaults to 1.0. replacement (bool): Whether the sampling is with (True) or without (False) replacement. - Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. 
+ Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. Returns: TrainRBatchGenerator or @@ -789,13 +768,11 @@ def CreateNumPyGenerators( validation generator will return no batches. """ - import numpy as np - base_generator = BaseGenerator( rdataframes, batch_size, chunk_size, - block_size, + block_size, columns, max_vec_sizes, vec_padding, @@ -812,25 +789,21 @@ def CreateNumPyGenerators( replacement, ) - train_generator = TrainRBatchGenerator( - base_generator, base_generator.ConvertBatchToNumpy - ) + train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToNumpy) if validation_split == 0.0: return train_generator, None - validation_generator = ValidationRBatchGenerator( - base_generator, base_generator.ConvertBatchToNumpy - ) + validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToNumpy) return train_generator, validation_generator def CreateTFDatasets( - rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), batch_size: int = 0, chunk_size: int = 0, - block_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -908,7 +881,7 @@ def CreateTFDatasets( Requires load_eager = True and sampling_type = 'undersampling' or 'oversampling'. Defaults to 1.0. replacement (bool): Whether the sampling is with (True) or without (False) replacement. - Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. + Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. 
Returns: TrainRBatchGenerator or @@ -943,44 +916,32 @@ def CreateTFDatasets( replacement, ) - train_generator = TrainRBatchGenerator( - base_generator, base_generator.ConvertBatchToTF - ) - validation_generator = ValidationRBatchGenerator( - base_generator, base_generator.ConvertBatchToTF - ) + train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToTF) + validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToTF) num_train_columns = len(train_generator.train_columns) num_target_columns = len(train_generator.target_columns) # No target and weights given if target == "": - batch_signature = tf.TensorSpec( - shape=(batch_size, num_train_columns), dtype=tf.float32 - ) + batch_signature = tf.TensorSpec(shape=(batch_size, num_train_columns), dtype=tf.float32) # Target given, no weights given elif weights == "": batch_signature = ( - tf.TensorSpec(shape=(batch_size, num_train_columns), - dtype=tf.float32), - tf.TensorSpec(shape=(batch_size, num_target_columns), - dtype=tf.float32), + tf.TensorSpec(shape=(batch_size, num_train_columns), dtype=tf.float32), + tf.TensorSpec(shape=(batch_size, num_target_columns), dtype=tf.float32), ) # Target and weights given else: batch_signature = ( - tf.TensorSpec(shape=(batch_size, num_train_columns), - dtype=tf.float32), - tf.TensorSpec(shape=(batch_size, num_target_columns), - dtype=tf.float32), + tf.TensorSpec(shape=(batch_size, num_train_columns), dtype=tf.float32), + tf.TensorSpec(shape=(batch_size, num_target_columns), dtype=tf.float32), tf.TensorSpec(shape=(batch_size, 1), dtype=tf.float32), ) - ds_train = tf.data.Dataset.from_generator( - train_generator, output_signature=batch_signature - ) + ds_train = tf.data.Dataset.from_generator(train_generator, output_signature=batch_signature) # Give access to the columns function of the training set setattr(ds_train, "columns", train_generator.columns) @@ -992,26 +953,23 @@ def CreateTFDatasets( if validation_split == 
0.0: return ds_train - ds_validation = tf.data.Dataset.from_generator( - validation_generator, output_signature=batch_signature - ) + ds_validation = tf.data.Dataset.from_generator(validation_generator, output_signature=batch_signature) # Give access to the columns function of the validation set setattr(ds_validation, "columns", train_generator.columns) setattr(ds_validation, "train_columns", train_generator.train_columns) setattr(ds_validation, "target_column", train_generator.target_columns) setattr(ds_validation, "weights_column", train_generator.weights_column) - setattr(ds_validation, "number_of_batches", - validation_generator.number_of_batches) + setattr(ds_validation, "number_of_batches", validation_generator.number_of_batches) return ds_train, ds_validation def CreatePyTorchGenerators( - rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), + rdataframes: ROOT.RDF.RNode | list[ROOT.RDF.RNode] = list(), batch_size: int = 0, chunk_size: int = 0, - block_size: int = 0, + block_size: int = 0, columns: list[str] = list(), max_vec_sizes: dict[str, int] = dict(), vec_padding: int = 0, @@ -1089,7 +1047,7 @@ def CreatePyTorchGenerators( Requires load_eager = True and sampling_type = 'undersampling' or 'oversampling'. Defaults to 1.0. replacement (bool): Whether the sampling is with (True) or without (False) replacement. - Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. + Requires load_eager = True and sampling_type = 'undersampling'. Defaults to False. 
Returns: TrainRBatchGenerator or @@ -1122,15 +1080,11 @@ def CreatePyTorchGenerators( replacement, ) - train_generator = TrainRBatchGenerator( - base_generator, base_generator.ConvertBatchToPyTorch - ) + train_generator = TrainRBatchGenerator(base_generator, base_generator.ConvertBatchToPyTorch) if validation_split == 0.0: return train_generator - validation_generator = ValidationRBatchGenerator( - base_generator, base_generator.ConvertBatchToPyTorch - ) + validation_generator = ValidationRBatchGenerator(base_generator, base_generator.ConvertBatchToPyTorch) return train_generator, validation_generator diff --git a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py index bd2b6df3f1886..3b0ad08a60d69 100644 --- a/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py +++ b/bindings/pyroot/pythonizations/test/rbatchgenerator_completeness.py @@ -1,16 +1,15 @@ -import unittest import os -import ROOT -from ROOT import RVec +import unittest +from random import randrange, uniform + import numpy as np -from random import randrange -from random import uniform +import ROOT -class RBatchGeneratorMultipleFiles(unittest.TestCase): +class RBatchGeneratorMultipleFiles(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" - file_name3 = "vector_columns.root" + file_name3 = "vector_columns.root" tree_name = "mytree" # default constants @@ -20,29 +19,30 @@ class RBatchGeneratorMultipleFiles(unittest.TestCase): # Helpers def define_rdf(self, num_of_entries=10): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define("b2", "(double) b1*b1") return df def create_file(self, num_of_entries=10): - self.define_rdf(num_of_entries).Snapshot( - self.tree_name, self.file_name1) + self.define_rdf(num_of_entries).Snapshot(self.tree_name, 
self.file_name1) def create_5_entries_file(self): - df1 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 10")\ - .Define("b2", "(double) b1 * b1")\ + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_ + 10") + .Define("b2", "(double) b1 * b1") .Snapshot(self.tree_name, self.file_name2) + ) def create_vector_file(self, num_of_entries=10): - df3 = ROOT.RDataFrame(10)\ - .Define("b1", "(int) rdfentry_")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name3) - + ( + ROOT.RDataFrame(10) + .Define("b1", "(int) rdfentry_") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name3) + ) + def teardown_file(self, file): os.remove(file) @@ -51,24 +51,24 @@ def test01_each_element_is_generated_unshuffled(self): try: df = ROOT.RDataFrame(self.tree_name, self.file_name1) - + entries_before = df.AsNumpy(["rdfentry_"])["rdfentry_"] - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, - block_size=2, + block_size=2, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) - results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] - results_x_val = [6.0, 7.0, 8.0, 9.0] + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] - results_y_val = [36.0, 49.0, 64.0, 81.0] + results_y_val = [36.0, 49.0, 64.0, 81.0] collected_x_train = [] collected_x_val = [] @@ -76,15 +76,15 @@ def test01_each_element_is_generated_unshuffled(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 1)) self.assertTrue(y.shape == (3, 1)) 
collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 1)) @@ -98,11 +98,9 @@ def test01_each_element_is_generated_unshuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -111,10 +109,10 @@ def test01_each_element_is_generated_unshuffled(self): self.assertEqual(results_y_val, flat_y_val) entries_after = df.AsNumpy(["rdfentry_"])["rdfentry_"] - + # check if the dataframe is correctly reset self.assertTrue(np.array_equal(entries_before, entries_after)) - + self.teardown_file(self.file_name1) except: @@ -131,11 +129,11 @@ def test02_each_element_is_generated_shuffled(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=True, - drop_remainder=False + drop_remainder=False, ) collected_x_train = [] @@ -166,11 +164,9 @@ def test02_each_element_is_generated_shuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} 
self.assertEqual(len(flat_x_train), 6) @@ -196,11 +192,11 @@ def test03_chunk_input_smaller_than_batch_size(self): df, batch_size=3, chunk_size=3, - block_size=2, + block_size=2, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) next(iter(gen_train)) @@ -221,11 +217,11 @@ def test04_dropping_remainder(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=True + drop_remainder=True, ) collected_x = [] @@ -257,24 +253,22 @@ def test05_more_than_one_file(self): self.create_5_entries_file() try: - df = ROOT.RDataFrame( - self.tree_name, [self.file_name1, self.file_name2]) + df = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0, 3.0, 4.0, 8.0] results_x_val = [9.0, 10.0, 11.0, 12.0, 13.0, 14.0] - results_y_train = [0.0, 1.0, 4.0, 25.0, - 36.0, 49.0, 9.0, 16.0, 64.0] + results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0, 9.0, 16.0, 64.0] results_y_val = [81.0, 100.0, 121.0, 144.0, 169.0, 196.0] collected_x_train = [] @@ -294,11 +288,9 @@ def test05_more_than_one_file(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -317,12 +309,9 @@ def 
test05_more_than_one_file(self): def test06_multiple_target_columns(self): file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -330,22 +319,21 @@ def test06_multiple_target_columns(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target=["b2", "b4"], weights="b3", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -382,14 +370,11 @@ def test06_multiple_target_columns(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + 
flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -408,11 +393,9 @@ def test06_multiple_target_columns(self): def test07_multiple_input_columns(self): file_name = "multiple_input_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -421,15 +404,14 @@ def test07_multiple_input_columns(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) - results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, - 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] results_x_val = [6.0, 60.0, 7.0, 70.0, 8.0, 80.0, 9.0, 90.0] results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] results_y_val = [36.0, 49.0, 64.0, 81.0] @@ -462,11 +444,9 @@ def test07_multiple_input_columns(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -489,16 +469,16 @@ def test08_filtered(self): dff = df.Filter("b1 % 2 == 0", "name") 
filter_entries_before = dff.AsNumpy(["rdfentry_"])["rdfentry_"] - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( dff, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) results_x_train = [0.0, 2.0, 4.0] @@ -526,11 +506,9 @@ def test08_filtered(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -539,10 +517,10 @@ def test08_filtered(self): self.assertEqual(results_y_val, flat_y_val) filter_entries_after = dff.AsNumpy(["rdfentry_"])["rdfentry_"] - + # check if the dataframe is correctly reset self.assertTrue(np.array_equal(filter_entries_before, filter_entries_after)) - + self.teardown_file(self.file_name1) except: @@ -553,10 +531,9 @@ def test09_filtered_last_chunk(self): file_name = "filtered_last_chunk.root" tree_name = "myTree" - ROOT.RDataFrame(20)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Snapshot(tree_name, file_name) + ROOT.RDataFrame(20).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Snapshot( + tree_name, file_name + ) try: df = ROOT.RDataFrame(tree_name, file_name) @@ -567,17 +544,15 @@ def test09_filtered_last_chunk(self): dff, batch_size=3, chunk_size=9, - block_size=1, + block_size=1, target="b2", validation_split=0, shuffle=False, - drop_remainder=False + drop_remainder=False, ) - results_x_train = [0.0, 2.0, 4.0, 6.0, - 8.0, 10.0, 12.0, 14.0, 16.0, 
18.0] - results_y_train = [0.0, 4.0, 16.0, 36.0, - 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] collected_x_train = [] collected_y_train = [] @@ -597,10 +572,8 @@ def test09_filtered_last_chunk(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) self.assertEqual(results_y_train, flat_y_train) @@ -625,7 +598,7 @@ def test10_two_epochs_shuffled(self): target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) both_epochs_collected_x_val = [] @@ -660,14 +633,10 @@ def test10_two_epochs_shuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -677,10 +646,8 @@ def test10_two_epochs_shuffled(self): both_epochs_collected_x_val.append(collected_x_val) both_epochs_collected_y_val.append(collected_y_val) - self.assertEqual( - 
both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) - self.assertEqual( - both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + self.assertEqual(both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual(both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) @@ -694,11 +661,11 @@ def test11_number_of_training_and_validation_batches_remainder(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target="b2", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) number_of_training_batches = 0 @@ -710,10 +677,8 @@ def test11_number_of_training_and_validation_batches_remainder(self): for _ in gen_validation: number_of_validation_batches += 1 - self.assertEqual(gen_train.number_of_batches, - number_of_training_batches) - self.assertEqual(gen_validation.number_of_batches, - number_of_validation_batches) + self.assertEqual(gen_train.number_of_batches, number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, number_of_validation_batches) self.assertEqual(gen_train.last_batch_no_of_rows, 0) self.assertEqual(gen_validation.last_batch_no_of_rows, 1) @@ -724,16 +689,11 @@ def test11_number_of_training_and_validation_batches_remainder(self): raise def test12_PyTorch(self): - import torch - file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -742,18 +702,17 @@ def test12_PyTorch(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, 
target=["b2", "b4"], weights="b3", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0] @@ -794,14 +753,11 @@ def test12_PyTorch(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -818,16 +774,11 @@ def test12_PyTorch(self): raise def test13_TensorFlow(self): - import tensorflow as tf - file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", 
file_name) @@ -836,22 +787,21 @@ def test13_TensorFlow(self): df, batch_size=3, chunk_size=5, - block_size=1, + block_size=1, target=["b2", "b4"], weights="b3", validation_split=0.4, shuffle=False, - drop_remainder=False + drop_remainder=False, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0, 0.0, 0.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0, 0.0, 0.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -888,14 +838,11 @@ def test13_TensorFlow(self): collected_y_val.append(y.numpy().tolist()) collected_z_val.append(z.numpy().tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -923,13 +870,11 @@ def test14_big_data(self): Number of entries: {entries_in_rdf}" def define_rdf(num_of_entries): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ 
- .Snapshot(tree_name, file_name) + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def test(size_of_batch, size_of_chunk, num_of_entries): define_rdf(num_of_entries) @@ -941,12 +886,12 @@ def test(size_of_batch, size_of_chunk, num_of_entries): df, batch_size=size_of_batch, chunk_size=size_of_chunk, - block_size=1, + block_size=1, target=["b3", "b5"], weights="b2", validation_split=0.3, shuffle=False, - drop_remainder=False + drop_remainder=False, ) collect_x = [] @@ -954,10 +899,10 @@ def test(size_of_batch, size_of_chunk, num_of_entries): train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -965,65 +910,50 @@ def test(size_of_batch, size_of_chunk, num_of_entries): for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + 
self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message + f" row: {i}") # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message + f" row: {i}") collect_x.extend(list(x[:, 0])) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message) - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message) + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + 
+ self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), error_message) + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message) # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - np.all(x[:, 0]*2 == z[:, 0]), error_message) + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message) collect_x.extend(list(x[:, 0])) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - self.assertTrue(z.shape == ( - val_remainder, 1), error_message) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) - self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ - generated length {len(set(i for i in range(num_of_entries)))}") + self.assertTrue( + set(collect_x) == set(i for i in range(num_of_entries)), + f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}", + ) except: self.teardown_file(file_name) @@ -1031,18 +961,15 @@ def test(size_of_batch, size_of_chunk, num_of_entries): test(batch_size, chunk_size, entries_in_rdf) - def test15_two_runs_set_seed(self): self.create_file() try: - both_runs_collected_x_val = [] both_runs_collected_y_val = [] - - df = ROOT.RDataFrame(self.tree_name, self.file_name1) - for _ in range(2): + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + for _ in range(2): gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, @@ -1052,7 +979,7 @@ def test15_two_runs_set_seed(self): validation_split=0.4, shuffle=True, drop_remainder=False, - set_seed = 42 + set_seed=42, ) collected_x_train = [] @@ -1083,14 +1010,10 @@ def test15_two_runs_set_seed(self): 
collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -1099,20 +1022,18 @@ def test15_two_runs_set_seed(self): both_runs_collected_x_val.append(collected_x_val) both_runs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_runs_collected_x_val[0], both_runs_collected_x_val[1]) - self.assertEqual( - both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + self.assertEqual(both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual(both_runs_collected_y_val[0], both_runs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - + def test16_vector_padding(self): self.create_vector_file() try: df = ROOT.RDataFrame(self.tree_name, self.file_name3) max_vec_sizes = {"v1": 3, "v2": 2} - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, @@ -1120,24 +1041,67 @@ def test16_vector_padding(self): block_size=2, target="b1", validation_split=0.4, - max_vec_sizes=max_vec_sizes, + max_vec_sizes=max_vec_sizes, shuffle=False, drop_remainder=False, ) - - - results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 10.0, 0, 100.0, 1000.0, - 2.0, 20.0, 0, 200.0, 2000.0, - 3.0, 30.0, 0, 300.0, 3000.0, - 4.0, 40.0, 0, 400.0, 4000.0, - 5.0, 50.0, 0, 500.0, 5000.0] - results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] - results_x_val = 
[6.0, 60.0, 0.0, 600.0, 6000.0, - 7.0, 70.0, 0.0, 700.0, 7000.0, - 8.0, 80.0, 0.0, 800.0, 8000.0, - 9.0, 90.0, 0.0, 900.0, 9000.0] - results_y_val = [6.0, 7.0, 8.0, 9.0] + + results_x_train = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 10.0, + 0, + 100.0, + 1000.0, + 2.0, + 20.0, + 0, + 200.0, + 2000.0, + 3.0, + 30.0, + 0, + 300.0, + 3000.0, + 4.0, + 40.0, + 0, + 400.0, + 4000.0, + 5.0, + 50.0, + 0, + 500.0, + 5000.0, + ] + results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [ + 6.0, + 60.0, + 0.0, + 600.0, + 6000.0, + 7.0, + 70.0, + 0.0, + 700.0, + 7000.0, + 8.0, + 80.0, + 0.0, + 800.0, + 8000.0, + 9.0, + 90.0, + 0.0, + 900.0, + 9000.0, + ] + results_y_val = [6.0, 7.0, 8.0, 9.0] collected_x_train = [] collected_x_val = [] @@ -1145,15 +1109,15 @@ def test16_vector_padding(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 5)) self.assertTrue(y.shape == (3, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 5)) @@ -1167,11 +1131,9 @@ def test16_vector_padding(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -1185,11 +1147,11 @@ def test16_vector_padding(self): self.teardown_file(self.file_name3) raise -class RBatchGeneratorEagerLoading(unittest.TestCase): +class 
RBatchGeneratorEagerLoading(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" - file_name3 = "vector_columns.root" + file_name3 = "vector_columns.root" tree_name = "mytree" # default constants @@ -1199,29 +1161,30 @@ class RBatchGeneratorEagerLoading(unittest.TestCase): # Helpers def define_rdf(self, num_of_entries=10): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define("b2", "(double) b1*b1") return df def create_file(self, num_of_entries=10): - self.define_rdf(num_of_entries).Snapshot( - self.tree_name, self.file_name1) + self.define_rdf(num_of_entries).Snapshot(self.tree_name, self.file_name1) def create_5_entries_file(self): - df1 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 10")\ - .Define("b2", "(double) b1 * b1")\ + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_ + 10") + .Define("b2", "(double) b1 * b1") .Snapshot(self.tree_name, self.file_name2) + ) def create_vector_file(self, num_of_entries=10): - df3 = ROOT.RDataFrame(10)\ - .Define("b1", "(int) rdfentry_")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name3) - + ( + ROOT.RDataFrame(10) + .Define("b1", "(int) rdfentry_") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name3) + ) + def teardown_file(self, file): os.remove(file) @@ -1241,10 +1204,10 @@ def test01_each_element_is_generated_unshuffled(self): load_eager=True, ) - results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] - results_x_val = [6.0, 7.0, 8.0, 9.0] + results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [6.0, 7.0, 8.0, 9.0] results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] - results_y_val = [36.0, 49.0, 64.0, 81.0] + results_y_val = 
[36.0, 49.0, 64.0, 81.0] collected_x_train = [] collected_x_val = [] @@ -1252,33 +1215,31 @@ def test01_each_element_is_generated_unshuffled(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 1)) self.assertTrue(y.shape == (3, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 1)) self.assertTrue(y.shape == (3, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(val_iter) self.assertTrue(x.shape == (self.val_remainder, 1)) self.assertTrue(y.shape == (self.val_remainder, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -1299,13 +1260,7 @@ def test02_each_element_is_generated_shuffled(self): df = ROOT.RDataFrame(self.tree_name, self.file_name1) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( - df, - batch_size=3, - target="b2", - validation_split=0.4, - shuffle=True, - drop_remainder=False, - load_eager=True + df, batch_size=3, target="b2", validation_split=0.4, shuffle=True, drop_remainder=False, load_eager=True ) collected_x_train = [] @@ -1336,11 +1291,9 @@ def test02_each_element_is_generated_shuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in 
collected_x_train for xs in xl for x in xs} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) @@ -1361,13 +1314,7 @@ def test04_dropping_remainder(self): df = ROOT.RDataFrame(self.tree_name, self.file_name1) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( - df, - batch_size=3, - target="b2", - validation_split=0.4, - shuffle=False, - drop_remainder=True, - load_eager=True + df, batch_size=3, target="b2", validation_split=0.4, shuffle=False, drop_remainder=True, load_eager=True ) collected_x = [] @@ -1394,14 +1341,12 @@ def test04_dropping_remainder(self): self.teardown_file(self.file_name1) raise - def test05_more_than_one_file(self): self.create_file() self.create_5_entries_file() try: - df = ROOT.RDataFrame( - self.tree_name, [self.file_name1, self.file_name2]) + df = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, @@ -1410,13 +1355,12 @@ def test05_more_than_one_file(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] results_x_val = [9.0, 10.0, 11.0, 12.0, 13.0, 14.0] - results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, - 25.0, 36.0, 49.0, 64.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0] results_y_val = [81.0, 100.0, 121.0, 144.0, 169.0, 196.0] collected_x_train = [] @@ -1436,11 +1380,9 @@ def test05_more_than_one_file(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl 
for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -1459,34 +1401,30 @@ def test05_more_than_one_file(self): def test06_multiple_target_columns(self): file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, - batch_size=3, + batch_size=3, target=["b2", "b4"], weights="b3", validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -1523,14 +1461,11 @@ def test06_multiple_target_columns(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + 
flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -1549,11 +1484,9 @@ def test06_multiple_target_columns(self): def test07_multiple_input_columns(self): file_name = "multiple_input_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -1565,11 +1498,10 @@ def test07_multiple_input_columns(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) - results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, - 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, 20.0, 3.0, 30.0, 4.0, 40.0, 5.0, 50.0] results_x_val = [6.0, 60.0, 7.0, 70.0, 8.0, 80.0, 9.0, 90.0] results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0] results_y_val = [36.0, 49.0, 64.0, 81.0] @@ -1602,11 +1534,9 @@ def test07_multiple_input_columns(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl 
for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -1635,7 +1565,7 @@ def test08_filtered(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 2.0, 4.0] @@ -1663,11 +1593,9 @@ def test08_filtered(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -1685,10 +1613,9 @@ def test09_filtered_last_chunk(self): file_name = "filtered_last_chunk.root" tree_name = "myTree" - ROOT.RDataFrame(20)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Snapshot(tree_name, file_name) + ROOT.RDataFrame(20).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Snapshot( + tree_name, file_name + ) try: df = ROOT.RDataFrame(tree_name, file_name) @@ -1696,19 +1623,11 @@ def test09_filtered_last_chunk(self): dff = df.Filter("b1 % 2 == 0", "name") gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( - dff, - batch_size=3, - target="b2", - validation_split=0, - shuffle=False, - drop_remainder=False, - load_eager=True + dff, batch_size=3, target="b2", validation_split=0, shuffle=False, drop_remainder=False, load_eager=True ) - results_x_train = [0.0, 2.0, 4.0, 6.0, - 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] - results_y_train = [0.0, 
4.0, 16.0, 36.0, - 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] collected_x_train = [] collected_y_train = [] @@ -1728,10 +1647,8 @@ def test09_filtered_last_chunk(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) self.assertEqual(results_y_train, flat_y_train) @@ -1755,7 +1672,7 @@ def test10_two_epochs_shuffled(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) both_epochs_collected_x_val = [] @@ -1790,14 +1707,10 @@ def test10_two_epochs_shuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -1807,10 +1720,8 @@ def test10_two_epochs_shuffled(self): both_epochs_collected_x_val.append(collected_x_val) both_epochs_collected_y_val.append(collected_y_val) - self.assertEqual( - 
both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) - self.assertEqual( - both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + self.assertEqual(both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual(both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) @@ -1827,7 +1738,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) number_of_training_batches = 0 @@ -1839,10 +1750,8 @@ def test11_number_of_training_and_validation_batches_remainder(self): for _ in gen_validation: number_of_validation_batches += 1 - self.assertEqual(gen_train.number_of_batches, - number_of_training_batches) - self.assertEqual(gen_validation.number_of_batches, - number_of_validation_batches) + self.assertEqual(gen_train.number_of_batches, number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, number_of_validation_batches) self.assertEqual(gen_train.last_batch_no_of_rows, 0) self.assertEqual(gen_validation.last_batch_no_of_rows, 1) @@ -1853,16 +1762,11 @@ def test11_number_of_training_and_validation_batches_remainder(self): raise def test12_PyTorch(self): - import torch - file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -1875,13 +1779,12 @@ def test12_PyTorch(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 
3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0] @@ -1922,14 +1825,11 @@ def test12_PyTorch(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -1946,16 +1846,11 @@ def test12_PyTorch(self): raise def test13_TensorFlow(self): - import tensorflow as tf - file_name = "multiple_target_columns.root" - ROOT.RDataFrame(10)\ - .Define("b1", "(Short_t) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Define("b3", "(double) rdfentry_ * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name) + ROOT.RDataFrame(10).Define("b1", "(Short_t) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Define( + "b3", "(double) rdfentry_ * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name) try: df = ROOT.RDataFrame("myTree", file_name) @@ -1968,17 +1863,16 @@ def test13_TensorFlow(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, 
) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_x_val = [6.0, 7.0, 8.0, 9.0, 0.0, 0.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 9.0, 300.0, 16.0, 400.0, 25.0, 500.0] results_y_val = [36.0, 600.0, 49.0, 700.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] results_z_train = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0] results_z_val = [60.0, 70.0, 80.0, 90.0, 0.0, 0.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -2015,14 +1909,11 @@ def test13_TensorFlow(self): collected_y_val.append(y.numpy().tolist()) collected_z_val.append(z.numpy().tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -2050,13 +1941,11 @@ def test14_big_data(self): Number of entries: {entries_in_rdf}" def define_rdf(num_of_entries): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", 
"(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def test(size_of_batch, size_of_chunk, num_of_entries): define_rdf(num_of_entries) @@ -2072,7 +1961,7 @@ def test(size_of_batch, size_of_chunk, num_of_entries): validation_split=0.3, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) collect_x = [] @@ -2080,10 +1969,10 @@ def test(size_of_batch, size_of_chunk, num_of_entries): train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -2091,65 +1980,50 @@ def test(size_of_batch, size_of_chunk, num_of_entries): for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), 
error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message + f" row: {i}") # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message + f" row: {i}") collect_x.extend(list(x[:, 0])) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message) - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message) + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), error_message) + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message) # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - np.all(x[:, 0]*2 == z[:, 0]), error_message) + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message) 
collect_x.extend(list(x[:, 0])) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - self.assertTrue(z.shape == ( - val_remainder, 1), error_message) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) - self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ - generated length {len(set(i for i in range(num_of_entries)))}") + self.assertTrue( + set(collect_x) == set(i for i in range(num_of_entries)), + f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}", + ) except: self.teardown_file(file_name) @@ -2157,18 +2031,15 @@ def test(size_of_batch, size_of_chunk, num_of_entries): test(batch_size, chunk_size, entries_in_rdf) - def test15_two_runs_set_seed(self): self.create_file() try: - both_runs_collected_x_val = [] both_runs_collected_y_val = [] - - df = ROOT.RDataFrame(self.tree_name, self.file_name1) - for _ in range(2): + df = ROOT.RDataFrame(self.tree_name, self.file_name1) + for _ in range(2): gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, @@ -2176,8 +2047,8 @@ def test15_two_runs_set_seed(self): validation_split=0.4, shuffle=True, drop_remainder=False, - set_seed = 42, - load_eager=True + set_seed=42, + load_eager=True, ) collected_x_train = [] @@ -2208,14 +2079,10 @@ def test15_two_runs_set_seed(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - 
y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -2224,44 +2091,85 @@ def test15_two_runs_set_seed(self): both_runs_collected_x_val.append(collected_x_val) both_runs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_runs_collected_x_val[0], both_runs_collected_x_val[1]) - self.assertEqual( - both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + self.assertEqual(both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual(both_runs_collected_y_val[0], both_runs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - + def test16_vector_padding(self): self.create_vector_file() try: df = ROOT.RDataFrame(self.tree_name, self.file_name3) max_vec_sizes = {"v1": 3, "v2": 2} - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( df, batch_size=3, target="b1", validation_split=0.4, - max_vec_sizes=max_vec_sizes, + max_vec_sizes=max_vec_sizes, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) - - - results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 10.0, 0, 100.0, 1000.0, - 2.0, 20.0, 0, 200.0, 2000.0, - 3.0, 30.0, 0, 300.0, 3000.0, - 4.0, 40.0, 0, 400.0, 4000.0, - 5.0, 50.0, 0, 500.0, 5000.0] - results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] - results_x_val = [6.0, 60.0, 0.0, 600.0, 6000.0, - 7.0, 70.0, 0.0, 700.0, 7000.0, - 8.0, 80.0, 0.0, 800.0, 8000.0, - 9.0, 90.0, 0.0, 900.0, 9000.0] - results_y_val = [6.0, 7.0, 8.0, 9.0] + + results_x_train = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 10.0, + 0, + 100.0, + 1000.0, + 2.0, + 20.0, + 0, + 200.0, + 2000.0, + 3.0, + 30.0, + 0, + 300.0, + 3000.0, + 4.0, + 
40.0, + 0, + 400.0, + 4000.0, + 5.0, + 50.0, + 0, + 500.0, + 5000.0, + ] + results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + results_x_val = [ + 6.0, + 60.0, + 0.0, + 600.0, + 6000.0, + 7.0, + 70.0, + 0.0, + 700.0, + 7000.0, + 8.0, + 80.0, + 0.0, + 800.0, + 8000.0, + 9.0, + 90.0, + 0.0, + 900.0, + 9000.0, + ] + results_y_val = [6.0, 7.0, 8.0, 9.0] collected_x_train = [] collected_x_val = [] @@ -2269,15 +2177,15 @@ def test16_vector_padding(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 5)) self.assertTrue(y.shape == (3, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 5)) @@ -2291,11 +2199,9 @@ def test16_vector_padding(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -2309,13 +2215,13 @@ def test16_vector_padding(self): self.teardown_file(self.file_name3) raise -class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): +class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): file_name1 = "first_half.root" file_name2 = "second_half.root" - file_name3 = "second_file.root" + file_name3 = "second_file.root" file_name4 = "vector_columns_1.root" - file_name5 = "vector_columns_2.root" + file_name5 = "vector_columns_2.root" tree_name = "mytree" # 
default constants @@ -2325,54 +2231,53 @@ class RBatchGeneratorEagerLoadingMultipleDataframes(unittest.TestCase): # Helpers def define_rdf1(self, num_of_entries=5): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define("b2", "(double) b1*b1") return df def define_rdf2(self, num_of_entries=5): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_ + 5")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_ + 5").Define("b2", "(double) b1*b1") return df - + def create_file1(self, num_of_entries=5): - self.define_rdf1(num_of_entries).Snapshot( - self.tree_name, self.file_name1) + self.define_rdf1(num_of_entries).Snapshot(self.tree_name, self.file_name1) def create_file2(self, num_of_entries=5): - self.define_rdf2(num_of_entries).Snapshot( - self.tree_name, self.file_name2) - + self.define_rdf2(num_of_entries).Snapshot(self.tree_name, self.file_name2) + def create_5_entries_file(self): - df1 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 10")\ - .Define("b2", "(double) b1 * b1")\ + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_ + 10") + .Define("b2", "(double) b1 * b1") .Snapshot(self.tree_name, self.file_name3) + ) def create_vector_file1(self, num_of_entries=5): - df3 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name4) + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name4) + ) def create_vector_file2(self, num_of_entries=5): - df3 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 5")\ - .Define("v1", "ROOT::VecOps::RVec{ 
b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name5) - + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_ + 5") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name5) + ) + def teardown_file(self, file): os.remove(file) - def test01_each_element_is_generated_unshuffled(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -2380,7 +2285,7 @@ def test01_each_element_is_generated_unshuffled(self): df1_entries_before = df1.AsNumpy(["rdfentry_"])["rdfentry_"] df2_entries_before = df2.AsNumpy(["rdfentry_"])["rdfentry_"] - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], batch_size=3, @@ -2391,10 +2296,10 @@ def test01_each_element_is_generated_unshuffled(self): load_eager=True, ) - results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] - results_x_val = [3.0, 4.0, 8.0, 9.0] + results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [3.0, 4.0, 8.0, 9.0] results_y_train = [0.0, 1.0, 4.0, 25.0, 36.0, 49.0] - results_y_val = [9.0, 16.0, 64.0, 81.0] + results_y_val = [9.0, 16.0, 64.0, 81.0] collected_x_train = [] collected_x_val = [] @@ -2402,33 +2307,31 @@ def test01_each_element_is_generated_unshuffled(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 1)) self.assertTrue(y.shape == (3, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 1)) self.assertTrue(y.shape == (3, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(val_iter) self.assertTrue(x.shape == 
(self.val_remainder, 1)) self.assertTrue(y.shape == (self.val_remainder, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -2437,23 +2340,23 @@ def test01_each_element_is_generated_unshuffled(self): self.assertEqual(results_y_val, flat_y_val) df1_entries_after = df1.AsNumpy(["rdfentry_"])["rdfentry_"] - df2_entries_after = df2.AsNumpy(["rdfentry_"])["rdfentry_"] - + df2_entries_after = df2.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(df1_entries_before, df1_entries_after)) - self.assertTrue(np.array_equal(df2_entries_before, df2_entries_after)) - + self.assertTrue(np.array_equal(df2_entries_before, df2_entries_after)) + self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test02_each_element_is_generated_shuffled(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -2466,7 +2369,7 @@ def test02_each_element_is_generated_shuffled(self): validation_split=0.4, shuffle=True, drop_remainder=False, - load_eager=True + load_eager=True, ) collected_x_train = [] @@ -2497,11 +2400,9 @@ def test02_each_element_is_generated_shuffled(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in 
xs} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) @@ -2510,16 +2411,16 @@ def test02_each_element_is_generated_shuffled(self): self.assertEqual(len(flat_y_val), 4) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test04_dropping_remainder(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -2532,7 +2433,7 @@ def test04_dropping_remainder(self): validation_split=0.4, shuffle=False, drop_remainder=True, - load_eager=True + load_eager=True, ) collected_x = [] @@ -2554,23 +2455,21 @@ def test04_dropping_remainder(self): self.assertEqual(len(collected_y), 3) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test05_more_than_one_file(self): self.create_file1() - self.create_file2() + self.create_file2() self.create_5_entries_file() try: - df1 = ROOT.RDataFrame( - self.tree_name, [self.file_name1, self.file_name2]) - df2 = ROOT.RDataFrame(self.tree_name, self.file_name3) + df1 = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name2]) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name3) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], @@ -2579,13 +2478,12 @@ def test05_more_than_one_file(self): 
validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 11.0, 12.0] results_x_val = [6.0, 7.0, 8.0, 9.0, 13.0, 14.0] - results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, - 25.0, 100.0, 121.0, 144.0] + results_y_train = [0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 100.0, 121.0, 144.0] results_y_val = [36.0, 49.0, 64.0, 81.0, 169.0, 196.0] collected_x_train = [] @@ -2605,11 +2503,9 @@ def test05_more_than_one_file(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -2619,53 +2515,46 @@ def test05_more_than_one_file(self): self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) except: self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) raise def test06_multiple_target_columns(self): file_name1 = "multiple_target_columns_1.root" - file_name2 = "multiple_target_columns_2.root" - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 5")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = 
"multiple_target_columns_2.root" + + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_ + 5").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df1 = ROOT.RDataFrame("myTree", file_name1) - df2 = ROOT.RDataFrame("myTree", file_name2) + df2 = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], - batch_size=3, + batch_size=3, target=["b2", "b4"], weights="b3", validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] results_x_val = [3.0, 4.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] results_z_val = [30.0, 40.0, 80.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -2702,14 +2591,11 @@ def test06_multiple_target_columns(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in 
collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -2720,33 +2606,29 @@ def test06_multiple_target_columns(self): self.assertEqual(results_z_val, flat_z_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test07_multiple_input_columns(self): file_name1 = "multiple_target_columns_1.root" - file_name2 = "multiple_target_columns_2.root" - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name1) - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 5")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name2) - + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name1) + + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_ + 5").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name2) + try: df1 = ROOT.RDataFrame("myTree", file_name1) - df2 = ROOT.RDataFrame("myTree", file_name2) - + df2 = ROOT.RDataFrame("myTree", file_name2) + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], batch_size=3, @@ -2754,11 +2636,10 @@ def test07_multiple_input_columns(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) - results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, - 20.0, 5.0, 50.0, 6.0, 60.0, 7.0, 70.0] + results_x_train = [0.0, 0.0, 1.0, 10.0, 2.0, 20.0, 5.0, 50.0, 6.0, 60.0, 7.0, 70.0] results_x_val = [3.0, 30.0, 4.0, 40.0, 8.0, 80.0, 9.0, 90.0] results_y_train = [0.0, 1.0, 4.0, 25.0, 
36.0, 49.0] results_y_val = [9.0, 16.0, 64.0, 81.0] @@ -2791,11 +2672,9 @@ def test07_multiple_input_columns(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -2804,27 +2683,27 @@ def test07_multiple_input_columns(self): self.assertEqual(results_y_val, flat_y_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test08_filtered(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) dff1 = df1.Filter("b1 % 2 == 0", "name") - dff2 = df2.Filter("b1 % 2 != 0", "name") - + dff2 = df2.Filter("b1 % 2 != 0", "name") + dff1_entries_before = dff1.AsNumpy(["rdfentry_"])["rdfentry_"] - dff2_entries_before = dff2.AsNumpy(["rdfentry_"])["rdfentry_"] - + dff2_entries_before = dff2.AsNumpy(["rdfentry_"])["rdfentry_"] + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [dff1, dff2], batch_size=3, @@ -2832,7 +2711,7 @@ def test08_filtered(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 2.0, 5.0] @@ -2860,11 +2739,9 @@ def test08_filtered(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in 
xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -2873,41 +2750,39 @@ def test08_filtered(self): self.assertEqual(results_y_val, flat_y_val) dff1_entries_after = dff1.AsNumpy(["rdfentry_"])["rdfentry_"] - dff2_entries_after = dff2.AsNumpy(["rdfentry_"])["rdfentry_"] - + dff2_entries_after = dff2.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(dff1_entries_before, dff1_entries_after)) - self.assertTrue(np.array_equal(dff2_entries_before, dff2_entries_after)) - + self.assertTrue(np.array_equal(dff2_entries_before, dff2_entries_after)) + self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test09_filtered_last_chunk(self): file_name1 = "filtered_last_chunk_1.root" - file_name2 = "filtered_last_chunk_2.root" + file_name2 = "filtered_last_chunk_2.root" tree_name = "myTree" - ROOT.RDataFrame(10)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Snapshot(tree_name, file_name1) + ROOT.RDataFrame(10).Define("b1", "(int) rdfentry_").Define("b2", "(UShort_t) b1 * b1").Snapshot( + tree_name, file_name1 + ) - ROOT.RDataFrame(10)\ - .Define("b1", "(int) rdfentry_ + 10")\ - .Define("b2", "(UShort_t) b1 * b1")\ - .Snapshot(tree_name, file_name2) + ROOT.RDataFrame(10).Define("b1", "(int) rdfentry_ + 10").Define("b2", "(UShort_t) b1 * b1").Snapshot( + tree_name, file_name2 + ) try: df1 = ROOT.RDataFrame(tree_name, file_name1) - df2 = 
ROOT.RDataFrame(tree_name, file_name2) + df2 = ROOT.RDataFrame(tree_name, file_name2) dff1 = df1.Filter("b1 % 2 == 0", "name") - dff2 = df2.Filter("b1 % 2 == 0", "name") + dff2 = df2.Filter("b1 % 2 == 0", "name") gen_train, _ = ROOT.TMVA.Experimental.CreateNumPyGenerators( [dff1, dff2], @@ -2916,13 +2791,11 @@ def test09_filtered_last_chunk(self): validation_split=0, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) - results_x_train = [0.0, 2.0, 4.0, 6.0, - 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] - results_y_train = [0.0, 4.0, 16.0, 36.0, - 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] + results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0] + results_y_train = [0.0, 4.0, 16.0, 36.0, 64.0, 100.0, 144.0, 196.0, 256.0, 324.0] collected_x_train = [] collected_y_train = [] @@ -2942,25 +2815,23 @@ def test09_filtered_last_chunk(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) self.assertEqual(results_y_train, flat_y_train) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test10_two_epochs_shuffled(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -2973,7 +2844,7 @@ def test10_two_epochs_shuffled(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) both_epochs_collected_x_val = [] @@ -3008,14 +2879,10 @@ def test10_two_epochs_shuffled(self): 
collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -3025,17 +2892,15 @@ def test10_two_epochs_shuffled(self): both_epochs_collected_x_val.append(collected_x_val) both_epochs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) - self.assertEqual( - both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + self.assertEqual(both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual(both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) def test11_number_of_training_and_validation_batches_remainder(self): self.create_file1() - self.create_file2() + self.create_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -3048,7 +2913,7 @@ def test11_number_of_training_and_validation_batches_remainder(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) number_of_training_batches = 0 @@ -3060,43 +2925,33 @@ def test11_number_of_training_and_validation_batches_remainder(self): for _ in gen_validation: number_of_validation_batches += 1 - self.assertEqual(gen_train.number_of_batches, - 
number_of_training_batches) - self.assertEqual(gen_validation.number_of_batches, - number_of_validation_batches) + self.assertEqual(gen_train.number_of_batches, number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, number_of_validation_batches) self.assertEqual(gen_train.last_batch_no_of_rows, 0) self.assertEqual(gen_validation.last_batch_no_of_rows, 1) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test12_PyTorch(self): - import torch - file_name1 = "multiple_target_columns_1.root" - file_name2 = "multiple_target_columns_2.root" - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 5")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) - + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_ + 5").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) + try: df1 = ROOT.RDataFrame("myTree", file_name1) - df2 = ROOT.RDataFrame("myTree", file_name2) + df2 = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( [df1, df2], @@ -3106,17 +2961,16 @@ def test12_PyTorch(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 
2.0, 5.0, 6.0, 7.0] results_x_val = [3.0, 4.0, 8.0, 9.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0] results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] results_z_val = [30.0, 40.0, 80.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -3153,14 +3007,11 @@ def test12_PyTorch(self): collected_y_val.append(y.tolist()) collected_z_val.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -3171,35 +3022,27 @@ def test12_PyTorch(self): self.assertEqual(results_z_val, flat_z_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test13_TensorFlow(self): - import tensorflow as tf - file_name1 = "multiple_target_columns_1.root" - file_name2 = "multiple_target_columns_2.root" - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", 
"(int) rdfentry_ + 5")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) - + file_name2 = "multiple_target_columns_2.root" + + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) rdfentry_ + 5").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) + try: df1 = ROOT.RDataFrame("myTree", file_name1) - df2 = ROOT.RDataFrame("myTree", file_name2) + df2 = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateTFDatasets( [df1, df2], @@ -3209,17 +3052,16 @@ def test13_TensorFlow(self): validation_split=0.4, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) results_x_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] results_x_val = [3.0, 4.0, 8.0, 9.0, 0.0, 0.0] - results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, - 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] + results_y_train = [0.0, 0.0, 1.0, 100.0, 4.0, 200.0, 25.0, 500.0, 36.0, 600.0, 49.0, 700.0] results_y_val = [9.0, 300.0, 16.0, 400.0, 64.0, 800.0, 81.0, 900.0, 0.0, 0.0, 0.0, 0.0] results_z_train = [0.0, 10.0, 20.0, 50.0, 60.0, 70.0] results_z_val = [30.0, 40.0, 80.0, 90.0, 0.0, 0.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -3256,14 +3098,11 @@ def test13_TensorFlow(self): collected_y_val.append(y.numpy().tolist()) collected_z_val.append(z.numpy().tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in 
collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -3274,16 +3113,16 @@ def test13_TensorFlow(self): self.assertEqual(results_z_val, flat_z_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test14_big_data(self): file_name1 = "big_data_1.root" - file_name2 = "big_data_2.root" + file_name2 = "big_data_2.root" tree_name = "myTree" entries_in_rdf = randrange(10000, 30000) @@ -3294,21 +3133,19 @@ def test14_big_data(self): Number of entries: {entries_in_rdf}" def define_rdf(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def test(size_of_batch, size_of_chunk, num_of_entries): define_rdf(num_of_entries, file_name1) - define_rdf(num_of_entries, file_name2) + define_rdf(num_of_entries, file_name2) try: df1 = ROOT.RDataFrame(tree_name, file_name1) - df2 = ROOT.RDataFrame(tree_name, file_name2) + df2 = ROOT.RDataFrame(tree_name, file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], @@ -3318,7 +3155,7 @@ def test(size_of_batch, 
size_of_chunk, num_of_entries): validation_split=0.3, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) collect_x = [] @@ -3326,10 +3163,10 @@ def test(size_of_batch, size_of_chunk, num_of_entries): train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -3337,86 +3174,70 @@ def test(size_of_batch, size_of_chunk, num_of_entries): for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message + f" row: {i}") - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message + f" row: {i}") + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message + f" row: {i}") # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - 
np.all(x[:, 0]*2 == z[:, 0]), error_message + f" row: {i}") + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message + f" row: {i}") collect_x.extend(list(x[:, 0])) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - - self.assertTrue( - np.all(x[:, 0]*(-1) == x[:, 1]), error_message) - self.assertTrue( - np.all(x[:, 0]+10192 == y[:, 0]), error_message) + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + + self.assertTrue(np.all(x[:, 0] * (-1) == x[:, 1]), error_message) + self.assertTrue(np.all(x[:, 0] + 10192 == y[:, 0]), error_message) # self.assertTrue(np.all(x[:,0]*(-1)-10192==y[:,1]), error_message) - self.assertTrue( - np.all(x[:, 0]*2 == z[:, 0]), error_message) + self.assertTrue(np.all(x[:, 0] * 2 == z[:, 0]), error_message) collect_x.extend(list(x[:, 0])) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - 
self.assertTrue(z.shape == ( - val_remainder, 1), error_message) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) collect_x.extend(list(x[:, 0])) - self.assertTrue(set(collect_x) == set(i for i in range(num_of_entries)), f"collected length: {len(set(collect_x))}\ - generated length {len(set(i for i in range(num_of_entries)))}") + self.assertTrue( + set(collect_x) == set(i for i in range(num_of_entries)), + f"collected length: {len(set(collect_x))}\ + generated length {len(set(i for i in range(num_of_entries)))}", + ) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise test(batch_size, chunk_size, entries_in_rdf) - def test15_two_runs_set_seed(self): self.create_file1() - self.create_file2() + self.create_file2() try: both_runs_collected_x_val = [] both_runs_collected_y_val = [] - + df1 = ROOT.RDataFrame(self.tree_name, self.file_name1) df2 = ROOT.RDataFrame(self.tree_name, self.file_name2) - for _ in range(2): + for _ in range(2): gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], batch_size=3, @@ -3424,8 +3245,8 @@ def test15_two_runs_set_seed(self): validation_split=0.4, shuffle=True, drop_remainder=False, - set_seed = 42, - load_eager=True + set_seed=42, + load_eager=True, ) collected_x_train = [] @@ -3456,14 +3277,10 @@ def test15_two_runs_set_seed(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in 
collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 6) self.assertEqual(len(flat_x_val), 4) @@ -3472,48 +3289,88 @@ def test15_two_runs_set_seed(self): both_runs_collected_x_val.append(collected_x_val) both_runs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_runs_collected_x_val[0], both_runs_collected_x_val[1]) - self.assertEqual( - both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + self.assertEqual(both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual(both_runs_collected_y_val[0], both_runs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) - + self.teardown_file(self.file_name2) def test16_vector_padding(self): self.create_vector_file1() - self.create_vector_file2() + self.create_vector_file2() try: df1 = ROOT.RDataFrame(self.tree_name, self.file_name4) - df2 = ROOT.RDataFrame(self.tree_name, self.file_name5) + df2 = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} - + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], batch_size=3, target="b1", validation_split=0.4, - max_vec_sizes=max_vec_sizes, + max_vec_sizes=max_vec_sizes, shuffle=False, drop_remainder=False, - load_eager=True + load_eager=True, ) - - - results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 10.0, 0, 100.0, 1000.0, - 2.0, 20.0, 0, 200.0, 2000.0, - 5.0, 50.0, 0, 500.0, 5000.0, - 6.0, 60.0, 0.0, 600.0, 6000.0, - 7.0, 70.0, 0.0, 700.0, 7000.0] - results_y_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] - results_x_val = [3.0, 30.0, 0.0, 300.0, 3000.0, - 4.0, 40.0, 0.0, 400.0, 4000.0, - 8.0, 80.0, 0.0, 800.0, 8000.0, - 9.0, 90.0, 0.0, 900.0, 9000.0] - results_y_val = [3.0, 4.0, 8.0, 9.0] + + results_x_train = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 10.0, + 0, + 
100.0, + 1000.0, + 2.0, + 20.0, + 0, + 200.0, + 2000.0, + 5.0, + 50.0, + 0, + 500.0, + 5000.0, + 6.0, + 60.0, + 0.0, + 600.0, + 6000.0, + 7.0, + 70.0, + 0.0, + 700.0, + 7000.0, + ] + results_y_train = [0.0, 1.0, 2.0, 5.0, 6.0, 7.0] + results_x_val = [ + 3.0, + 30.0, + 0.0, + 300.0, + 3000.0, + 4.0, + 40.0, + 0.0, + 400.0, + 4000.0, + 8.0, + 80.0, + 0.0, + 800.0, + 8000.0, + 9.0, + 90.0, + 0.0, + 900.0, + 9000.0, + ] + results_y_val = [3.0, 4.0, 8.0, 9.0] collected_x_train = [] collected_x_val = [] @@ -3521,15 +3378,15 @@ def test16_vector_padding(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (3, 5)) self.assertTrue(y.shape == (3, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (3, 5)) @@ -3543,11 +3400,9 @@ def test16_vector_padding(self): collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -3556,20 +3411,20 @@ def test16_vector_padding(self): self.assertEqual(results_y_val, flat_y_val) self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) except: self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) raise - -class RBatchGeneratorRandomUndersampling(unittest.TestCase): + 
+class RBatchGeneratorRandomUndersampling(unittest.TestCase): file_name1 = "major.root" file_name2 = "minor.root" - file_name3 = "second_file.root" + file_name3 = "second_file.root" file_name4 = "vector_columns_major.root" - file_name5 = "vector_columns_minor.root" + file_name5 = "vector_columns_minor.root" tree_name = "mytree" # default constants @@ -3579,61 +3434,60 @@ class RBatchGeneratorRandomUndersampling(unittest.TestCase): # Helpers def define_rdf_even(self, num_of_entries=20): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(double) b1*b1") return df def define_rdf_odd(self, num_of_entries=5): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(double) b1*b1") return df - + def create_file_major(self, num_of_entries=20): - self.define_rdf_even(num_of_entries).Snapshot( - self.tree_name, self.file_name1) + self.define_rdf_even(num_of_entries).Snapshot(self.tree_name, self.file_name1) def create_file_minor(self, num_of_entries=5): - self.define_rdf_odd(num_of_entries).Snapshot( - self.tree_name, self.file_name2) - + self.define_rdf_odd(num_of_entries).Snapshot(self.tree_name, self.file_name2) + def create_5_entries_file(self): - df1 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) 2 * (rdfentry_ + 20)")\ - .Define("b2", "(double) b1 * b1")\ + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) 2 * (rdfentry_ + 20)") + .Define("b2", "(double) b1 * b1") .Snapshot(self.tree_name, self.file_name3) + ) def create_vector_file_major(self, num_of_entries=20): - df3 = ROOT.RDataFrame(20)\ - .Define("b1", "(int) rdfentry_")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, 
self.file_name4) + ( + ROOT.RDataFrame(20) + .Define("b1", "(int) rdfentry_") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name4) + ) def create_vector_file_minor(self, num_of_entries=5): - df3 = ROOT.RDataFrame(5)\ - .Define("b1", "(int) rdfentry_ + 20")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name5) - + ( + ROOT.RDataFrame(5) + .Define("b1", "(int) rdfentry_ + 20") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name5) + ) + def teardown_file(self, file): os.remove(file) - def test01_each_element_is_generated_unshuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) major_entries_before = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] + minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], @@ -3659,33 +3513,31 @@ def test01_each_element_is_generated_unshuffled(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(train_iter) 
self.assertTrue(x.shape == (self.train_remainder, 1)) self.assertTrue(y.shape == (self.train_remainder, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -3700,19 +3552,19 @@ def test01_each_element_is_generated_unshuffled(self): # check if there are no duplicate entries (replacement=False) self.assertEqual(len(set(flat_x_train)), len(flat_x_train)) - self.assertEqual(len(set(flat_x_val)), len(flat_x_val)) + self.assertEqual(len(set(flat_x_val)), len(flat_x_val)) self.assertEqual(len(set(flat_y_train)), len(flat_y_train)) - self.assertEqual(len(set(flat_y_val)), len(flat_y_val)) - - # check if correct sampling_ratio (0.5 = minor/major) + self.assertEqual(len(set(flat_y_val)), len(flat_y_val)) + + # check if correct sampling_ratio (0.5 = minor/major) self.assertEqual(num_major_train, 6) self.assertEqual(num_minor_train, 3) self.assertEqual(num_major_val, 4) self.assertEqual(num_minor_val, 2) - + major_entries_after = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - + minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(major_entries_before, major_entries_after)) self.assertTrue(np.array_equal(minor_entries_before, minor_entries_after)) @@ -3722,12 +3574,12 @@ def test01_each_element_is_generated_unshuffled(self): except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + 
self.teardown_file(self.file_name2) raise def test01_each_element_is_generated_unshuffled_replacement(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -3746,44 +3598,74 @@ def test01_each_element_is_generated_unshuffled_replacement(self): replacement=True, ) - results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, - 14.0, 16.0, 18.0, 20.0, 22.0, - 0.0, 2.0, 4.0, 1.0, 3.0, 5.0] - results_x_val = [24.0, 26.0, 28.0, 30.0, 32.0, 34.0, - 36.0, 38.0, 24.0, 26.0, 7.0, 9.0] - results_y_train = [0.0, 4.0, 16.0, 36.0, 64.0, 100.0, 144.0, - 196.0, 256.0, 324.0, 400.0, 484.0, - 0.0, 4.0, 16.0, 1.0, 9.0, 25.0] - results_y_val = [576.0, 676.0, 784.0, 900.0, 1024.0, 1156.0, - 1296.0, 1444.0, 576.0, 676.0, 49.0, 81.0] - + results_x_train = [ + 0.0, + 2.0, + 4.0, + 6.0, + 8.0, + 10.0, + 12.0, + 14.0, + 16.0, + 18.0, + 20.0, + 22.0, + 0.0, + 2.0, + 4.0, + 1.0, + 3.0, + 5.0, + ] + results_x_val = [24.0, 26.0, 28.0, 30.0, 32.0, 34.0, 36.0, 38.0, 24.0, 26.0, 7.0, 9.0] + results_y_train = [ + 0.0, + 4.0, + 16.0, + 36.0, + 64.0, + 100.0, + 144.0, + 196.0, + 256.0, + 324.0, + 400.0, + 484.0, + 0.0, + 4.0, + 16.0, + 1.0, + 9.0, + 25.0, + ] + results_y_val = [576.0, 676.0, 784.0, 900.0, 1024.0, 1156.0, 1296.0, 1444.0, 576.0, 676.0, 49.0, 81.0] + collected_x_train = [] collected_x_val = [] collected_y_train = [] collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(6): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(9): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + + 
flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -3798,10 +3680,10 @@ def test01_each_element_is_generated_unshuffled_replacement(self): # check if there are duplicate entries (replacement=True) self.assertLess(len(set(flat_x_train)), len(flat_x_train)) - self.assertLess(len(set(flat_x_val)), len(flat_x_val)) + self.assertLess(len(set(flat_x_val)), len(flat_x_val)) self.assertLess(len(set(flat_y_train)), len(flat_y_train)) - self.assertLess(len(set(flat_y_val)), len(flat_y_val)) - + self.assertLess(len(set(flat_y_val)), len(flat_y_val)) + # check if correct sampling_ratio (0.2 = minor/major) self.assertEqual(num_major_train, 15) self.assertEqual(num_minor_train, 3) @@ -3813,13 +3695,12 @@ def test01_each_element_is_generated_unshuffled_replacement(self): except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test02_each_element_is_generated_shuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -3837,40 +3718,38 @@ def test02_each_element_is_generated_shuffled(self): sampling_ratio=0.5, replacement=False, ) - + collected_x_train = [] collected_x_val = [] collected_y_train = [] collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): 
x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(train_iter) self.assertTrue(x.shape == (self.train_remainder, 1)) self.assertTrue(y.shape == (self.train_remainder, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(len(flat_x_train), 9) @@ -3879,16 +3758,16 @@ def test02_each_element_is_generated_shuffled(self): self.assertEqual(len(flat_y_val), 6) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test04_dropping_remainder(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -3908,46 +3787,44 @@ def test04_dropping_remainder(self): ) train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + collected_x = [] collected_y = [] - + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x.append(x) collected_y.append(y) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x.append(x) collected_y.append(y) - + self.assertEqual(len(collected_x), 7) 
self.assertEqual(len(collected_y), 7) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test05_more_than_one_file(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() self.create_5_entries_file() try: - df_major = ROOT.RDataFrame( - self.tree_name, [self.file_name1, self.file_name3]) - df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) + df_major = ROOT.RDataFrame(self.tree_name, [self.file_name1, self.file_name3]) + df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], @@ -3974,16 +3851,16 @@ def test05_more_than_one_file(self): iter_train = iter(gen_train) iter_val = iter(gen_validation) - - for _ in range(self.n_train_batch): - x, y = next(iter_train) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) for _ in range(self.n_val_batch): - x, y = next(iter_val) + x, y = next(iter_val) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) @@ -3995,11 +3872,9 @@ def test05_more_than_one_file(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] 
self.assertEqual(results_x_train, flat_x_train) @@ -4009,37 +3884,31 @@ def test05_more_than_one_file(self): self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) except: self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) raise def test06_multiple_target_columns(self): file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(20)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(20).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_major = ROOT.RDataFrame("myTree", file_name1) - df_minor = ROOT.RDataFrame("myTree", file_name2) + df_minor = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -4048,18 +3917,35 @@ def test06_multiple_target_columns(self): load_eager=True, sampling_type="undersampling", sampling_ratio=0.5, - replacement=False + replacement=False, ) results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 1.0, 3.0, 5.0] 
results_x_val = [24.0, 26.0, 28.0, 30.0, 7.0, 9.0] - results_y_train = [0.0, 0.0, 4.0, 200.0, 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, - 100.0, 1000.0, 1.0, 100.0, 9.0, 300.0, 25.0, 500.0] - results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, - 700.0, 81.0, 900.0] + results_y_train = [ + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + 1.0, + 100.0, + 9.0, + 300.0, + 25.0, + 500.0, + ] + results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, 700.0, 81.0, 900.0] results_z_train = [0.0, 20.0, 40.0, 60.0, 80.0, 100.0, 10.0, 30.0, 50.0] results_z_val = [240.0, 260.0, 280.0, 300.0, 70.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -4096,14 +3982,11 @@ def test06_multiple_target_columns(self): collected_y_train.append(y.tolist()) collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -4124,33 +4007,29 @@ def test06_multiple_target_columns(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test07_multiple_input_columns(self): file_name1 = 
"multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(20)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name1) - - ROOT.RDataFrame(5)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name2) - + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(20).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name1) + + ROOT.RDataFrame(5).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name2) + try: df_major = ROOT.RDataFrame("myTree", file_name1) - df_minor = ROOT.RDataFrame("myTree", file_name2) - + df_minor = ROOT.RDataFrame("myTree", file_name2) + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, @@ -4164,10 +4043,27 @@ def test07_multiple_input_columns(self): replacement=False, ) - results_x_train = [0.0, 0.0, 2.0, 20.0, 4.0, 40.0, 6.0, 60.0, 8.0, 80.0, 10.0, 100.0, - 1.0, 10.0, 3.0, 30.0, 5.0, 50.0] - results_x_val = [24.0, 240.0, 26.0, 260.0, 28.0, 280.0, 30.0, 300.0, 7.0, 70.0, - 9.0, 90.0] + results_x_train = [ + 0.0, + 0.0, + 2.0, + 20.0, + 4.0, + 40.0, + 6.0, + 60.0, + 8.0, + 80.0, + 10.0, + 100.0, + 1.0, + 10.0, + 3.0, + 30.0, + 5.0, + 50.0, + ] + results_x_val = [24.0, 240.0, 26.0, 260.0, 28.0, 280.0, 30.0, 300.0, 7.0, 70.0, 9.0, 90.0] results_y_train = [0.0, 4.0, 16.0, 36.0, 64.0, 100.0, 1.0, 9.0, 25.0] results_y_val = [576.0, 676.0, 784.0, 900.0, 49.0, 81.0] @@ -4199,11 +4095,9 @@ def test07_multiple_input_columns(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = 
[x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -4212,16 +4106,16 @@ def test07_multiple_input_columns(self): self.assertEqual(results_y_val, flat_y_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test08_filtered(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -4258,14 +4152,14 @@ def test08_filtered(self): train_iter = iter(gen_train) val_iter = iter(gen_validation) - for _ in range(self.n_train_batch): + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - for _ in range(self.n_val_batch): + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) @@ -4278,11 +4172,9 @@ def test08_filtered(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, 
flat_x_train) @@ -4290,7 +4182,7 @@ def test08_filtered(self): self.assertEqual(results_y_train, flat_y_train) self.assertEqual(results_y_val, flat_y_val) - num_major_train = np.sum((np.asarray(flat_x_train) % 2 == 0) & (np.asarray(flat_x_train) % 8 != 0)) + num_major_train = np.sum((np.asarray(flat_x_train) % 2 == 0) & (np.asarray(flat_x_train) % 8 != 0)) num_minor_train = sum(np.array(flat_x_train) % 2 != 0) num_major_val = np.sum((np.asarray(flat_x_val) % 2 == 0) & (np.asarray(flat_x_val) % 8 != 0)) num_minor_val = sum(np.array(flat_x_val) % 2 != 0) @@ -4301,24 +4193,23 @@ def test08_filtered(self): self.assertEqual(num_minor_val, 2) major_filter_entries_after = df_major_filter.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - + minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(major_filter_entries_before, major_filter_entries_after)) self.assertTrue(np.array_equal(minor_entries_before, minor_entries_after)) - + self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test10_two_epochs_shuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -4368,14 +4259,10 @@ def test10_two_epochs_shuffled(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in 
xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 9) self.assertEqual(len(flat_x_val), 6) @@ -4385,17 +4272,15 @@ def test10_two_epochs_shuffled(self): both_epochs_collected_x_val.append(collected_x_val) both_epochs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) - self.assertEqual( - both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + self.assertEqual(both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual(both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) def test11_number_of_training_and_validation_batches_remainder(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -4423,46 +4308,36 @@ def test11_number_of_training_and_validation_batches_remainder(self): for _ in gen_validation: number_of_validation_batches += 1 - self.assertEqual(gen_train.number_of_batches, - number_of_training_batches) - self.assertEqual(gen_validation.number_of_batches, - number_of_validation_batches) + self.assertEqual(gen_train.number_of_batches, number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, number_of_validation_batches) self.assertEqual(gen_train.last_batch_no_of_rows, 1) self.assertEqual(gen_validation.last_batch_no_of_rows, 0) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test12_PyTorch(self): - 
import torch - file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(20)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(20).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_minor = ROOT.RDataFrame("myTree", file_name1) - df_major = ROOT.RDataFrame("myTree", file_name2) + df_major = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -4471,18 +4346,35 @@ def test12_PyTorch(self): load_eager=True, sampling_type="undersampling", sampling_ratio=0.5, - replacement=False + replacement=False, ) results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 1.0, 3.0, 5.0] results_x_val = [24.0, 26.0, 28.0, 30.0, 7.0, 9.0] - results_y_train = [0.0, 0.0, 4.0, 200.0, 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, - 100.0, 1000.0, 1.0, 100.0, 9.0, 300.0, 25.0, 500.0] - results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, - 700.0, 81.0, 900.0] + results_y_train = [ + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + 1.0, + 100.0, + 9.0, + 300.0, + 25.0, + 
500.0, + ] + results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, 700.0, 81.0, 900.0] results_z_train = [0.0, 20.0, 40.0, 60.0, 80.0, 100.0, 10.0, 30.0, 50.0] results_z_val = [240.0, 260.0, 280.0, 300.0, 70.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -4519,14 +4411,11 @@ def test12_PyTorch(self): collected_y_train.append(y.tolist()) collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -4547,38 +4436,30 @@ def test12_PyTorch(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test13_TensorFlow(self): - import tensorflow as tf - file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(20)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(5)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - 
.Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(20).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(5).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_minor = ROOT.RDataFrame("myTree", file_name1) - df_major = ROOT.RDataFrame("myTree", file_name2) + df_major = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -4587,18 +4468,35 @@ def test13_TensorFlow(self): load_eager=True, sampling_type="undersampling", sampling_ratio=0.5, - replacement=False + replacement=False, ) results_x_train = [0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 1.0, 3.0, 5.0] results_x_val = [24.0, 26.0, 28.0, 30.0, 7.0, 9.0] - results_y_train = [0.0, 0.0, 4.0, 200.0, 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, - 100.0, 1000.0, 1.0, 100.0, 9.0, 300.0, 25.0, 500.0] - results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, - 700.0, 81.0, 900.0] + results_y_train = [ + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + 1.0, + 100.0, + 9.0, + 300.0, + 25.0, + 500.0, + ] + results_y_val = [576.0, 2400.0, 676.0, 2600.0, 784.0, 2800.0, 900.0, 3000.0, 49.0, 700.0, 81.0, 900.0] results_z_train = [0.0, 20.0, 40.0, 60.0, 80.0, 100.0, 10.0, 30.0, 50.0] results_z_val = [240.0, 260.0, 280.0, 300.0, 70.0, 90.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -4635,14 +4533,11 @@ def test13_TensorFlow(self): collected_y_train.append(y.tolist()) collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for 
xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -4663,53 +4558,49 @@ def test13_TensorFlow(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test14_big_data_replacement_false(self): file_name1 = "big_data_major.root" - file_name2 = "big_data_minor.root" + file_name2 = "big_data_minor.root" tree_name = "myTree" entries_in_rdf_major = randrange(10000, 30000) - entries_in_rdf_minor = randrange(8000, 9999) + entries_in_rdf_minor = randrange(8000, 9999) batch_size = randrange(100, 501) min_allowed_sampling_ratio = entries_in_rdf_minor / entries_in_rdf_major sampling_ratio = round(uniform(min_allowed_sampling_ratio, 2), 2) - + error_message = f"\n Batch size: {batch_size}\ Number of major entries: {entries_in_rdf_major} \ - Number of minor entries: {entries_in_rdf_minor}" + Number of minor entries: {entries_in_rdf_minor}" def define_rdf_major(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) + 
ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def define_rdf_minor(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) - + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_ + 1").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) + def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_ratio): define_rdf_major(num_of_entries_major, file_name1) define_rdf_minor(num_of_entries_minor, file_name2) try: df1 = ROOT.RDataFrame(tree_name, file_name1) - df2 = ROOT.RDataFrame(tree_name, file_name2) + df2 = ROOT.RDataFrame(tree_name, file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], @@ -4722,19 +4613,19 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat load_eager=True, sampling_type="undersampling", sampling_ratio=sampling_ratio, - replacement=False + replacement=False, ) collected_z_train = [] - collected_z_val = [] + collected_z_val = [] train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else 
gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -4742,46 +4633,32 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_train.append(z.tolist()) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) + collected_z_train.append(z.tolist()) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == 
(size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_val.append(z.tolist()) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - self.assertTrue(z.shape == ( - val_remainder, 1), error_message) - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) + collected_z_val.append(z.tolist()) flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] @@ -4793,63 +4670,59 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat # check if there are no duplicate entries (replacement=False) self.assertEqual(len(set(flat_z_train)), len(flat_z_train)) - self.assertEqual(len(set(flat_z_val)), len(flat_z_val)) + self.assertEqual(len(set(flat_z_val)), len(flat_z_val)) # check if the sampling stategy is correct - self.assertEqual(round((num_minor_train/num_major_train), 2), sampling_ratio) - self.assertEqual(round((num_minor_val/num_major_val), 2), sampling_ratio) - + self.assertEqual(round((num_minor_train / num_major_train), 2), sampling_ratio) + self.assertEqual(round((num_minor_val / num_major_val), 2), sampling_ratio) + self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise test(batch_size, entries_in_rdf_major, entries_in_rdf_minor, sampling_ratio) def test14_big_data_replacement_true(self): file_name1 = 
"big_data_major.root" - file_name2 = "big_data_minor.root" + file_name2 = "big_data_minor.root" tree_name = "myTree" entries_in_rdf_major = randrange(10000, 30000) - entries_in_rdf_minor = randrange(8000, 9999) + entries_in_rdf_minor = randrange(8000, 9999) batch_size = randrange(100, 501) - - # max samling strategy to guarantee duplicate sampled entires + + # max samling strategy to guarantee duplicate sampled entires max_sampling_ratio = entries_in_rdf_minor / entries_in_rdf_major sampling_ratio = round(uniform(0.01, max_sampling_ratio), 2) - + error_message = f"\n Batch size: {batch_size}\ Number of major entries: {entries_in_rdf_major} \ - Number of minor entries: {entries_in_rdf_minor}" + Number of minor entries: {entries_in_rdf_minor}" def define_rdf_major(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def define_rdf_minor(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) - + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_ + 1").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) + def test(size_of_batch, 
num_of_entries_major, num_of_entries_minor, sampling_ratio): define_rdf_major(num_of_entries_major, file_name1) define_rdf_minor(num_of_entries_minor, file_name2) try: df1 = ROOT.RDataFrame(tree_name, file_name1) - df2 = ROOT.RDataFrame(tree_name, file_name2) + df2 = ROOT.RDataFrame(tree_name, file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], @@ -4862,19 +4735,19 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat load_eager=True, sampling_type="undersampling", sampling_ratio=sampling_ratio, - replacement=True + replacement=True, ) collected_z_train = [] - collected_z_val = [] + collected_z_val = [] train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -4882,46 +4755,32 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + 
self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_train.append(z.tolist()) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) + collected_z_train.append(z.tolist()) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_val.append(z.tolist()) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - self.assertTrue(z.shape == ( - val_remainder, 1), error_message) - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) + collected_z_val.append(z.tolist()) flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in 
collected_z_val for zs in zl for z in zs] @@ -4933,34 +4792,33 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat # check if there are duplicate entries (replacement=True) self.assertLess(len(set(flat_z_train)), len(flat_z_train)) - self.assertLess(len(set(flat_z_val)), len(flat_z_val)) + self.assertLess(len(set(flat_z_val)), len(flat_z_val)) # check if the sampling stategy is correct - self.assertEqual(round((num_minor_train/num_major_train), 2), sampling_ratio) - self.assertEqual(round((num_minor_val/num_major_val), 2), sampling_ratio) - + self.assertEqual(round((num_minor_train / num_major_train), 2), sampling_ratio) + self.assertEqual(round((num_minor_val / num_major_val), 2), sampling_ratio) + self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise test(batch_size, entries_in_rdf_major, entries_in_rdf_minor, sampling_ratio) - def test15_two_runs_set_seed(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: both_runs_collected_x_val = [] both_runs_collected_y_val = [] - + df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - for _ in range(2): + for _ in range(2): gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, @@ -4973,7 +4831,7 @@ def test15_two_runs_set_seed(self): sampling_ratio=0.5, replacement=False, ) - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -5002,14 +4860,10 @@ def test15_two_runs_set_seed(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = { - x for xl in collected_x_train for xs in xl for x in xs} - flat_x_val = { - x for xl in collected_x_val for xs in xl for x in xs} - flat_y_train = { - y for yl in collected_y_train for ys 
in yl for y in ys} - flat_y_val = { - y for yl in collected_y_val for ys in yl for y in ys} + flat_x_train = {x for xl in collected_x_train for xs in xl for x in xs} + flat_x_val = {x for xl in collected_x_val for xs in xl for x in xs} + flat_y_train = {y for yl in collected_y_train for ys in yl for y in ys} + flat_y_val = {y for yl in collected_y_val for ys in yl for y in ys} self.assertEqual(len(flat_x_train), 9) self.assertEqual(len(flat_x_val), 6) @@ -5018,22 +4872,19 @@ def test15_two_runs_set_seed(self): both_runs_collected_x_val.append(collected_x_val) both_runs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_runs_collected_x_val[0], both_runs_collected_x_val[1]) - self.assertEqual( - both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + self.assertEqual(both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual(both_runs_collected_y_val[0], both_runs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) - + self.teardown_file(self.file_name2) def test16_vector_padding(self): self.create_vector_file_major() - self.create_vector_file_minor() + self.create_vector_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name4) - df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) + df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( @@ -5041,7 +4892,7 @@ def test16_vector_padding(self): batch_size=2, target="b1", validation_split=0.4, - max_vec_sizes=max_vec_sizes, + max_vec_sizes=max_vec_sizes, shuffle=False, drop_remainder=False, load_eager=True, @@ -5050,22 +4901,86 @@ def test16_vector_padding(self): replacement=False, ) - results_x_train = [0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 10.0, 0.0, 100.0, 1000.0, - 2.0, 20.0, 0.0, 200.0, 2000.0, - 3.0, 30.0, 0.0, 300.0, 3000.0, - 4.0, 40.0, 0.0, 400.0, 4000.0, - 5.0, 50.0, 0.0, 500.0, 
5000.0, - 20.0, 200.0, 0.0, 2000.0, 20000.0, - 21.0, 210.0, 0.0, 2100.0, 21000.0, - 22.0, 220.0, 0.0, 2200.0, 22000.0] + results_x_train = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 10.0, + 0.0, + 100.0, + 1000.0, + 2.0, + 20.0, + 0.0, + 200.0, + 2000.0, + 3.0, + 30.0, + 0.0, + 300.0, + 3000.0, + 4.0, + 40.0, + 0.0, + 400.0, + 4000.0, + 5.0, + 50.0, + 0.0, + 500.0, + 5000.0, + 20.0, + 200.0, + 0.0, + 2000.0, + 20000.0, + 21.0, + 210.0, + 0.0, + 2100.0, + 21000.0, + 22.0, + 220.0, + 0.0, + 2200.0, + 22000.0, + ] results_y_train = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 20.0, 21.0, 22.0] - results_x_val = [12.0, 120.0, 0.0, 1200.0, 12000.0, - 13.0, 130.0, 0.0, 1300.0, 13000.0, - 14.0, 140.0, 0.0, 1400.0, 14000.0, - 15.0, 150.0, 0.0, 1500.0, 15000.0, - 23.0, 230.0, 0.0, 2300.0, 23000.0, - 24.0, 240.0, 0.0, 2400.0, 24000.0] + results_x_val = [ + 12.0, + 120.0, + 0.0, + 1200.0, + 12000.0, + 13.0, + 130.0, + 0.0, + 1300.0, + 13000.0, + 14.0, + 140.0, + 0.0, + 1400.0, + 14000.0, + 15.0, + 150.0, + 0.0, + 1500.0, + 15000.0, + 23.0, + 230.0, + 0.0, + 2300.0, + 23000.0, + 24.0, + 240.0, + 0.0, + 2400.0, + 24000.0, + ] results_y_val = [12.0, 13.0, 14.0, 15.0, 23.0, 24.0] collected_x_train = [] @@ -5074,15 +4989,15 @@ def test16_vector_padding(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 5)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 5)) @@ -5096,11 +5011,9 @@ def test16_vector_padding(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs 
in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -5112,28 +5025,27 @@ def test16_vector_padding(self): num_minor_train = sum(np.array(flat_y_train) >= 20) num_major_val = sum(np.array(flat_y_val) < 20) num_minor_val = sum(np.array(flat_y_val) >= 20) - + self.assertEqual(num_major_train, 6) self.assertEqual(num_minor_train, 3) self.assertEqual(num_major_val, 4) self.assertEqual(num_minor_val, 2) - - + self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) except: self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) raise -class RBatchGeneratorRandomOversampling(unittest.TestCase): +class RBatchGeneratorRandomOversampling(unittest.TestCase): file_name1 = "major.root" file_name2 = "minor.root" - file_name3 = "second_file.root" + file_name3 = "second_file.root" file_name4 = "vector_columns_major.root" - file_name5 = "vector_columns_minor.root" + file_name5 = "vector_columns_minor.root" tree_name = "mytree" # default constants @@ -5143,62 +5055,61 @@ class RBatchGeneratorRandomOversampling(unittest.TestCase): # Helpers def define_rdf_even(self, num_of_entries=20): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(double) b1*b1") return df def define_rdf_odd(self, num_of_entries=5): - df = ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(double) b1*b1") + df = ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(double) b1*b1") return df - + def create_file_major(self, num_of_entries=10): - 
self.define_rdf_even(num_of_entries).Snapshot( - self.tree_name, self.file_name1) + self.define_rdf_even(num_of_entries).Snapshot(self.tree_name, self.file_name1) def create_file_minor(self, num_of_entries=3): - self.define_rdf_odd(num_of_entries).Snapshot( - self.tree_name, self.file_name2) - + self.define_rdf_odd(num_of_entries).Snapshot(self.tree_name, self.file_name2) + def create_extra_entry_file(self): - df1 = ROOT.RDataFrame(1)\ - .Define("b1", "(int) 2 * (rdfentry_ + 3) + 1")\ - .Define("b2", "(double) b1 * b1")\ + ( + ROOT.RDataFrame(1) + .Define("b1", "(int) 2 * (rdfentry_ + 3) + 1") + .Define("b2", "(double) b1 * b1") .Snapshot(self.tree_name, self.file_name3) + ) def create_vector_file_major(self, num_of_entries=10): - df3 = ROOT.RDataFrame(10)\ - .Define("b1", "(int) rdfentry_")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name4) + ( + ROOT.RDataFrame(10) + .Define("b1", "(int) rdfentry_") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name4) + ) def create_vector_file_minor(self, num_of_entries=3): - df3 = ROOT.RDataFrame(3)\ - .Define("b1", "(int) rdfentry_ + 10")\ - .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}")\ - .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}")\ - .Snapshot(self.tree_name, self.file_name5) - + ( + ROOT.RDataFrame(3) + .Define("b1", "(int) rdfentry_ + 10") + .Define("v1", "ROOT::VecOps::RVec{ b1, b1 * 10}") + .Define("v2", "ROOT::VecOps::RVec{ b1 * 100, b1 * 1000}") + .Snapshot(self.tree_name, self.file_name5) + ) + def teardown_file(self, file): os.remove(file) - def test01_each_element_is_generated_unshuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, 
self.file_name2) major_entries_before = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - + minor_entries_before = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, @@ -5222,33 +5133,31 @@ def test01_each_element_is_generated_unshuffled(self): collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(train_iter) self.assertTrue(x.shape == (self.train_remainder, 1)) self.assertTrue(y.shape == (self.train_remainder, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -5263,34 +5172,34 @@ def test01_each_element_is_generated_unshuffled(self): # check if there are no duplicate entries (oversampling) self.assertLess(len(set(flat_x_train)), len(flat_x_train)) - self.assertLess(len(set(flat_x_val)), len(flat_x_val)) + self.assertLess(len(set(flat_x_val)), len(flat_x_val)) self.assertLess(len(set(flat_y_train)), 
len(flat_y_train)) - self.assertLess(len(set(flat_y_val)), len(flat_y_val)) - - # check if correct sampling_ratio (0.5 = minor/major) + self.assertLess(len(set(flat_y_val)), len(flat_y_val)) + + # check if correct sampling_ratio (0.5 = minor/major) self.assertEqual(num_major_train, 6) self.assertEqual(num_minor_train, 3) self.assertEqual(num_major_val, 4) self.assertEqual(num_minor_val, 2) major_entries_after = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] - + minor_entries_after = df_minor.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(major_entries_before, major_entries_after)) self.assertTrue(np.array_equal(minor_entries_before, minor_entries_after)) - + self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test02_each_element_is_generated_shuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -5307,65 +5216,63 @@ def test02_each_element_is_generated_shuffled(self): sampling_type="oversampling", sampling_ratio=0.5, ) - + collected_x_train = [] collected_x_val = [] collected_y_train = [] collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - + x, y = next(train_iter) self.assertTrue(x.shape == 
(self.train_remainder, 1)) self.assertTrue(y.shape == (self.train_remainder, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] # check if there are no duplicate entries (oversampling) self.assertLess(len(set(flat_x_train)), len(flat_x_train)) - self.assertLess(len(set(flat_x_val)), len(flat_x_val)) + self.assertLess(len(set(flat_x_val)), len(flat_x_val)) self.assertLess(len(set(flat_y_train)), len(flat_y_train)) - self.assertLess(len(set(flat_y_val)), len(flat_y_val)) - - # check if correct sampling_ratio (0.5 = minor/major) + self.assertLess(len(set(flat_y_val)), len(flat_y_val)) + + # check if correct sampling_ratio (0.5 = minor/major) self.assertEqual(len(flat_x_train), 9) self.assertEqual(len(flat_x_val), 6) self.assertEqual(len(flat_y_train), 9) self.assertEqual(len(flat_y_val), 6) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test04_dropping_remainder(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -5384,46 +5291,44 @@ def test04_dropping_remainder(self): ) train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + collected_x = [] collected_y = [] - + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape 
== (2, 1)) collected_x.append(x) collected_y.append(y) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x.append(x) collected_y.append(y) - + self.assertEqual(len(collected_x), 7) self.assertEqual(len(collected_y), 7) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test05_more_than_one_file(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() self.create_extra_entry_file() try: - df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) - df_minor = ROOT.RDataFrame( - self.tree_name, [self.file_name2, self.file_name3]) + df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) + df_minor = ROOT.RDataFrame(self.tree_name, [self.file_name2, self.file_name3]) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], @@ -5436,12 +5341,12 @@ def test05_more_than_one_file(self): sampling_type="oversampling", sampling_ratio=0.5, ) - + results_x_train = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0] results_x_val = [7.0, 7.0, 12.0, 14.0, 16.0, 18.0] results_y_train = [1.0, 9.0, 25.0, 0.0, 4.0, 16.0, 36.0, 64.0, 100.0] results_y_val = [49.0, 49.0, 144.0, 196.0, 256.0, 324.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -5449,16 +5354,16 @@ def test05_more_than_one_file(self): iter_train = iter(gen_train) iter_val = iter(gen_validation) - - for _ in range(self.n_train_batch): - x, y = next(iter_train) + + for _ in range(self.n_train_batch): + x, y = next(iter_train) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) for _ in range(self.n_val_batch): - x, y = next(iter_val) + x, y = 
next(iter_val) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) @@ -5470,11 +5375,9 @@ def test05_more_than_one_file(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -5484,37 +5387,31 @@ def test05_more_than_one_file(self): self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) except: self.teardown_file(self.file_name1) self.teardown_file(self.file_name2) - self.teardown_file(self.file_name3) + self.teardown_file(self.file_name3) raise def test06_multiple_target_columns(self): file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(10)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(3)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(10).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(3).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 
* b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_major = ROOT.RDataFrame("myTree", file_name1) - df_minor = ROOT.RDataFrame("myTree", file_name2) + df_minor = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -5527,13 +5424,30 @@ def test06_multiple_target_columns(self): results_x_train = [1.0, 3.0, 1.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0] results_x_val = [5.0, 5.0, 12.0, 14.0, 16.0, 18.0] - results_y_train = [1.0, 100.0, 9.0, 300.0, 1.0, 100.0, 0.0, 0.0, 4.0, 200.0, - 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, 100.0, 1000.0] - results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, - 256.0, 1600.0, 324.0, 1800.0] + results_y_train = [ + 1.0, + 100.0, + 9.0, + 300.0, + 1.0, + 100.0, + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + ] + results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, 256.0, 1600.0, 324.0, 1800.0] results_z_train = [10.0, 30.0, 10.0, 0.0, 20.0, 40.0, 60.0, 80.0, 100.0] results_z_val = [50.0, 50.0, 120.0, 140.0, 160.0, 180.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -5570,14 +5484,11 @@ def test06_multiple_target_columns(self): collected_y_train.append(y.tolist()) collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl 
for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -5598,33 +5509,29 @@ def test06_multiple_target_columns(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test07_multiple_input_columns(self): file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(10)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name1) - - ROOT.RDataFrame(3)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Snapshot("myTree", file_name2) - + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(10).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name1) + + ROOT.RDataFrame(3).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Snapshot("myTree", file_name2) + try: df_major = ROOT.RDataFrame("myTree", file_name1) - df_minor = ROOT.RDataFrame("myTree", file_name2) - + df_minor = ROOT.RDataFrame("myTree", file_name2) + gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, @@ -5636,10 +5543,27 @@ def test07_multiple_input_columns(self): sampling_type="oversampling", sampling_ratio=0.5, ) - results_x_train = [1.0, 10.0, 3.0, 30.0, 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, - 4.0, 40.0, 6.0, 60.0, 8.0, 80.0, 10.0, 100.0] - results_x_val = [5.0, 50.0, 5.0, 50.0, 12.0, 120.0, 14.0, 140.0, 16.0, 160.0, - 18.0, 180.0] + results_x_train 
= [ + 1.0, + 10.0, + 3.0, + 30.0, + 1.0, + 10.0, + 0.0, + 0.0, + 2.0, + 20.0, + 4.0, + 40.0, + 6.0, + 60.0, + 8.0, + 80.0, + 10.0, + 100.0, + ] + results_x_val = [5.0, 50.0, 5.0, 50.0, 12.0, 120.0, 14.0, 140.0, 16.0, 160.0, 18.0, 180.0] results_y_train = [1.0, 9.0, 1.0, 0.0, 4.0, 16.0, 36.0, 64.0, 100.0] results_y_val = [25.0, 25.0, 144.0, 196.0, 256.0, 324.0] @@ -5671,11 +5595,9 @@ def test07_multiple_input_columns(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -5684,22 +5606,20 @@ def test07_multiple_input_columns(self): self.assertEqual(results_y_val, flat_y_val) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test08_filtered(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) - df_minor_duplicate = ROOT.RDataFrame(self.tree_name, [self.file_name2, - self.file_name2, - self.file_name2]) + df_minor_duplicate = ROOT.RDataFrame(self.tree_name, [self.file_name2, self.file_name2, self.file_name2]) df_minor_filter = df_minor_duplicate.Filter("rdfentry_ < 3", "name") @@ -5731,14 +5651,14 @@ def test08_filtered(self): train_iter = iter(gen_train) val_iter = iter(gen_validation) - for _ in range(self.n_train_batch): + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape 
== (2, 1)) self.assertTrue(y.shape == (2, 1)) collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - for _ in range(self.n_val_batch): + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 1)) self.assertTrue(y.shape == (2, 1)) @@ -5751,11 +5671,9 @@ def test08_filtered(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -5774,23 +5692,23 @@ def test08_filtered(self): self.assertEqual(num_minor_val, 2) major_entries_after = df_major.AsNumpy(["rdfentry_"])["rdfentry_"] - minor_filter_entries_after = df_minor_filter.AsNumpy(["rdfentry_"])["rdfentry_"] - + minor_filter_entries_after = df_minor_filter.AsNumpy(["rdfentry_"])["rdfentry_"] + # check if the dataframes are correctly reset self.assertTrue(np.array_equal(major_entries_before, major_entries_after)) self.assertTrue(np.array_equal(minor_filter_entries_before, minor_filter_entries_after)) - + self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise def test10_two_epochs_shuffled(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -5839,14 +5757,10 @@ def test10_two_epochs_shuffled(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - 
flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] - flat_x_val = [ - x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] - flat_y_val = [ - y for yl in collected_y_val for ys in yl for y in ys] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(len(flat_x_train), 9) self.assertEqual(len(flat_x_val), 6) @@ -5856,17 +5770,15 @@ def test10_two_epochs_shuffled(self): both_epochs_collected_x_val.append(collected_x_val) both_epochs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) - self.assertEqual( - both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) + self.assertEqual(both_epochs_collected_x_val[0], both_epochs_collected_x_val[1]) + self.assertEqual(both_epochs_collected_y_val[0], both_epochs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) def test11_number_of_training_and_validation_batches_remainder(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) @@ -5893,47 +5805,36 @@ def test11_number_of_training_and_validation_batches_remainder(self): for _ in gen_validation: number_of_validation_batches += 1 - self.assertEqual(gen_train.number_of_batches, - number_of_training_batches) - self.assertEqual(gen_validation.number_of_batches, - number_of_validation_batches) + self.assertEqual(gen_train.number_of_batches, number_of_training_batches) + self.assertEqual(gen_validation.number_of_batches, number_of_validation_batches) 
self.assertEqual(gen_train.last_batch_no_of_rows, 1) self.assertEqual(gen_validation.last_batch_no_of_rows, 0) self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) except: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) + self.teardown_file(self.file_name2) raise - def test12_PyTorch(self): - import torch - file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(10)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name1) - ROOT.RDataFrame(3)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(10).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(3).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_minor = ROOT.RDataFrame("myTree", file_name1) - df_major = ROOT.RDataFrame("myTree", file_name2) + df_major = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -5946,10 +5847,27 @@ def test12_PyTorch(self): results_x_train = [1.0, 3.0, 1.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0] results_x_val = [5.0, 5.0, 12.0, 14.0, 16.0, 18.0] - results_y_train = [1.0, 100.0, 9.0, 300.0, 1.0, 100.0, 0.0, 0.0, 4.0, 200.0, - 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, 100.0, 1000.0] - 
results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, - 256.0, 1600.0, 324.0, 1800.0] + results_y_train = [ + 1.0, + 100.0, + 9.0, + 300.0, + 1.0, + 100.0, + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + ] + results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, 256.0, 1600.0, 324.0, 1800.0] results_z_train = [10.0, 30.0, 10.0, 0.0, 20.0, 40.0, 60.0, 80.0, 100.0] results_z_val = [50.0, 50.0, 120.0, 140.0, 160.0, 180.0] @@ -5989,14 +5907,11 @@ def test12_PyTorch(self): collected_y_train.append(y.tolist()) collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -6017,38 +5932,30 @@ def test12_PyTorch(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test13_TensorFlow(self): - import tensorflow as tf - file_name1 = "multiple_target_columns_major.root" - file_name2 = "multiple_target_columns_minor.root" - - ROOT.RDataFrame(10)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", 
file_name1) - ROOT.RDataFrame(3)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(int) b1 * b1")\ - .Define("b3", "(double) b1 * 10")\ - .Define("b4", "(double) b3 * 10")\ - .Snapshot("myTree", file_name2) + file_name2 = "multiple_target_columns_minor.root" + + ROOT.RDataFrame(10).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name1) + ROOT.RDataFrame(3).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1 * b1").Define( + "b3", "(double) b1 * 10" + ).Define("b4", "(double) b3 * 10").Snapshot("myTree", file_name2) try: df_minor = ROOT.RDataFrame("myTree", file_name1) - df_major = ROOT.RDataFrame("myTree", file_name2) + df_major = ROOT.RDataFrame("myTree", file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreatePyTorchGenerators( [df_minor, df_major], - batch_size=2, + batch_size=2, target=["b2", "b4"], weights="b3", validation_split=0.4, @@ -6061,13 +5968,30 @@ def test13_TensorFlow(self): results_x_train = [1.0, 3.0, 1.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0] results_x_val = [5.0, 5.0, 12.0, 14.0, 16.0, 18.0] - results_y_train = [1.0, 100.0, 9.0, 300.0, 1.0, 100.0, 0.0, 0.0, 4.0, 200.0, - 16.0, 400.0, 36.0, 600.0, 64.0, 800.0, 100.0, 1000.0] - results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, - 256.0, 1600.0, 324.0, 1800.0] + results_y_train = [ + 1.0, + 100.0, + 9.0, + 300.0, + 1.0, + 100.0, + 0.0, + 0.0, + 4.0, + 200.0, + 16.0, + 400.0, + 36.0, + 600.0, + 64.0, + 800.0, + 100.0, + 1000.0, + ] + results_y_val = [25.0, 500.0, 25.0, 500.0, 144.0, 1200.0, 196.0, 1400.0, 256.0, 1600.0, 324.0, 1800.0] results_z_train = [10.0, 30.0, 10.0, 0.0, 20.0, 40.0, 60.0, 80.0, 100.0] results_z_val = [50.0, 50.0, 120.0, 140.0, 160.0, 180.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -6104,14 +6028,11 @@ def test13_TensorFlow(self): collected_y_train.append(y.tolist()) 
collected_z_train.append(z.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] - flat_z_train = [ - z for zl in collected_z_train for zs in zl for z in zs] + flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] self.assertEqual(results_x_train, flat_x_train) @@ -6132,52 +6053,48 @@ def test13_TensorFlow(self): self.assertEqual(num_minor_val, 2) self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise def test14_big_data_replacement_false(self): file_name1 = "big_data_major.root" - file_name2 = "big_data_minor.root" + file_name2 = "big_data_minor.root" tree_name = "myTree" entries_in_rdf_major = randrange(10000, 30000) - entries_in_rdf_minor = randrange(8000, 9999) + entries_in_rdf_minor = randrange(8000, 9999) batch_size = randrange(100, 501) sampling_ratio = round(uniform(0.1, 2), 2) - + error_message = f"\n Batch size: {batch_size}\ Number of major entries: {entries_in_rdf_major} \ - Number of minor entries: {entries_in_rdf_minor}" + Number of minor entries: {entries_in_rdf_minor}" def define_rdf_major(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) + 
ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) def define_rdf_minor(num_of_entries, file_name): - ROOT.RDataFrame(num_of_entries)\ - .Define("b1", "(int) 2 * rdfentry_ + 1")\ - .Define("b2", "(double) rdfentry_ * 2")\ - .Define("b3", "(int) rdfentry_ + 10192")\ - .Define("b4", "(int) -rdfentry_")\ - .Define("b5", "(double) -rdfentry_ - 10192")\ - .Snapshot(tree_name, file_name) - + ROOT.RDataFrame(num_of_entries).Define("b1", "(int) 2 * rdfentry_ + 1").Define( + "b2", "(double) rdfentry_ * 2" + ).Define("b3", "(int) rdfentry_ + 10192").Define("b4", "(int) -rdfentry_").Define( + "b5", "(double) -rdfentry_ - 10192" + ).Snapshot(tree_name, file_name) + def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_ratio): define_rdf_major(num_of_entries_major, file_name1) define_rdf_minor(num_of_entries_minor, file_name2) try: df1 = ROOT.RDataFrame(tree_name, file_name1) - df2 = ROOT.RDataFrame(tree_name, file_name2) + df2 = ROOT.RDataFrame(tree_name, file_name2) gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df1, df2], @@ -6193,15 +6110,15 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat ) collected_z_train = [] - collected_z_val = [] + collected_z_val = [] train_remainder = gen_train.last_batch_no_of_rows val_remainder = gen_validation.last_batch_no_of_rows - n_train_batches = gen_train.number_of_batches - \ - 1 if train_remainder else gen_train.number_of_batches - n_val_batches = gen_validation.number_of_batches - \ - 1 if val_remainder else gen_validation.number_of_batches + n_train_batches = gen_train.number_of_batches - 1 if train_remainder else gen_train.number_of_batches + n_val_batches = ( + gen_validation.number_of_batches - 1 if val_remainder else 
gen_validation.number_of_batches + ) iter_train = iter(gen_train) iter_val = iter(gen_validation) @@ -6209,46 +6126,32 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat for i in range(n_train_batches): x, y, z = next(iter_train) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == (size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_train.append(z.tolist()) if train_remainder: x, y, z = next(iter_train) - self.assertTrue(x.shape == ( - train_remainder, 2), error_message) - self.assertTrue(y.shape == ( - train_remainder, 2), error_message) - self.assertTrue(z.shape == ( - train_remainder, 1), error_message) - collected_z_train.append(z.tolist()) - + self.assertTrue(x.shape == (train_remainder, 2), error_message) + self.assertTrue(y.shape == (train_remainder, 2), error_message) + self.assertTrue(z.shape == (train_remainder, 1), error_message) + collected_z_train.append(z.tolist()) for _ in range(n_val_batches): x, y, z = next(iter_val) - self.assertTrue(x.shape == (size_of_batch, 2), - error_message + f" row: {i} x shape: {x.shape}") - self.assertTrue(y.shape == (size_of_batch, 2), - error_message + f" row: {i} y shape: {y.shape}") - self.assertTrue(z.shape == (size_of_batch, 1), - error_message + f" row: {i} z shape: {z.shape}") - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == (size_of_batch, 2), error_message + f" row: {i} x shape: {x.shape}") + self.assertTrue(y.shape == 
(size_of_batch, 2), error_message + f" row: {i} y shape: {y.shape}") + self.assertTrue(z.shape == (size_of_batch, 1), error_message + f" row: {i} z shape: {z.shape}") + collected_z_val.append(z.tolist()) if val_remainder: x, y, z = next(iter_val) - self.assertTrue(x.shape == ( - val_remainder, 2), error_message) - self.assertTrue(y.shape == ( - val_remainder, 2), error_message) - self.assertTrue(z.shape == ( - val_remainder, 1), error_message) - collected_z_val.append(z.tolist()) + self.assertTrue(x.shape == (val_remainder, 2), error_message) + self.assertTrue(y.shape == (val_remainder, 2), error_message) + self.assertTrue(z.shape == (val_remainder, 1), error_message) + collected_z_val.append(z.tolist()) flat_z_train = [z for zl in collected_z_train for zs in zl for z in zs] flat_z_val = [z for zl in collected_z_val for zs in zl for z in zs] @@ -6260,34 +6163,33 @@ def test(size_of_batch, num_of_entries_major, num_of_entries_minor, sampling_rat # check if there are no duplicate entries (replacement=False) self.assertLessEqual(len(set(flat_z_train)), len(flat_z_train)) - self.assertLessEqual(len(set(flat_z_val)), len(flat_z_val)) + self.assertLessEqual(len(set(flat_z_val)), len(flat_z_val)) # check if the sampling stategy is correct - self.assertEqual(round((num_minor_train/num_major_train), 2), sampling_ratio) - self.assertEqual(round((num_minor_val/num_major_val), 2), sampling_ratio) - + self.assertEqual(round((num_minor_train / num_major_train), 2), sampling_ratio) + self.assertEqual(round((num_minor_val / num_major_val), 2), sampling_ratio) + self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) except: self.teardown_file(file_name1) - self.teardown_file(file_name2) + self.teardown_file(file_name2) raise test(batch_size, entries_in_rdf_major, entries_in_rdf_minor, sampling_ratio) - def test15_two_runs_set_seed(self): self.create_file_major() - self.create_file_minor() + self.create_file_minor() try: 
both_runs_collected_x_val = [] both_runs_collected_y_val = [] - + df_major = ROOT.RDataFrame(self.tree_name, self.file_name1) df_minor = ROOT.RDataFrame(self.tree_name, self.file_name2) - for _ in range(2): + for _ in range(2): gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( [df_major, df_minor], batch_size=2, @@ -6299,7 +6201,7 @@ def test15_two_runs_set_seed(self): sampling_type="oversampling", sampling_ratio=0.5, ) - + collected_x_train = [] collected_x_val = [] collected_y_train = [] @@ -6328,14 +6230,10 @@ def test15_two_runs_set_seed(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] - flat_x_val = [ - x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] - flat_y_val = [ - y for yl in collected_y_val for ys in yl for y in ys] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] + flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] + flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(len(flat_x_train), 9) self.assertEqual(len(flat_x_val), 6) @@ -6344,21 +6242,19 @@ def test15_two_runs_set_seed(self): both_runs_collected_x_val.append(collected_x_val) both_runs_collected_y_val.append(collected_y_val) - self.assertEqual( - both_runs_collected_x_val[0], both_runs_collected_x_val[1]) - self.assertEqual( - both_runs_collected_y_val[0], both_runs_collected_y_val[1]) + self.assertEqual(both_runs_collected_x_val[0], both_runs_collected_x_val[1]) + self.assertEqual(both_runs_collected_y_val[0], both_runs_collected_y_val[1]) finally: self.teardown_file(self.file_name1) - self.teardown_file(self.file_name2) - + self.teardown_file(self.file_name2) + def test16_vector_padding(self): self.create_vector_file_major() - 
self.create_vector_file_minor() + self.create_vector_file_minor() try: df_major = ROOT.RDataFrame(self.tree_name, self.file_name4) - df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) + df_minor = ROOT.RDataFrame(self.tree_name, self.file_name5) max_vec_sizes = {"v1": 3, "v2": 2} gen_train, gen_validation = ROOT.TMVA.Experimental.CreateNumPyGenerators( @@ -6366,7 +6262,7 @@ def test16_vector_padding(self): batch_size=2, target="b1", validation_split=0.4, - max_vec_sizes=max_vec_sizes, + max_vec_sizes=max_vec_sizes, shuffle=False, drop_remainder=False, load_eager=True, @@ -6375,39 +6271,103 @@ def test16_vector_padding(self): replacement=False, ) - results_x_train = [10.0, 100.0, 0.0, 1000.0, 10000.0, - 11.0, 110.0, 0.0, 1100.0, 11000.0, - 10.0, 100.0, 0.0, 1000.0, 10000.0, - 0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 10.0, 0.0, 100.0, 1000.0, - 2.0, 20.0, 0.0, 200.0, 2000.0, - 3.0, 30.0, 0.0, 300.0, 3000.0, - 4.0, 40.0, 0.0, 400.0, 4000.0, - 5.0, 50.0, 0.0, 500.0, 5000.0] - results_x_val = [12.0, 120.0, 0.0, 1200.0, 12000.0, - 12.0, 120.0, 0.0, 1200.0, 12000.0, - 6.0, 60.0, 0.0, 600.0, 6000.0, - 7.0, 70.0, 0.0, 700.0, 7000.0, - 8.0, 80.0, 0.0, 800.0, 8000.0, - 9.0, 90.0, 0.0, 900.0, 9000.0] + results_x_train = [ + 10.0, + 100.0, + 0.0, + 1000.0, + 10000.0, + 11.0, + 110.0, + 0.0, + 1100.0, + 11000.0, + 10.0, + 100.0, + 0.0, + 1000.0, + 10000.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 10.0, + 0.0, + 100.0, + 1000.0, + 2.0, + 20.0, + 0.0, + 200.0, + 2000.0, + 3.0, + 30.0, + 0.0, + 300.0, + 3000.0, + 4.0, + 40.0, + 0.0, + 400.0, + 4000.0, + 5.0, + 50.0, + 0.0, + 500.0, + 5000.0, + ] + results_x_val = [ + 12.0, + 120.0, + 0.0, + 1200.0, + 12000.0, + 12.0, + 120.0, + 0.0, + 1200.0, + 12000.0, + 6.0, + 60.0, + 0.0, + 600.0, + 6000.0, + 7.0, + 70.0, + 0.0, + 700.0, + 7000.0, + 8.0, + 80.0, + 0.0, + 800.0, + 8000.0, + 9.0, + 90.0, + 0.0, + 900.0, + 9000.0, + ] results_y_train = [10.0, 11.0, 10.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0] results_y_val = [12.0, 12.0, 6.0, 7.0, 8.0, 
9.0] - + collected_x_train = [] collected_x_val = [] collected_y_train = [] collected_y_val = [] train_iter = iter(gen_train) - val_iter = iter(gen_validation) - + val_iter = iter(gen_validation) + for _ in range(self.n_val_batch): x, y = next(val_iter) self.assertTrue(x.shape == (2, 5)) self.assertTrue(y.shape == (2, 1)) collected_x_val.append(x.tolist()) collected_y_val.append(y.tolist()) - + for _ in range(self.n_train_batch): x, y = next(train_iter) self.assertTrue(x.shape == (2, 5)) @@ -6421,11 +6381,9 @@ def test16_vector_padding(self): collected_x_train.append(x.tolist()) collected_y_train.append(y.tolist()) - flat_x_train = [ - x for xl in collected_x_train for xs in xl for x in xs] + flat_x_train = [x for xl in collected_x_train for xs in xl for x in xs] flat_x_val = [x for xl in collected_x_val for xs in xl for x in xs] - flat_y_train = [ - y for yl in collected_y_train for ys in yl for y in ys] + flat_y_train = [y for yl in collected_y_train for ys in yl for y in ys] flat_y_val = [y for yl in collected_y_val for ys in yl for y in ys] self.assertEqual(results_x_train, flat_x_train) @@ -6437,20 +6395,20 @@ def test16_vector_padding(self): num_minor_train = sum(np.array(flat_y_train) >= 10) num_major_val = sum(np.array(flat_y_val) < 10) num_minor_val = sum(np.array(flat_y_val) >= 10) - + self.assertEqual(num_major_train, 6) self.assertEqual(num_minor_train, 3) self.assertEqual(num_major_val, 4) self.assertEqual(num_minor_val, 2) - - + self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) except: self.teardown_file(self.file_name4) - self.teardown_file(self.file_name5) + self.teardown_file(self.file_name5) raise -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx index 45d5923a87132..ef559b31d6b50 100644 --- 
a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchGenerator.hxx @@ -50,7 +50,7 @@ template class RBatchGenerator { private: std::vector fCols; - std::vector fVecSizes; + std::vector fVecSizes; // clang-format on std::size_t fChunkSize; std::size_t fMaxChunks; @@ -60,7 +60,7 @@ private: float fValidationSplit; - std::unique_ptr> fDatasetLoader; + std::unique_ptr> fDatasetLoader; std::unique_ptr> fChunkLoader; std::unique_ptr fTrainingBatchLoader; std::unique_ptr fValidationBatchLoader; @@ -68,9 +68,9 @@ private: std::unique_ptr fValidationSampler; std::unique_ptr fTensorOperators; - + std::vector f_rdfs; - + std::unique_ptr fLoadingThread; std::size_t fTrainingChunkNum; @@ -83,8 +83,8 @@ private: bool fLoadEager; std::string fSampleType; float fSampleRatio; - bool fReplacement; - + bool fReplacement; + bool fIsActive{false}; // Whether the loading thread is active bool fUseWholeFile; @@ -104,10 +104,10 @@ private: RFlat2DMatrix fTrainingDataset; RFlat2DMatrix fValidationDataset; - + RFlat2DMatrix fSampledTrainingDataset; RFlat2DMatrix fSampledValidationDataset; - + RFlat2DMatrix fTrainTensor; RFlat2DMatrix fTrainChunkTensor; @@ -124,7 +124,7 @@ public: : f_rdfs(rdfs), fCols(cols), - fVecSizes(vecSizes), + fVecSizes(vecSizes), fChunkSize(chunkSize), fBlockSize(blockSize), fBatchSize(batchSize), @@ -140,57 +140,56 @@ public: fUseWholeFile(maxChunks == 0) { fTensorOperators = std::make_unique(fShuffle, fSetSeed); - + if (fLoadEager) { fDatasetLoader = std::make_unique>(f_rdfs, fValidationSplit, fCols, fVecSizes, - vecPadding, fShuffle, fSetSeed); + vecPadding, fShuffle, fSetSeed); // split the datasets and extract the training and validation datasets fDatasetLoader->SplitDatasets(); if (fSampleType == "") { fDatasetLoader->ConcatenateDatasets(); - + fTrainingDataset = fDatasetLoader->GetTrainingDataset(); - fValidationDataset = fDatasetLoader->GetValidationDataset(); - + fValidationDataset = 
fDatasetLoader->GetValidationDataset(); + fNumTrainingEntries = fDatasetLoader->GetNumTrainingEntries(); fNumValidationEntries = fDatasetLoader->GetNumValidationEntries(); } else { fTrainingDatasets = fDatasetLoader->GetTrainingDatasets(); - fValidationDatasets = fDatasetLoader->GetValidationDatasets(); - + fValidationDatasets = fDatasetLoader->GetValidationDatasets(); + fTrainingSampler = std::make_unique(fTrainingDatasets, fSampleType, fSampleRatio, fReplacement, fShuffle, fSetSeed); - fValidationSampler = std::make_unique(fValidationDatasets, fSampleType, fSampleRatio, fReplacement, - fShuffle, fSetSeed); + fValidationSampler = std::make_unique(fValidationDatasets, fSampleType, fSampleRatio, + fReplacement, fShuffle, fSetSeed); - fNumTrainingEntries = fTrainingSampler->GetNumEntries(); + fNumTrainingEntries = fTrainingSampler->GetNumEntries(); fNumValidationEntries = fValidationSampler->GetNumEntries(); } } else { - fChunkLoader = - std::make_unique>(f_rdfs[0], fChunkSize, fBlockSize, fValidationSplit, - fCols, fVecSizes, vecPadding, fShuffle, fSetSeed); + fChunkLoader = std::make_unique>(f_rdfs[0], fChunkSize, fBlockSize, fValidationSplit, + fCols, fVecSizes, vecPadding, fShuffle, fSetSeed); // split the dataset into training and validation sets fChunkLoader->SplitDataset(); fNumTrainingEntries = fChunkLoader->GetNumTrainingEntries(); - fNumValidationEntries = fChunkLoader->GetNumValidationEntries(); + fNumValidationEntries = fChunkLoader->GetNumValidationEntries(); // number of training and validation chunks, calculated in RChunkConstructor fNumTrainingChunks = fChunkLoader->GetNumTrainingChunks(); fNumValidationChunks = fChunkLoader->GetNumValidationChunks(); } - fTrainingBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, - fNumTrainingEntries, fDropRemainder); - fValidationBatchLoader = std::make_unique(fBatchSize, fCols, fVecSizes, - fNumValidationEntries, fDropRemainder); + fTrainingBatchLoader = + std::make_unique(fBatchSize, fCols, fVecSizes, 
fNumTrainingEntries, fDropRemainder); + fValidationBatchLoader = + std::make_unique(fBatchSize, fCols, fVecSizes, fNumValidationEntries, fDropRemainder); } ~RBatchGenerator() { DeActivate(); } @@ -203,7 +202,7 @@ public: } fTrainingBatchLoader->DeActivate(); - fValidationBatchLoader->DeActivate(); + fValidationBatchLoader->DeActivate(); if (fLoadingThread) { if (fLoadingThread->joinable()) { @@ -225,7 +224,7 @@ public: } fTrainingBatchLoader->Activate(); - fValidationBatchLoader->Activate(); + fValidationBatchLoader->Activate(); // fLoadingThread = std::make_unique(&RBatchGenerator::LoadChunks, this); } @@ -241,10 +240,11 @@ public: void DeActivateValidationEpoch() { fValidationEpochActive = false; } - /// \brief Create training batches by first loading a chunk (see RChunkLoader) and split it into batches (see RBatchLoader) + /// \brief Create training batches by first loading a chunk (see RChunkLoader) and split it into batches (see + /// RBatchLoader) void CreateTrainBatches() { - fTrainingEpochActive = true; + fTrainingEpochActive = true; if (fLoadEager) { if (fSampleType == "") { fTensorOperators->ShuffleTensor(fSampledTrainingDataset, fTrainingDataset); @@ -253,10 +253,10 @@ public: else { fTrainingSampler->Sampler(fSampledTrainingDataset); } - + fTrainingBatchLoader->CreateBatches(fSampledTrainingDataset, 1); } - + else { fChunkLoader->CreateTrainingChunksIntervals(); fTrainingChunkNum = 0; @@ -266,10 +266,11 @@ public: } } - /// \brief Creates validation batches by first loading a chunk (see RChunkLoader), and then split it into batches (see RBatchLoader) + /// \brief Creates validation batches by first loading a chunk (see RChunkLoader), and then split it into batches + /// (see RBatchLoader) void CreateValidationBatches() { - fValidationEpochActive = true; + fValidationEpochActive = true; if (fLoadEager) { if (fSampleType == "") { fTensorOperators->ShuffleTensor(fSampledValidationDataset, fValidationDataset); @@ -278,7 +279,7 @@ public: else { 
fValidationSampler->Sampler(fSampledValidationDataset); } - + fValidationBatchLoader->CreateBatches(fSampledValidationDataset, 1); } @@ -294,28 +295,28 @@ public: /// \brief Loads a training batch from the queue RFlat2DMatrix GetTrainBatch() { - if (!fLoadEager) { - auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); - - // load the next chunk if the queue is empty - if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { - fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); - std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; - fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); - fTrainingChunkNum++; - } - } - // Get next batch if available - return fTrainingBatchLoader->GetBatch(); + if (!fLoadEager) { + auto batchQueue = fTrainingBatchLoader->GetNumBatchQueue(); + + // load the next chunk if the queue is empty + if (batchQueue < 1 && fTrainingChunkNum < fNumTrainingChunks) { + fChunkLoader->LoadTrainingChunk(fTrainChunkTensor, fTrainingChunkNum); + std::size_t lastTrainingBatch = fNumTrainingChunks - fTrainingChunkNum; + fTrainingBatchLoader->CreateBatches(fTrainChunkTensor, lastTrainingBatch); + fTrainingChunkNum++; + } + } + // Get next batch if available + return fTrainingBatchLoader->GetBatch(); } /// \brief Loads a validation batch from the queue RFlat2DMatrix GetValidationBatch() { - if (!fLoadEager) { + if (!fLoadEager) { auto batchQueue = fValidationBatchLoader->GetNumBatchQueue(); - // load the next chunk if the queue is empty + // load the next chunk if the queue is empty if (batchQueue < 1 && fValidationChunkNum < fNumValidationChunks) { fChunkLoader->LoadValidationChunk(fValidationChunkTensor, fValidationChunkNum); std::size_t lastValidationBatch = fNumValidationChunks - fValidationChunkNum; diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx index 8cd69afb99410..1a49e3cad2b99 100644 --- 
a/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RBatchLoader.hxx @@ -44,7 +44,7 @@ private: // needed for calculating the total number of batch columns when vectors columns are present std::vector fCols; std::vector fVecSizes; - std::size_t fSumVecSizes; + std::size_t fSumVecSizes; std::size_t fNumColumns; std::size_t fNumEntries; bool fDropRemainder; @@ -53,7 +53,7 @@ private: std::size_t fNumLeftoverBatches; std::size_t fNumBatches; std::size_t fLeftoverBatchSize; - + bool fIsActive = false; std::mutex fBatchLock; @@ -70,18 +70,14 @@ private: std::unique_ptr fSecondaryLeftoverBatch; public: - RBatchLoader(std::size_t batchSize, const std::vector &cols, const std::vector &vecSizes = {}, - std::size_t numEntries = 0, bool dropRemainder = false) - : fBatchSize(batchSize), - fCols(cols), - fVecSizes(vecSizes), - fNumEntries(numEntries), - fDropRemainder(dropRemainder) + RBatchLoader(std::size_t batchSize, const std::vector &cols, + const std::vector &vecSizes = {}, std::size_t numEntries = 0, bool dropRemainder = false) + : fBatchSize(batchSize), fCols(cols), fVecSizes(vecSizes), fNumEntries(numEntries), fDropRemainder(dropRemainder) { fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0); fNumColumns = fCols.size() + fSumVecSizes - fVecSizes.size(); - + if (fBatchSize == 0) { fBatchSize = fNumEntries; } @@ -98,10 +94,9 @@ public: else { fNumBatches = fNumFullBatches + fNumLeftoverBatches; } - + fPrimaryLeftoverBatch = std::make_unique(); fSecondaryLeftoverBatch = std::make_unique(); - } public: @@ -158,8 +153,7 @@ public: /// \brief Creating the batches from a chunk and add them to the queue. 
/// \param[in] chunkTensor Tensor with the data from the chunk /// \param[in] lastbatch Check if the batch in the chunk is the last one - void - CreateBatches(RFlat2DMatrix &chunkTensor, std::size_t lastbatch) + void CreateBatches(RFlat2DMatrix &chunkTensor, std::size_t lastbatch) { std::size_t ChunkSize = chunkTensor.GetRows(); std::size_t NumCols = chunkTensor.GetCols(); @@ -194,8 +188,8 @@ public: // copy LeftoverBatch to end of fPrimaryLeftoverBatch and add it to the batch vector if (emptySlots == LeftoverBatchSize) { auto copy = std::make_unique(fBatchSize, fNumColumns); - std::copy(fPrimaryLeftoverBatch->GetData(), - fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData()); + std::copy(fPrimaryLeftoverBatch->GetData(), fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), + copy->GetData()); batches.emplace_back(std::move(copy)); // reset fPrimaryLeftoverBatch and fSecondaryLeftoverBatch @@ -214,13 +208,12 @@ public: // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverBatch fSecondaryLeftoverBatch->Resize(LeftoverBatchSize - emptySlots, NumCols); std::copy(LeftoverBatch.GetData() + (emptySlots * NumCols), - LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols), - fSecondaryLeftoverBatch->GetData()); + LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols), fSecondaryLeftoverBatch->GetData()); // add fPrimaryLeftoverBatch to the batch vector auto copy = std::make_unique(fBatchSize, fNumColumns); - std::copy(fPrimaryLeftoverBatch->GetData(), - fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData()); + std::copy(fPrimaryLeftoverBatch->GetData(), fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), + copy->GetData()); batches.emplace_back(std::move(copy)); // exchange fPrimaryLeftoverBatch and fSecondaryLeftoverBatch @@ -252,7 +245,7 @@ public: std::size_t GetNumBatches() { return fNumBatches; } std::size_t GetNumEntries() { return fNumEntries; } - std::size_t 
GetNumRemainderRows() { return fLeftoverBatchSize; } + std::size_t GetNumRemainderRows() { return fLeftoverBatchSize; } std::size_t GetNumBatchQueue() { return fBatchQueue.size(); } }; diff --git a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx index 094d29f423633..7043d5458c318 100644 --- a/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx +++ b/tmva/tmva/inc/TMVA/BatchGenerator/RChunkConstructor.hxx @@ -180,7 +180,7 @@ struct RChunkConstructor { } ////////////////////////////////////////////////////////////////////////// - /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry. + /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry. void CreateChunksIntervals() { @@ -219,7 +219,7 @@ struct RChunkConstructor { } ////////////////////////////////////////////////////////////////////////// - /// \brief Fills a vector with the size of every chunk from the dataset + /// \brief Fills a vector with the size of every chunk from the dataset void SizeOfChunks() {