keep separate 'path' and 'filenames' for element dataset; allows element datasets to be imported when moved
LonnekeScheffer committed May 28, 2024
1 parent 114499f commit 03e506c
Showing 22 changed files with 148 additions and 138 deletions.
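Note: element datasets (Sequence-/ReceptorDatasets) previously recorded full batch-file paths, which broke re-import once the exported folder was moved; this commit stores bare file names plus a separate batchfiles_path and rebuilds full paths on demand. The block below is a minimal illustrative sketch of that idea, not the actual immuneML classes:

```python
# Minimal sketch of the idea behind this commit; names mirror the diff below but this is
# illustrative code only, not the real immuneML implementation.
from pathlib import Path
from typing import List


class ElementDatasetSketch:
    def __init__(self, filenames: List[str], batchfiles_path: Path):
        # batch files are stored as bare names only; their folder is tracked separately
        assert all(Path(name).name == str(name) for name in filenames), "filenames must not contain paths"
        self.filenames = [Path(name) for name in filenames]
        self.batchfiles_path = Path(batchfiles_path)

    def get_filenames_full_path(self) -> List[Path]:
        # full paths are rebuilt on demand, so moving the folder only requires updating batchfiles_path
        return [self.batchfiles_path / name for name in self.filenames]


dataset = ElementDatasetSketch(["batch1.tsv", "batch2.tsv"], Path("/original/export"))
dataset.batchfiles_path = Path("/moved/export")   # e.g. set by the importer after the folder was moved
print(dataset.get_filenames_full_path())          # [/moved/export/batch1.tsv, /moved/export/batch2.tsv]
```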
37 changes: 20 additions & 17 deletions immuneML/IO/dataset_export/ImmuneMLExporter.py
@@ -45,15 +45,18 @@ def export(dataset: Dataset, path: Path, number_of_processes: int = 1):
yaml.dump(yaml_dict, file)

elif isinstance(dataset, SequenceDataset) or isinstance(dataset, ReceptorDataset):
exported_dataset.set_filenames(ImmuneMLExporter._export_receptors(exported_dataset.get_filenames(), path))
ImmuneMLExporter._export_receptors(exported_dataset.get_filenames_full_path(), path_new=path)
exported_dataset.batchfiles_path = path

exported_dataset.dataset_file = ImmuneMLExporter._export_element_metadata(dataset, path)
file_path = exported_dataset.dataset_file

version_path = path / "info.txt"
with version_path.open("w") as file:
file.writelines(f"immuneML_version: {Constants.VERSION}\n"
f"Python_version: {platform.python_version()}\n")

return exported_dataset
return file_path

@staticmethod
def _parse_val_for_export(val):
@@ -82,19 +85,22 @@ def _export_metadata(dataset, metadata_folder_path: Path, dataset_filename, repe
if dataset.metadata_file is None or not dataset.metadata_file.is_file():
return None

metadata_file = metadata_folder_path / f"{dataset.name}_metadata.csv"
new_metadata_file = metadata_folder_path / f"{dataset.name}_metadata.csv"

if not metadata_file.is_file():
shutil.copyfile(dataset.metadata_file, metadata_file)
if not new_metadata_file.is_file():
shutil.copyfile(dataset.metadata_file, new_metadata_file)

ImmuneMLExporter._update_repertoire_paths_in_metadata(metadata_file, repertoires_path)
ImmuneMLExporter._add_dataset_to_metadata(metadata_file, dataset_filename)
ImmuneMLExporter._update_repertoire_paths_in_metadata(new_metadata_file, repertoires_path)
ImmuneMLExporter._add_dataset_to_metadata(new_metadata_file, dataset_filename)

old_metadata_file = metadata_folder_path / "metadata.csv"
if old_metadata_file.is_file():
# if true, it means the metadata file is exported to the same location
# overwrite the original dataset.metadata file with the new file to prevent 'missing metadata' bug
os.remove(str(old_metadata_file))
dataset.metadata_file = new_metadata_file

return metadata_file
return new_metadata_file

@staticmethod
def _update_repertoire_paths_in_metadata(metadata_file: Path, repertoires_path: Path):
@@ -111,12 +117,9 @@ def _add_dataset_to_metadata(metadata_file: Path, dataset_filename: str):
metadata.to_csv(metadata_file, mode="a", index=False)

@staticmethod
def _export_receptors(filenames_old: List[Path], path: Path) -> List[Path]:
filenames_new = []
for filename_old in filenames_old:
filename_new = ImmuneMLExporter._copy_if_exists(filename_old, path)
filenames_new.append(filename_new)
return filenames_new
def _export_receptors(filepaths: List[Path], path_new: Path) -> List[Path]:
for filepath_old in filepaths:
ImmuneMLExporter._copy_if_exists(filepath_old, path_new)

@staticmethod
def _export_repertoires(repertoires: List[Repertoire], repertoires_path: Path) -> List[Repertoire]:
@@ -131,11 +134,11 @@ def _export_repertoires(repertoires: List[Repertoire], repertoires_path: Path) -
return new_repertoires

@staticmethod
def _copy_if_exists(old_file: Path, path: Path):
def _copy_if_exists(old_file: Path, new_folder: Path):
if old_file is not None and old_file.is_file():
new_file = path / old_file.name
new_file = new_folder / old_file.name
if not new_file.is_file():
shutil.copyfile(old_file, new_file)
return new_file
else:
raise RuntimeError(f"{ImmuneMLExporter.__name__}: tried exporting file {old_file}, but it does not exist.")
raise RuntimeError(f"{ImmuneMLExporter.__name__}: tried exporting file {old_file} to new folder {new_folder}, but it does not exist.")
20 changes: 3 additions & 17 deletions immuneML/IO/dataset_import/ImmuneMLImport.py
@@ -43,7 +43,8 @@ class ImmuneMLImport(DataImport):
- path (str): The path to the previously created dataset file. This file should have an '.yaml' extension. If the
path has not been specified, immuneML attempts to load the dataset from a specified metadata file (only for
RepertoireDatasets).
RepertoireDatasets). For Sequence- and ReceptorDatasets, all additional files are expected to be located in
the same folder as the dataset '.yaml' file.
- metadata_file (str): An optional metadata file for a RepertoireDataset. If specified, the RepertoireDataset
metadata will be updated to the newly specified metadata without otherwise changing the Repertoire objects
@@ -78,8 +79,6 @@ def import_dataset(params: dict, dataset_name: str) -> Dataset:

if isinstance(dataset, RepertoireDataset):
dataset = ImmuneMLImport._update_repertoire_paths(iml_params, dataset)
else:
dataset = ImmuneMLImport._update_receptor_paths(iml_params, dataset)

return dataset

@@ -100,7 +99,7 @@ def _import_from_path(iml_params):
dataset_dict['metadata_file'] = iml_params.path.parent / Path(dataset_dict['metadata_file']).name

if dataset_class.__name__ in ['ReceptorDataset', 'SequenceDataset']:
dataset_dict['filenames'] = [iml_params.path.parent / filename for filename in dataset_dict['filenames']]
dataset_dict['batchfiles_path'] = iml_params.path.parent
del dataset_dict['type_dict']

dataset = dataset_class.build(**{**dataset_dict, 'dataset_file': iml_params.path})
@@ -135,19 +134,6 @@ def _update_repertoire_paths(iml_params, dataset):
def _discover_dataset_dir(pickle_params):
return pickle_params.path.parent

@staticmethod
def _update_receptor_paths(pickle_params, dataset: ElementDataset):
dataset_dir = ImmuneMLImport._discover_dataset_dir(pickle_params)

if len(list(dataset_dir.glob("*.npy"))) == len(dataset.get_filenames()):
path = dataset_dir
new_filenames = []
for file in dataset.get_filenames():
new_filenames.append(path / file.name)
dataset.set_filenames(new_filenames)

return dataset

@staticmethod
def _discover_repertoire_path(params, dataset):
dataset_dir = ImmuneMLImport._discover_dataset_dir(params)
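Why moved datasets now import cleanly: _import_from_path above simply points batchfiles_path at the folder that contains the dataset '.yaml', so whatever folder was recorded at export time no longer matters. A simplified stand-alone sketch (paths and batch file names below are assumptions for illustration):

```python
# Simplified sketch of the path resolution done in ImmuneMLImport._import_from_path above.
from pathlib import Path

dataset_yaml = Path("/new/location/dataset_d1.yaml")        # dataset folder was moved after export
dataset_dict = {"filenames": ["batch1.tsv", "batch2.tsv"],  # bare names read from the yaml
                "batchfiles_path": "/old/export/folder"}    # stale folder recorded at export time

dataset_dict["batchfiles_path"] = dataset_yaml.parent       # batch files are expected next to the yaml
full_paths = [dataset_yaml.parent / name for name in dataset_dict["filenames"]]
print(full_paths)  # [/new/location/batch1.tsv, /new/location/batch2.tsv]
```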
40 changes: 28 additions & 12 deletions immuneML/data_model/dataset/ElementDataset.py
@@ -16,45 +16,61 @@ class ElementDataset(Dataset):
"""

@classmethod
def build(cls, dataset_file: Path, types: dict = None, filenames: list = None, **kwargs):
def build(cls, dataset_file: Path, types: dict = None, filenames: list = None, batchfiles_path: Path = None, **kwargs):
if not Path(dataset_file).exists():
metadata = {
'type_dict': {key: SequenceSet.TYPE_TO_STR[val] for key, val in types.items()},
'dataset_class': cls.__name__, 'element_class_name': kwargs['element_class_name'],
'filenames': [str(file) for file in filenames]
'filenames': [str(file) for file in filenames],
'batchfiles_path': str(batchfiles_path)
}
write_yaml(dataset_file, metadata)
return cls(**{**kwargs, 'dataset_file': dataset_file, 'filenames': filenames})
return cls(**{**kwargs, 'dataset_file': dataset_file, 'filenames': filenames, 'batchfiles_path': batchfiles_path})

@staticmethod
def parse_filenames(filenames):
filenames = [Path(filename) for filename in filenames] if filenames is not None else []
for filename in filenames:
assert str(filename) == str(filename.name), "ElementDataset: filenames must only contain the names of files, not the full path. " \
"To supply the name of the folder where the files are stored, use parameter batchfiles_path. " \
f"This error was caused by the following filename: {filename} (should be {filename.name})"

return filenames

def __init__(self, labels: dict = None, encoded_data: EncodedData = None, filenames: list = None,
identifier: str = None, dataset_file: Path = None,
batchfiles_path: Path = None, identifier: str = None, dataset_file: Path = None,
file_size: int = 100000, name: str = None, element_class_name: str = None,
element_ids: list = None, example_weights: list = None,
buffer_type=None):
super().__init__(encoded_data, name, identifier if identifier is not None else uuid4().hex, labels, example_weights)
self.filenames = filenames if filenames is not None else []
self.filenames = [Path(filename) for filename in self.filenames]
self.batchfiles_path = Path(batchfiles_path)
self.filenames = ElementDataset.parse_filenames(filenames)
if buffer_type is None:
buffer_type = make_buffer_type_from_dataset_file(Path(dataset_file))
self.element_generator = ElementGenerator(self.filenames, file_size, element_class_name, buffer_type)

self.element_generator = ElementGenerator([self.batchfiles_path / filename for filename in self.filenames],
file_size, element_class_name, buffer_type)
self.file_size = file_size
self.element_ids = element_ids
self.element_class_name = element_class_name
self.dataset_file = Path(dataset_file)

def get_data(self, batch_size: int = 10000, return_objects: bool = True):
self.element_generator.file_list = self.filenames
self.element_generator.file_list = self.get_filenames_full_path()
return self.element_generator.build_element_generator(return_objects=return_objects)

def get_batch(self, batch_size: int = 10000):
self.element_generator.file_list = self.filenames
self.element_generator.file_list = self.get_filenames_full_path()
return self.element_generator.build_batch_generator()

def get_filenames_full_path(self):
return [self.batchfiles_path / filename for filename in self.filenames]

def get_filenames(self):
return self.filenames

def set_filenames(self, filenames):
self.filenames = filenames
self.filenames = ElementDataset.parse_filenames(filenames)

def get_example_count(self):
return len(self.get_example_ids())
@@ -102,12 +118,12 @@ def make_subset(self, example_indices, path, dataset_type: str):

types = read_yaml(self.dataset_file)['type_dict']

new_dataset = self.__class__.build(labels=self.labels, file_size=self.file_size, filenames=batch_filenames,
new_dataset = self.__class__.build(labels=self.labels, file_size=self.file_size,
filenames=batch_filenames, batchfiles_path=path,
element_class_name=self.element_generator.element_class_name,
dataset_file=path / f"{dataset_name}.yaml", types=types,
identifier=new_dataset_id, name=dataset_name)

# todo check if this is necessary
original_example_weights = self.get_example_weights()
if original_example_weights is not None:
new_dataset.set_example_weights([original_example_weights[i] for i in example_indices])
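For reference, a hedged sketch of the metadata that ElementDataset.build() above would now write via write_yaml; only the keys come from the diff, the concrete type_dict entries and paths are placeholders:

```python
# Illustrative metadata dict as written by ElementDataset.build() after this change.
metadata = {
    "type_dict": {"sequence_aa": "str"},           # placeholder field/type mapping
    "dataset_class": "SequenceDataset",
    "element_class_name": "ReceptorSequence",
    "filenames": ["batch1.tsv", "batch2.tsv"],     # bare file names only
    "batchfiles_path": "/path/to/export",          # folder stored separately; re-pointed on import if moved
}
```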
17 changes: 9 additions & 8 deletions immuneML/data_model/dataset/ReceptorDataset.py
@@ -22,27 +22,27 @@ def build_from_objects(cls, receptors: List[Receptor], file_size: int, path: Pat
labels: dict = None):

file_count = math.ceil(len(receptors) / file_size)
file_names = [
path / f"batch{''.join(['0' for i in range(1, len(str(file_count)) - len(str(index)) + 1)])}{index}.tsv"
for index in range(1, file_count + 1)]
filenames = [f"batch{''.join(['0' for i in range(1, len(str(file_count)) - len(str(index)) + 1)])}{index}.tsv"
for index in range(1, file_count + 1)]

receptor_dc, types = make_dynamic_seq_set_from_objs(receptors)

for index in range(file_count):
field_vals = get_receptor_attributes_for_bnp(receptors[index * file_size:(index + 1) * file_size], receptor_dc, types)
receptor_matrix = receptor_dc(**field_vals)
bnp_write_to_file(file_names[index], receptor_matrix)
bnp_write_to_file(path / filenames[index], receptor_matrix)

dataset_metadata = {'type_dict': {key: SequenceSet.TYPE_TO_STR[val] for key, val in types.items()},
'element_class_name': type(receptors[0]).__name__,
'dataset_class': 'ReceptorDataset',
'filenames': [str(file) for file in file_names]}
'filenames': filenames,
'batchfiles_path': str(path)}
metadata_filename = path / f'dataset_{name}.yaml'
write_yaml(metadata_filename, dataset_metadata)

return ReceptorDataset(filenames=file_names, file_size=file_size, name=name, labels=labels,
return ReceptorDataset(filenames=filenames, file_size=file_size, name=name, labels=labels,
element_class_name=type(receptors[0]).__name__ if len(receptors) > 0 else None,
dataset_file=metadata_filename,
dataset_file=metadata_filename, batchfiles_path=path,
buffer_type=bnp.io.delimited_buffers.get_bufferclass_for_datatype(receptor_dc,
delimiter='\t',
has_header=True))
@@ -57,7 +57,8 @@ def get_metadata(self, field_names: list, return_df: bool = False):
return pd.DataFrame(result) if return_df else result

def clone(self, keep_identifier: bool = False):
dataset = ReceptorDataset(self.labels, copy.deepcopy(self.encoded_data), copy.deepcopy(self.filenames),
dataset = ReceptorDataset(labels=self.labels, encoded_data=copy.deepcopy(self.encoded_data),
filenames=copy.deepcopy(self.filenames), batchfiles_path=copy.deepcopy(self.batchfiles_path),
file_size=self.file_size, dataset_file=copy.deepcopy(self.dataset_file),
name=self.name, element_class_name=self.element_generator.element_class_name)
if keep_identifier:
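Side note on the batch-name expression used in build_from_objects above (and in SequenceDataset below): it zero-pads the batch index to the width of the total batch count, and after this commit it yields bare names with no path prefix. A quick runnable check with an arbitrary example count:

```python
# Quick check of the batch-name expression from build_from_objects; file_count is illustrative.
file_count = 12
filenames = [f"batch{''.join(['0' for i in range(1, len(str(file_count)) - len(str(index)) + 1)])}{index}.tsv"
             for index in range(1, file_count + 1)]
print(filenames[0], filenames[9], filenames[11])  # batch01.tsv batch10.tsv batch12.tsv
```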
3 changes: 0 additions & 3 deletions immuneML/data_model/dataset/RepertoireDataset.py
@@ -78,9 +78,6 @@ def clone(self, keep_identifier: bool = False):
dataset.identifier = self.identifier
return dataset

def add_encoded_data(self, encoded_data: EncodedData):
self.encoded_data = encoded_data

def get_data(self, batch_size: int = 1):
return self.repertoires

14 changes: 7 additions & 7 deletions immuneML/data_model/dataset/SequenceDataset.py
@@ -28,8 +28,7 @@ def build_from_objects(cls, sequences: List[ReceptorSequence], file_size: int, p
file_count = math.ceil(len(sequences) / file_size)
PathBuilder.build(path)

file_names = [
path / f"batch{''.join(['0' for i in range(1, len(str(file_count)) - len(str(index)) + 1)])}{index}.tsv"
filenames = [f"batch{''.join(['0' for i in range(1, len(str(file_count)) - len(str(index)) + 1)])}{index}.tsv"
for index in range(1, file_count + 1)]

seq_set_dc, types = make_dynamic_seq_set_from_objs(sequences)
Expand All @@ -39,17 +38,18 @@ def build_from_objects(cls, sequences: List[ReceptorSequence], file_size: int, p
for field_name in types.keys()}
vals = prepare_values_for_bnp(vals, types)
sequence_matrix = seq_set_dc(**vals)
bnp_write_to_file(file_names[index], sequence_matrix)
bnp_write_to_file(path / filenames[index], sequence_matrix)

metadata = {
'type_dict': {key: SequenceSet.TYPE_TO_STR[val] for key, val in types.items()},
'dataset_class': 'SequenceDataset', 'element_class_name': ReceptorSequence.__name__,
'filenames': [str(file) for file in file_names]
'filenames': filenames
}
dataset_file = path / f'dataset_{name}.yaml'
write_yaml(dataset_file, metadata)

return SequenceDataset(filenames=file_names, file_size=file_size, name=name, labels=labels,
return SequenceDataset(filenames=filenames, batchfiles_path=path, file_size=file_size,
name=name, labels=labels,
element_class_name=ReceptorSequence.__name__, dataset_file=dataset_file,
buffer_type=bnp.io.delimited_buffers.get_bufferclass_for_datatype(seq_set_dc,
delimiter='\t',
@@ -78,8 +78,8 @@ def get_metadata(self, field_names: list, return_df: bool = False):

def clone(self, keep_identifier: bool = False):
dataset = SequenceDataset(labels=self.labels, encoded_data=copy.deepcopy(self.encoded_data),
filenames=copy.deepcopy(self.filenames), dataset_file=self.dataset_file,
file_size=self.file_size, name=self.name)
filenames=copy.deepcopy(self.filenames), batchfiles_path=copy.deepcopy(self.batchfiles_path),
dataset_file=self.dataset_file, file_size=self.file_size, name=self.name)
if keep_identifier:
dataset.identifier = self.identifier
dataset.element_ids = self.element_ids
11 changes: 5 additions & 6 deletions immuneML/data_model/receptor/ElementGenerator.py
@@ -100,27 +100,26 @@ def make_subset(self, example_indices: list, path: Path, dataset_type: str, data

example_indices.sort()

batch_filenames = self._prepare_batch_filenames(len(example_indices), path, dataset_type, dataset_identifier)
batch_filenames = self._prepare_batch_filenames(len(example_indices), dataset_type, dataset_identifier)

for index, batch in enumerate(self.build_batch_generator(return_objects=False)):
extracted_elements = self._extract_elements_from_batch(index, batch, example_indices, paired=paired)
elements = merge_dataclass_objects([elements, extracted_elements]) if elements else extracted_elements

if len(elements) >= tmp_file_size or len(elements) == len(example_indices):
bnp_write_to_file(batch_filenames[file_count - 1], elements[:tmp_file_size])
bnp_write_to_file(path / batch_filenames[file_count - 1], elements[:tmp_file_size])
file_count += 1
elements = elements[tmp_file_size:]

if len(elements) > 0:
bnp_write_to_file(batch_filenames[file_count - 1], elements)
bnp_write_to_file(path / batch_filenames[file_count - 1], elements)

return batch_filenames

def _prepare_batch_filenames(self, example_count: int, path: Path, dataset_type: str, dataset_identifier: str):
def _prepare_batch_filenames(self, example_count: int, dataset_type: str, dataset_identifier: str):
batch_count = math.ceil(example_count / self.file_size)
digits_count = len(str(batch_count)) + 1
filenames = [
path / f"{dataset_identifier}_{dataset_type}_batch{''.join(['0' for _ in range(digits_count - len(str(index)))])}{index}.tsv"
filenames = [f"{dataset_identifier}_{dataset_type}_batch{''.join(['0' for _ in range(digits_count - len(str(index)))])}{index}.tsv"
for index in range(batch_count)]
return filenames

(diffs for the remaining changed files not shown)
