diff --git a/data/advanced_datasets.py b/data/advanced_datasets.py
index 0b69512c..7257f4c8 100644
--- a/data/advanced_datasets.py
+++ b/data/advanced_datasets.py
@@ -19,6 +19,7 @@ def __init__(
         queue_size: int = 2,
         max_images_per_example: int = 4,
         max_images_per_knapsack: int = 18,
+        dataset_subsample_step: int = 1,
     ):
         self.dataset = dataset
         self.max_sample_length = max_sample_length
@@ -33,6 +34,7 @@ def __init__(
         self._average_length_per_sample = (
             self.dataset.mp_image_token_length + 198
         ) # 198 is the average tokens for the cauldron dataset
+        self.dataset_subsample_step = dataset_subsample_step
 
     def __len__(self):
         return int(
@@ -110,6 +112,8 @@ def _producer(
             buffer, buffer_len = [], 0
             while buffer_len < self.max_length:
                 try:
+                    for _ in range(self.dataset_subsample_step - 1):
+                        _ = next(iterator)
                     sample = next(iterator)
                 except StopIteration:
                     if self.infinite:
diff --git a/models/config.py b/models/config.py
index 749c9c1d..2eb132c1 100644
--- a/models/config.py
+++ b/models/config.py
@@ -72,9 +72,10 @@ class TrainConfig:
     max_sample_length: int = 4096
     compile: bool = False
     resume_from_vlm_checkpoint: bool = False # Indicate if the training should be resumed from a checkpoint of the whole VLM or you want to start from scratch
-    train_dataset_path: str = 'HuggingFaceM4/FineVision_concat_shuffled_2'
+    train_dataset_path: str = 'HuggingFaceM4/FineVisionMax'
     train_dataset_name: tuple[str, ...] = ("default", ) #('allava_laion', 'allava_vflan', 'cambrian(filtered)_processed', 'LLaVA_Instruct_150K', 'mmevol', 'sharegpt4o', 'sharegpt4v(coco)', 'sharegpt4v(knowledge)', 'sharegpt4v(llava)', 'sharegpt4v(sam)') # 'vision_flan(filtered)', 'lvis_instruct4v',
     stream_dataset: bool = True
+    dataset_subsample_step: int = 3
     relevance_min_rating: int = 1
     image_correspondence_min_rating: int = 1
     visual_dependency_min_rating: int = 1
@@ -82,6 +83,6 @@ class TrainConfig:
     wandb_entity: str = "HuggingFace" # Indicate the entity to log to in wandb
     log_wandb: bool = True
     use_lmms_eval: bool = True # Use lmms-eval for evaluation
-    lmms_eval_tasks: str = 'mmstar,mmmu_val,ocrbench,textvqa_val,docvqa_val,scienceqa,mme,infovqa_val,chartqa' # Pass additional task as one string, seperated by commas without spaces (e.g. 'mmstar,mmmu,ocrbench')
+    lmms_eval_tasks: str = 'mmstar,mmmu_val,ocrbench,textvqa_val,docvqa_val,scienceqa,mme,infovqa_val,chartqa,ai2d' # Pass additional tasks as one string, separated by commas without spaces (e.g. 'mmstar,mmmu,ocrbench')
     lmms_eval_limit: float = None
     lmms_eval_batch_size: int = 64
diff --git a/train.py b/train.py
index 0e2ed58b..78f20a6a 100644
--- a/train.py
+++ b/train.py
@@ -197,10 +197,10 @@ def get_dataloaders(train_cfg, vlm_cfg):
     )
 
     train_dataset = ConstantLengthDataset(train_dataset, infinite=False, max_sample_length=train_cfg.max_sample_length, seq_length=vlm_cfg.lm_max_length, num_of_sequences=train_cfg.batch_size*4, queue_size=8,
-                                          max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack)
+                                          max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack, dataset_subsample_step=train_cfg.dataset_subsample_step)
 
     val_dataset = ConstantLengthDataset(val_dataset, infinite=False, max_sample_length=train_cfg.max_sample_length, seq_length=vlm_cfg.lm_max_length, num_of_sequences=train_cfg.batch_size*4, queue_size=8,
-                                          max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack)
+                                          max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack, dataset_subsample_step=train_cfg.dataset_subsample_step)
 
     # Create collators
     vqa_collator = VQACollator(tokenizer, vlm_cfg.lm_max_length)
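For reference, the new dataset_subsample_step works by discarding dataset_subsample_step - 1 samples from the stream before each sample the producer keeps, so a step of N keeps every Nth example, i.e. roughly a 1/N subsample of the streamed dataset (with the new default of 3, about a third of the data is used). A minimal standalone sketch of that behaviour; the subsample helper below is illustrative only, not part of the patch:

    # Illustrative only: mirrors the skip loop added to ConstantLengthDataset._producer.
    def subsample(iterable, step: int = 1):
        """Yield every `step`-th item, dropping the `step - 1` items in between."""
        iterator = iter(iterable)
        while True:
            try:
                for _ in range(step - 1):  # skip step - 1 samples, as in _producer
                    _ = next(iterator)
                yield next(iterator)
            except StopIteration:
                # Exhausted mid-skip or mid-draw: stop, matching the producer's
                # handling when infinite=False.
                return

    print(list(subsample(range(10), step=3)))  # -> [2, 5, 8]
    print(list(subsample(range(10), step=1)))  # -> [0, 1, ..., 9] (default keeps everything)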