chore: Support multi-GPU training via accelerate #5

Open · wants to merge 6 commits into main · Changes from 4 commits

1 change: 1 addition & 0 deletions docs/api/overview.md
@@ -27,6 +27,7 @@
- [train_colbert](../train/train-colbert)
- [train_sparse_embed](../train/train-sparse-embed)
- [train_splade](../train/train-splade)
- [Multi-GPU training via Accelerator](../train/multi-gpu)

## utils

1 change: 1 addition & 0 deletions docs/fine_tune/.pages
@@ -3,4 +3,5 @@ nav:
- colbert.md
- splade.md
- sparse_embed.md
- multi_gpu.md

64 changes: 64 additions & 0 deletions docs/fine_tune/multi_gpu.md

# Multi-GPU

Neural-Cherche supports multi-GPU training via [Accelerator](https://huggingface.co/docs/accelerate/package_reference/accelerator). Every neural-cherche model can be trained on multiple GPUs. Here is a tutorial.

```python
import torch
from accelerate import Accelerator
from datasets import Dataset
from torch.utils.data import DataLoader

from neural_cherche import models, train

if __name__ == "__main__":
    # Wrap the training code in a "__main__" guard to avoid multiprocessing
    # issues when running one process per GPU.
    accelerator = Accelerator()
    save_each_epoch = True

    model = models.SparseEmbed(
        model_name_or_path="distilbert-base-uncased",
        accelerate=True,
        device=accelerator.device,
    ).to(accelerator.device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # Dataset creation using HuggingFace Datasets library.
    dataset = Dataset.from_dict(
        {
            "anchors": ["anchor 1", "anchor 2", "anchor 3", "anchor 4"],
            "positives": ["positive 1", "positive 2", "positive 3", "positive 4"],
            "negatives": ["negative 1", "negative 2", "negative 3", "negative 4"],
        }
    )

    # Convert your dataset to a DataLoader.
    data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Wrap model, optimizer, and dataloader in accelerator.
    model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

    for epoch in range(2):
        for batch in data_loader:
            # Each batch is a dictionary keyed by the dataset columns.
            anchors, positives, negatives = (
                batch["anchors"],
                batch["positives"],
                batch["negatives"],
            )

            loss = train.train_sparse_embed(
                model=model,
                optimizer=optimizer,
                anchor=anchors,
                positive=positives,
                negative=negatives,
                threshold_flops=30,
                accelerator=accelerator,
            )

        if accelerator.is_main_process and save_each_epoch:
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                "checkpoint/epoch" + str(epoch),
            )

    # Save at the end of the training loop
    # We check to make sure that only the main process will export the model
    if accelerator.is_main_process:
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained("checkpoint")
```
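
The script is meant to be run through the Accelerate CLI so that one process is started per GPU, for example with `accelerate config` once and then `accelerate launch multi_gpu_training.py` (the file name is illustrative). Once training is done, the exported checkpoint can be reloaded on a single device; a minimal sketch, reusing the `checkpoint` directory written by the main process above (queries and documents are illustrative):

```python
from neural_cherche import models

# Reload the checkpoint exported at the end of the multi-GPU run.
model = models.SparseEmbed(
    model_name_or_path="checkpoint",
    device="cpu",
)

# Score a few illustrative query / document pairs with the reloaded model.
scores = model.scores(
    queries=["Sports", "Music"],
    documents=["Sports is great.", "Music is great."],
    batch_size=2,
)
print(scores)
```
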
Owner:
I added a clear example of how to create the dataset using the HuggingFace Datasets library.

@@ -0,0 +1,64 @@
# Multi-GPU (Accelerator)


Training any of the models on multiple GPUs via the Accelerate library is simple. You just need to modify the training loop in a few key ways:

```python
from neural_cherche import models, utils, train
import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator


# Wrap in a "__main__" guard to avoid multiprocessing issues
if __name__ == "__main__":
    accelerator = Accelerator()
    device = accelerator.device
    batch_size = 32
    epochs = 2
    save_on_epoch = True

    model = models.SparseEmbed(
        model_name_or_path="distilbert-base-uncased",
        device=device,
    ).to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # prepare your dataset -- this example uses a huggingface `datasets` object
    ...

    # Convert the data into a PyTorch dataloader for ease of preparation
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Wrap the model, optimizer, and data loader in the accelerator
    model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

    for epoch in range(epochs):
        for batch_id, batch_data in enumerate(data_loader):
            # Assuming batch_data is a tuple in the form (anchors, positives, negatives)
            anchors, positives, negatives = batch_data

            loss = train.train_sparse_embed(
                model=model,
                optimizer=optimizer,
                anchor=anchors,
                positive=positives,
                negative=negatives,
                threshold_flops=30,
                accelerator=accelerator,
            )

        if accelerator.is_main_process and save_on_epoch:
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                "checkpoint/epoch" + str(epoch),
            )

    # Save at the end of the training loop
    # We check to make sure that only the main process will export the model
    if accelerator.is_main_process:
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained("checkpoint", accelerator=True)
```
14 changes: 7 additions & 7 deletions neural_cherche/models/base.py
Owner:
I ran into some trouble with the extra position_ids parameter with the DistilBERT pre-trained checkpoint, but not with the all-mpnet-base-v2 pre-trained checkpoint, so I think it would be cool to keep the legacy code and add an accelerate attribute to the models.

import json
import os
from abc import ABC, abstractmethod

import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForMaskedLM, AutoTokenizer


class Base(ABC, torch.nn.Module):
    """Base class from which all models inherit.

    Parameters
    ----------
    model_name_or_path
        Path to the model or the model name.
    device
        Device to use for the model. CPU or CUDA.
    extra_files_to_load
        List of extra files to load.
    accelerate
        Use HuggingFace Accelerate.
    kwargs
        Additional parameters to the model.
    """

    def __init__(
        self,
        model_name_or_path: str,
        device: str = None,
        extra_files_to_load: list[str] = [],
        accelerate: bool = False,
        **kwargs,
    ) -> None:
        """Initialize the model."""
        super(Base, self).__init__()

        if device is not None:
            self.device = device

        elif torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        self.accelerate = accelerate

        os.environ["TRANSFORMERS_CACHE"] = "."
        self.model = AutoModelForMaskedLM.from_pretrained(
            model_name_or_path, cache_dir="./", **kwargs
        ).to(self.device)

        # Download linear layer if exists
        for file in extra_files_to_load:
            try:
                _ = hf_hub_download(model_name_or_path, filename=file, cache_dir=".")
            except:
                pass

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, device=self.device, cache_dir="./", **kwargs
        )

        self.model.config.output_hidden_states = True

        if os.path.exists(model_name_or_path):
            # Local checkpoint
            self.model_folder = model_name_or_path
        else:
            # HuggingFace checkpoint
            model_folder = os.path.join(
                f"models--{model_name_or_path}".replace("/", "--"), "snapshots"
            )
            snapshot = os.listdir(model_folder)[-1]
            self.model_folder = os.path.join(model_folder, snapshot)

        self.query_pad_token = self.tokenizer.mask_token
        self.original_pad_token = self.tokenizer.pad_token

    def _encode_accelerate(self, texts: list[str], **kwargs) -> tuple[torch.Tensor]:
        """Encode sentences with multiples gpus.

        Parameters
        ----------
        texts
            List of sentences to encode.

        References
        ----------
        [Accelerate issue.](https://github.com/huggingface/accelerate/issues/97)
        """
        encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to(
            self.device
        )

        position_ids = (
            torch.arange(0, encoded_input["input_ids"].size(1))
            .expand((len(texts), -1))
            .to(self.device)
        )

        output = self.model(**encoded_input, position_ids=position_ids)
        return output.logits, output.hidden_states[-1]

    def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode sentences.

        Parameters
        ----------
        texts
            List of sentences to encode.
        """
        if self.accelerate:
            return self._encode_accelerate(texts, **kwargs)

        encoded_input = self.tokenizer.batch_encode_plus(
            texts, return_tensors="pt", **kwargs
        )

        if self.device != "cpu":
            encoded_input = {
                key: value.to(self.device) for key, value in encoded_input.items()
            }

        output = self.model(**encoded_input)
        return output.logits, output.hidden_states[-1]

    @abstractmethod
    def forward(self, *args, **kwargs):
        """Pytorch forward method."""
        pass

    @abstractmethod
    def encode(self, *args, **kwargs):
        """Encode documents."""
        pass

    @abstractmethod
    def scores(self, *args, **kwargs):
        """Compute scores."""
        pass

    @abstractmethod
    def save_pretrained(self, path: str):
        """Save model the model."""
        pass

    def save_tokenizer_accelerate(self, path: str) -> None:
        """Save tokenizer when using accelerate."""
        tokenizer_config = {
            k: v for k, v in self.tokenizer.__dict__.items() if k != "device"
        }
        tokenizer_config_file = os.path.join(path, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as file:
            json.dump(tokenizer_config, file, ensure_ascii=False, indent=4)

        # dump vocab
        self.tokenizer.save_vocabulary(path)

        # save special tokens
        special_tokens_file = os.path.join(path, "special_tokens_map.json")
        with open(special_tokens_file, "w", encoding="utf-8") as file:
            json.dump(
                self.tokenizer.special_tokens_map,
                file,
                ensure_ascii=False,
                indent=4,
            )

Owner:
Here is the base class, updated with the new save_tokenizer_accelerate method and the accelerate attribute.
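
To make the flow concrete, here is a minimal sketch (not part of the diff) of how the new attribute is meant to be used, reusing the model name from the tutorial above: `accelerate=True` makes `_encode` delegate to `_encode_accelerate`, and `save_pretrained` then goes through `save_tokenizer_accelerate` instead of the tokenizer's own `save_pretrained`:

```python
from accelerate import Accelerator
from neural_cherche import models

accelerator = Accelerator()

# accelerate=True is stored on the Base class; _encode then delegates to
# _encode_accelerate, which builds explicit position_ids for the forward pass.
model = models.SparseEmbed(
    model_name_or_path="distilbert-base-uncased",
    accelerate=True,
    device=accelerator.device,
).to(accelerator.device)

# Because self.accelerate is True, save_pretrained routes the tokenizer
# through save_tokenizer_accelerate, which drops the non-serialisable
# "device" key before dumping tokenizer_config.json.
model.save_pretrained("checkpoint")
```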

@@ -77,16 +77,16 @@ def _encode(self, texts: list[str], **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
texts
List of sentences to encode.
"""
encoded_input = self.tokenizer.batch_encode_plus(
texts, return_tensors="pt", **kwargs
encoded_input = self.tokenizer(texts, return_tensors="pt", **kwargs).to(
self.device
)

if self.device != "cpu":
encoded_input = {
key: value.to(self.device) for key, value in encoded_input.items()
}
# Must hardcode position_ids to avoid a bug with accelerate multi-GPU
seq_len = encoded_input["input_ids"].size(1)
position_ids = torch.arange(0, seq_len).expand((len(texts), -1)).to(self.device)

output = self.model(**encoded_input)
# Pass both the inputs and position_ids to the model
output = self.model(**encoded_input, position_ids=position_ids)
return output.logits, output.hidden_states[-1]

@abstractmethod
29 changes: 27 additions & 2 deletions neural_cherche/models/colbert.py
import json
import os

import torch

from .. import utils
from .base import Base

__all__ = ["ColBERT"]


class ColBERT(Base):
    """ColBERT model.

    Parameters
    ----------
    model_name_or_path
        Path to the model or the model name.
    embedding_size
        Size of the embeddings in output of ColBERT model.
    device
        Device to use for the model. CPU or CUDA.
    accelerate
        Use HuggingFace Accelerate.
    kwargs
        Additional parameters to the SentenceTransformer model.

    Examples
    --------
    >>> from neural_cherche import models
    >>> import torch

    >>> _ = torch.manual_seed(42)

    >>> queries = ["Berlin", "Paris", "London"]

    >>> documents = [
    ...     "Berlin is the capital of Germany",
    ...     "Paris is the capital of France and France is in Europe",
    ...     "London is the capital of England",
    ... ]

    >>> encoder = models.ColBERT(
    ...     model_name_or_path="sentence-transformers/all-mpnet-base-v2",
    ...     embedding_size=128,
    ...     max_length_query=32,
    ...     max_length_document=350,
    ... )

    >>> scores = encoder.scores(
    ...    queries=queries,
    ...    documents=documents,
    ... )

    >>> scores
    tensor([22.9325, 19.8296, 20.8019])

    >>> _ = encoder.save_pretrained("checkpoint", accelerate=False)

    >>> encoder = models.ColBERT(
    ...     model_name_or_path="checkpoint",
    ...     embedding_size=64,
    ...     device="cpu",
    ... )

    >>> scores = encoder.scores(
    ...    queries=queries,
    ...    documents=documents,
    ... )

    >>> scores
    tensor([22.9325, 19.8296, 20.8019])

    >>> embeddings = encoder(
    ...     texts=queries,
    ...     query_mode=True
    ... )

    >>> embeddings["embeddings"].shape
    torch.Size([3, 32, 128])

    >>> embeddings = encoder(
    ...     texts=queries,
    ...     query_mode=False
    ... )

    >>> embeddings["embeddings"].shape
    torch.Size([3, 350, 128])

    """

    def __init__(
        self,
        model_name_or_path: str,
        embedding_size: int = 128,
        device: str = None,
        max_length_query: int = 32,
        max_length_document: int = 350,
        accelerate: bool = False,
        **kwargs,
    ) -> None:
        """Initialize the model."""
        super(ColBERT, self).__init__(
            model_name_or_path=model_name_or_path,
            device=device,
            extra_files_to_load=["linear.pt", "metadata.json"],
            accelerate=accelerate,
            **kwargs,
        )

        self.max_length_query = max_length_query
        self.max_length_document = max_length_document
        self.embedding_size = embedding_size

        if os.path.exists(os.path.join(self.model_folder, "linear.pt")):
            linear = torch.load(
                os.path.join(self.model_folder, "linear.pt"), map_location=self.device
            )
            self.embedding_size = linear["weight"].shape[0]
            in_features = linear["weight"].shape[1]
        else:
            with torch.no_grad():
                _, embeddings = self._encode(texts=["test"])
                in_features = embeddings.shape[2]

        self.linear = torch.nn.Linear(
            in_features=in_features,
            out_features=self.embedding_size,
            bias=False,
            device=self.device,
        )

        if os.path.exists(os.path.join(self.model_folder, "metadata.json")):
            with open(os.path.join(self.model_folder, "metadata.json"), "r") as f:
                metadata = json.load(f)
            self.max_length_document = metadata["max_length_document"]
            self.max_length_query = metadata["max_length_query"]

        if os.path.exists(os.path.join(self.model_folder, "linear.pt")):
            self.linear.load_state_dict(linear)

    def encode(
        self,
        texts: list[str],
        truncation: bool = True,
        add_special_tokens: bool = False,
        query_mode: bool = True,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Encode documents

        Parameters
        ----------
        texts
            List of sentences to encode.
        truncation
            Truncate the inputs.
        add_special_tokens
            Add special tokens.
        max_length
            Maximum length of the inputs.
        """
        with torch.no_grad():
            embeddings = self(
                texts=texts,
                truncation=truncation,
                add_special_tokens=add_special_tokens,
                query_mode=query_mode,
                **kwargs,
            )
        return embeddings

    def forward(
        self,
        texts: list[str],
        query_mode: bool = True,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Pytorch forward method.

        Parameters
        ----------
        texts
            List of sentences to encode.
        query_mode
            Whether to encode queries or documents.
        """
        suffix = "[Q] " if query_mode else "[D] "

        texts = [suffix + text for text in texts]

        self.tokenizer.pad_token = (
            self.query_pad_token if query_mode else self.original_pad_token
        )

        kwargs = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_length_query
            if query_mode
            else self.max_length_document,
            "add_special_tokens": True,
            **kwargs,
        }

        _, embeddings = self._encode(texts=texts, **kwargs)

        return {
            "embeddings": torch.nn.functional.normalize(
                self.linear(embeddings), p=2, dim=2
            )
        }

    def scores(
        self,
        queries: list[str],
        documents: list[str],
        batch_size: int = 2,
        tqdm_bar: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """Score queries and documents.

        Parameters
        ----------
        queries
            List of queries.
        documents
            List of documents.
        batch_size
            Batch size.
        truncation
            Truncate the inputs.
        add_special_tokens
            Add special tokens.
        tqdm_bar
            Show tqdm bar.
        """
        list_scores = []

        for batch_queries, batch_documents in zip(
            utils.batchify(
                X=queries,
                batch_size=batch_size,
                desc="Computing scores.",
                tqdm_bar=tqdm_bar,
            ),
            utils.batchify(X=documents, batch_size=batch_size, tqdm_bar=False),
        ):
            queries_embeddings = self.encode(
                texts=batch_queries,
                query_mode=True,
                **kwargs,
            )

            documents_embeddings = self.encode(
                texts=batch_documents,
                query_mode=False,
                **kwargs,
            )

            late_interactions = torch.einsum(
                "bsh,bth->bst",
                queries_embeddings["embeddings"],
                documents_embeddings["embeddings"],
            )

            late_interactions = torch.max(late_interactions, axis=2).values.sum(axis=1)

            list_scores.append(late_interactions)

        return torch.cat(list_scores, dim=0)

    def save_pretrained(self, path: str) -> "ColBERT":
        """Save model the model.

        Parameters
        ----------
        path
            Path to save the model.
        """
        self.model.save_pretrained(path)
        torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt"))
        self.tokenizer.pad_token = self.original_pad_token
        with open(os.path.join(path, "metadata.json"), "w") as f:
            json.dump(
                {
                    "max_length_query": self.max_length_query,
                    "max_length_document": self.max_length_document,
                },
                f,
            )
        if self.accelerate:
            self.save_tokenizer_accelerate(path=path)
        else:
            self.tokenizer.save_pretrained(path)
        return self

Owner:
ColBERT with the call to the parent class's save_tokenizer_accelerate :)

@@ -268,7 +268,7 @@

return torch.cat(list_scores, dim=0)

def save_pretrained(self, path: str) -> "ColBERT":
def save_pretrained(self, path: str, accelerator: bool = False) -> "ColBERT":
"""Save model the model.

Parameters
@@ -279,7 +279,32 @@ def save_pretrained(self, path: str) -> "ColBERT":
self.model.save_pretrained(path)
torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt"))
self.tokenizer.pad_token = self.original_pad_token
self.tokenizer.save_pretrained(path)
if accelerator:
# Workaround an issue with accelerator. Tokenizer has a key "device"
# which is non serialisable, but not removeable with a basic delattr

# dump config
tokenizer_config = {
k: v for k, v in self.tokenizer.__dict__.items() if k != "device"
}
tokenizer_config_file = os.path.join(path, "tokenizer_config.json")
with open(tokenizer_config_file, "w", encoding="utf-8") as file:
json.dump(tokenizer_config, file, ensure_ascii=False, indent=4)

# dump vocab
self.tokenizer.save_vocabulary(path)

# save special tokens
special_tokens_file = os.path.join(path, "special_tokens_map.json")
with open(special_tokens_file, "w", encoding="utf-8") as file:
json.dump(
self.tokenizer.special_tokens_map,
file,
ensure_ascii=False,
indent=4,
)
else:
self.tokenizer.save_pretrained(path)
with open(os.path.join(path, "metadata.json"), "w") as f:
json.dump(
{
33 changes: 31 additions & 2 deletions neural_cherche/models/sparse_embed.py
import json
import os

import torch

from .. import utils

__all__ = ["SparseEmbed"]

from .splade import Splade


class SparseEmbed(Splade):
    """SparseEmbed model.

    Parameters
    ----------
    model_name_or_path
        Path to the model or the model name. It should be a SentenceTransformer model.
    embedding_size
        Size of the embeddings in the output of the SparseEmbed model.
    kwargs
        Additional parameters to the pre-trained model.

    Examples
    --------
    >>> from neural_cherche import models
    >>> import torch

    >>> _ = torch.manual_seed(42)

    >>> device = "mps"

    >>> model = models.SparseEmbed(
    ...     model_name_or_path="distilbert-base-uncased",
    ...     device=device,
    ... )

    >>> queries_embeddings = model.encode(
    ...     ["Sports", "Music"],
    ... )

    >>> queries_embeddings["activations"].shape
    torch.Size([2, 128])

    >>> queries_embeddings["sparse_activations"].shape
    torch.Size([2, 30522])

    >>> queries_embeddings["embeddings"].shape
    torch.Size([2, 128, 128])

    >>> documents_embeddings = model.encode(
    ...    ["Music is great.", "Sports is great."],
    ...    query_mode=False,
    ... )

    >>> documents_embeddings["activations"].shape
    torch.Size([2, 256])

    >>> documents_embeddings["sparse_activations"].shape
    torch.Size([2, 30522])

    >>> documents_embeddings["embeddings"].shape
    torch.Size([2, 256, 128])

    >>> model.scores(
    ...     queries=["Sports", "Music"],
    ...     documents=["Sports is great.", "Music is great."],
    ...     batch_size=1,
    ... )
    tensor([64.2330, 54.0180], device='mps:0')

    >>> _ = model.save_pretrained("checkpoint")

    >>> model = models.SparseEmbed(
    ...     model_name_or_path="checkpoint",
    ...     device="cpu",
    ... )

    >>> model.scores(
    ...     queries=["Sports", "Music"],
    ...     documents=["Sports is great.", "Music is great."],
    ...     batch_size=2,
    ... )
    tensor([64.2330, 54.0180])

    References
    ----------
    1. [SparseEmbed: Learning Sparse Lexical Representations with Contextual Embeddings for Retrieval](https://dl.acm.org/doi/pdf/10.1145/3539618.3592065)

    """

    def __init__(
        self,
        model_name_or_path: str = None,
        embedding_size: int = 128,
        max_length_query: int = 128,
        max_length_document: int = 256,
        device: str = None,
        accelerate: bool = False,
        **kwargs,
    ) -> None:
        super(SparseEmbed, self).__init__(
            model_name_or_path=model_name_or_path,
            device=device,
            extra_files_to_load=["linear.pt", "metadata.json"],
            accelerate=accelerate,
            **kwargs,
        )

        self.embedding_size = embedding_size

        self.softmax = torch.nn.Softmax(dim=2).to(self.device)

        if os.path.exists(os.path.join(self.model_folder, "linear.pt")):
            linear = torch.load(
                os.path.join(self.model_folder, "linear.pt"), map_location=self.device
            )
            self.embedding_size = linear["weight"].shape[0]
            in_features = linear["weight"].shape[1]
        else:
            with torch.no_grad():
                _, embeddings = self._encode(texts=["test"])
                in_features = embeddings.shape[2]

        self.linear = torch.nn.Linear(
            in_features=in_features,
            out_features=self.embedding_size,
            bias=False,
            device=self.device,
        )

        if os.path.exists(os.path.join(self.model_folder, "linear.pt")):
            self.linear.load_state_dict(linear)

        if os.path.exists(os.path.join(self.model_folder, "metadata.json")):
            with open(os.path.join(self.model_folder, "metadata.json"), "r") as file:
                metadata = json.load(file)

            max_length_query = metadata["max_length_query"]
            max_length_document = metadata["max_length_document"]

        self.max_length_query = max_length_query
        self.max_length_document = max_length_document

    def forward(
        self,
        texts: list[str],
        query_mode: bool = True,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Pytorch forward method.

        Parameters
        ----------
        texts
            List of documents to encode.
        query_mode
            Whether to encode queries or documents.
        """
        suffix = "[Q] " if query_mode else "[D] "

        texts = [suffix + text for text in texts]

        self.tokenizer.pad_token = (
            self.query_pad_token if query_mode else self.original_pad_token
        )

        k_tokens = self.max_length_query if query_mode else self.max_length_document

        logits, embeddings = self._encode(
            texts=texts,
            truncation=True,
            padding="max_length",
            max_length=k_tokens,
            add_special_tokens=True,
            **kwargs,
        )

        activations = self._update_activations(
            **self._get_activation(logits=logits),
            k_tokens=k_tokens,
        )

        attention = self._get_attention(
            logits=logits,
            activations=activations["activations"],
        )

        embeddings = torch.bmm(
            attention,
            embeddings,
        )

        return {
            "embeddings": self.relu(self.linear(embeddings)),
            "sparse_activations": activations["sparse_activations"],
            "activations": activations["activations"],
        }

    def _get_attention(
        self, logits: torch.Tensor, activations: torch.Tensor
    ) -> torch.Tensor:
        """Extract attention scores from MLM logits based on activated tokens."""
        attention = logits.gather(
            dim=2,
            index=torch.stack(
                [
                    torch.stack([token for _ in range(logits.shape[1])])
                    for token in activations
                ]
            ),
        )

        return self.softmax(attention)

    def save_pretrained(
        self,
        path: str,
    ):
        """Save model the model."""
        self.model.save_pretrained(path)
        self.tokenizer.pad_token = self.original_pad_token

        if self.accelerate:
            self.save_tokenizer_accelerate(path)
        else:
            self.tokenizer.save_pretrained(path)
        torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt"))
        with open(os.path.join(path, "metadata.json"), "w") as file:
            json.dump(
                fp=file,
                obj={
                    "max_length_query": self.max_length_query,
                    "max_length_document": self.max_length_document,
                },
                indent=4,
            )

        return self

    def scores(
        self,
        queries: list[str],
        documents: list[str],
        batch_size: int = 32,
        tqdm_bar: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """Compute similarity scores between queries and documents."""
        dense_scores = []

        for batch_queries, batch_documents in zip(
            utils.batchify(
                X=queries,
                batch_size=batch_size,
                desc="Computing scores.",
                tqdm_bar=tqdm_bar,
            ),
            utils.batchify(X=documents, batch_size=batch_size, tqdm_bar=False),
        ):
            queries_embeddings = self.encode(
                texts=batch_queries,
                query_mode=True,
                **kwargs,
            )

            documents_embeddings = self.encode(
                texts=batch_documents,
                query_mode=False,
                **kwargs,
            )

            dense_scores.append(
                utils.pairs_dense_scores(
                    queries_activations=queries_embeddings["activations"],
                    documents_activations=documents_embeddings["activations"],
                    queries_embeddings=queries_embeddings["embeddings"],
                    documents_embeddings=documents_embeddings["embeddings"],
                )
            )

        return torch.cat(dense_scores, dim=0)

Owner:
SparseEmbed with the call to the parent class's save_tokenizer_accelerate :)

@@ -212,11 +212,40 @@ def _get_attention(

return self.softmax(attention)

def save_pretrained(self, path: str):
def save_pretrained(
self,
path: str,
accelerator: bool = False,
):
"""Save model the model."""
self.model.save_pretrained(path)
self.tokenizer.pad_token = self.original_pad_token
self.tokenizer.save_pretrained(path)
if accelerator:
# Workaround an issue with accelerator. Tokenizer has a key "device"
# which is non serialisable, but not removeable with a basic delattr

# dump config
tokenizer_config = {
k: v for k, v in self.tokenizer.__dict__.items() if k != "device"
}
tokenizer_config_file = os.path.join(path, "tokenizer_config.json")
with open(tokenizer_config_file, "w", encoding="utf-8") as file:
json.dump(tokenizer_config, file, ensure_ascii=False, indent=4)

# dump vocab
self.tokenizer.save_vocabulary(path)

# save special tokens
special_tokens_file = os.path.join(path, "special_tokens_map.json")
with open(special_tokens_file, "w", encoding="utf-8") as file:
json.dump(
self.tokenizer.special_tokens_map,
file,
ensure_ascii=False,
indent=4,
)
else:
self.tokenizer.save_pretrained(path)
torch.save(self.linear.state_dict(), os.path.join(path, "linear.pt"))
with open(os.path.join(path, "metadata.json"), "w") as file:
json.dump(
46 changes: 36 additions & 10 deletions neural_cherche/models/splade.py
import json
import os
import string

import torch

from .. import utils
from .base import Base

__all__ = ["Splade"]


class Splade(Base):
    """SpladeV1 model.

    Parameters
    ----------
    tokenizer
        HuggingFace Tokenizer.
    model
        HuggingFace AutoModelForMaskedLM.
    kwargs
        Additional parameters to the SentenceTransformer model.

    Examples
    --------
    >>> from neural_cherche import models
    >>> import torch

    >>> _ = torch.manual_seed(42)

    >>> model = models.Splade(
    ...     model_name_or_path="distilbert-base-uncased",
    ...     device="mps",
    ... )

    >>> queries_activations = model.encode(
    ...     ["Sports", "Music"],
    ... )

    >>> documents_activations = model.encode(
    ...    ["Music is great.", "Sports is great."],
    ...    query_mode=False,
    ... )

    >>> queries_activations["sparse_activations"].shape
    torch.Size([2, 30522])

    >>> model.scores(
    ...     queries=["Sports", "Music"],
    ...     documents=["Sports is great.", "Music is great."],
    ...     batch_size=1
    ... )
    tensor([318.1384, 271.8006], device='mps:0')

    >>> _ = model.save_pretrained("checkpoint")

    >>> model = models.Splade(
    ...     model_name_or_path="checkpoint",
    ...     device="mps",
    ... )

    >>> model.scores(
    ...     queries=["Sports", "Music"],
    ...     documents=["Sports is great.", "Music is great."],
    ...     batch_size=1
    ... )
    tensor([318.1384, 271.8006], device='mps:0')

    References
    ----------
    1. [SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking](https://arxiv.org/abs/2107.05720)

    """

    def __init__(
        self,
        model_name_or_path: str = None,
        device: str = None,
        max_length_query: int = 128,
        max_length_document: int = 256,
        extra_files_to_load: list[str] = ["metadata.json"],
        accelerate: bool = False,
        **kwargs,
    ) -> None:
        super(Splade, self).__init__(
            model_name_or_path=model_name_or_path,
            device=device,
            extra_files_to_load=extra_files_to_load,
            accelerate=accelerate,
            **kwargs,
        )

        self.relu = torch.nn.ReLU().to(self.device)

        if os.path.exists(os.path.join(self.model_folder, "metadata.json")):
            with open(os.path.join(self.model_folder, "metadata.json"), "r") as file:
                metadata = json.load(file)

            max_length_query = metadata["max_length_query"]
            max_length_document = metadata["max_length_document"]

        self.max_length_query = max_length_query
        self.max_length_document = max_length_document

    def encode(
        self,
        texts: list[str],
        query_mode: bool = True,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Encode documents

        Parameters
        ----------
        texts
            List of documents to encode.
        truncation
            Whether to truncate the documents.
        padding
            Whether to pad the documents.
        max_length
            Maximum length of the documents.
        """
        with torch.no_grad():
            return self(
                texts=texts,
                query_mode=query_mode,
                **kwargs,
            )

    def decode(
        self,
        sparse_activations: torch.Tensor,
        clean_up_tokenization_spaces: bool = False,
        skip_special_tokens: bool = True,
        k_tokens: int = 96,
    ) -> list[str]:
        """Decode activated tokens ids where activated value > 0.

        Parameters
        ----------
        sparse_activations
            Activated tokens.
        clean_up_tokenization_spaces
            Whether to clean up the tokenization spaces.
        skip_special_tokens
            Whether to skip special tokens.
        k_tokens
            Number of tokens to keep.
        """
        activations = self._filter_activations(
            sparse_activations=sparse_activations, k_tokens=k_tokens
        )

        # Decode
        return [
            " ".join(
                activation.translate(str.maketrans("", "", string.punctuation)).split()
            )
            for activation in self.tokenizer.batch_decode(
                activations,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                skip_special_tokens=skip_special_tokens,
            )
        ]

    def forward(
        self,
        texts: list[str],
        query_mode: bool,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Pytorch forward method.

        Parameters
        ----------
        texts
            List of documents to encode.
        query_mode
            Whether to encode queries or documents.
        """
        suffix = "[Q] " if query_mode else "[D] "

        texts = [suffix + text for text in texts]

        self.tokenizer.pad_token = (
            self.query_pad_token if query_mode else self.original_pad_token
        )

        k_tokens = self.max_length_query if query_mode else self.max_length_document

        logits, _ = self._encode(
            texts=texts,
            truncation=True,
            padding="max_length",
            max_length=k_tokens,
            add_special_tokens=True,
            **kwargs,
        )

        activations = self._get_activation(logits=logits)

        activations = self._update_activations(
            **activations,
            k_tokens=k_tokens,
        )

        return {"sparse_activations": activations["sparse_activations"]}

    def save_pretrained(
        self,
        path: str,
    ):
        """Save model the model.

        Parameters
        ----------
        path
            Path to save the model.

        """
        self.model.save_pretrained(path)
        self.tokenizer.pad_token = self.original_pad_token

        if self.accelerate:
            self.save_tokenizer_accelerate(path)
        else:
            self.tokenizer.save_pretrained(path)

        with open(os.path.join(path, "metadata.json"), "w") as file:
            json.dump(
                fp=file,
                obj={
                    "max_length_query": self.max_length_query,
                    "max_length_document": self.max_length_document,
                },
                indent=4,
            )

        return self

    def scores(
        self,
        queries: list[str],
        documents: list[str],
        batch_size: int = 32,
        tqdm_bar: bool = True,
        **kwargs,
    ) -> torch.Tensor:
        """Compute similarity scores between queries and documents.

        Parameters
        ----------
        queries
            List of queries.
        documents
            List of documents.
        batch_size
            Batch size.
        tqdm_bar
            Show a progress bar.
        """
        sparse_scores = []

        for batch_queries, batch_documents in zip(
            utils.batchify(
                X=queries,
                batch_size=batch_size,
                desc="Computing scores.",
                tqdm_bar=tqdm_bar,
            ),
            utils.batchify(X=documents, batch_size=batch_size, tqdm_bar=False),
        ):
            queries_embeddings = self.encode(
                batch_queries,
                query_mode=True,
                **kwargs,
            )

            documents_embeddings = self.encode(
                batch_documents,
                query_mode=False,
                **kwargs,
            )

            sparse_scores.append(
                torch.sum(
                    queries_embeddings["sparse_activations"]
                    * documents_embeddings["sparse_activations"],
                    axis=1,
                )
            )

        return torch.cat(sparse_scores, dim=0)

    def _get_activation(self, logits: torch.Tensor) -> dict[str, torch.Tensor]:
        """Returns activated tokens."""
        return {"sparse_activations": torch.amax(torch.log1p(self.relu(logits)), dim=1)}

    def _filter_activations(
        self, sparse_activations: torch.Tensor, k_tokens: int
    ) -> list[torch.Tensor]:
        """Among the set of activations, select the ones with a score > 0."""
        scores, activations = torch.topk(input=sparse_activations, k=k_tokens, dim=-1)
        return [
            torch.index_select(
                activation, dim=-1, index=torch.nonzero(score, as_tuple=True)[0]
            )
            for score, activation in zip(scores, activations)
        ]

    def _update_activations(
        self, sparse_activations: torch.Tensor, k_tokens: int
    ) -> torch.Tensor:
        """Returns activated tokens."""
        activations = torch.topk(input=sparse_activations, k=k_tokens, dim=1).indices
        zero_tensor = torch.zeros_like(sparse_activations, dtype=int)
        updated_sparse_activations = sparse_activations * zero_tensor.scatter(
            dim=1, index=activations.long(), value=1
        )

        return {
            "activations": activations,
            "sparse_activations": updated_sparse_activations,
        }

Owner:
Splade with the call to the parent class's save_tokenizer_accelerate :)

@@ -206,7 +206,11 @@ def forward(

return {"sparse_activations": activations["sparse_activations"]}

def save_pretrained(self, path: str):
def save_pretrained(
self,
path: str,
accelerator: bool = False,
):
"""Save model the model.

Parameters
@@ -217,7 +221,32 @@ def save_pretrained(self, path: str):
"""
self.model.save_pretrained(path)
self.tokenizer.pad_token = self.original_pad_token
self.tokenizer.save_pretrained(path)
if accelerator:
# Workaround an issue with accelerator. Tokenizer has a key "device"
# which is non serialisable, but not removeable with a basic delattr

# dump config
tokenizer_config = {
k: v for k, v in self.tokenizer.__dict__.items() if k != "device"
}
tokenizer_config_file = os.path.join(path, "tokenizer_config.json")
with open(tokenizer_config_file, "w", encoding="utf-8") as file:
json.dump(tokenizer_config, file, ensure_ascii=False, indent=4)

# dump vocab
self.tokenizer.save_vocabulary(path)

# save special tokens
special_tokens_file = os.path.join(path, "special_tokens_map.json")
with open(special_tokens_file, "w", encoding="utf-8") as file:
json.dump(
self.tokenizer.special_tokens_map,
file,
ensure_ascii=False,
indent=4,
)
else:
self.tokenizer.save_pretrained(path)

with open(os.path.join(path, "metadata.json"), "w") as file:
json.dump(
@@ -306,15 +335,12 @@ def _update_activations(
) -> torch.Tensor:
"""Returns activated tokens."""
activations = torch.topk(input=sparse_activations, k=k_tokens, dim=1).indices

# Set value of max sparse_activations which are not in top k to 0.
sparse_activations = sparse_activations * torch.zeros(
(sparse_activations.shape[0], sparse_activations.shape[1]),
dtype=int,
device=self.device,
).scatter_(dim=1, index=activations.long(), value=1)
zero_tensor = torch.zeros_like(sparse_activations, dtype=int)
updated_sparse_activations = sparse_activations * zero_tensor.scatter(
dim=1, index=activations.long(), value=1
)

return {
"activations": activations,
"sparse_activations": sparse_activations,
"sparse_activations": updated_sparse_activations,
}
6 changes: 5 additions & 1 deletion neural_cherche/train/train_colbert.py
@@ -10,6 +10,7 @@ def train_colbert(
positive: list[str],
negative: list[str],
in_batch_negatives: bool = False,
accelerator=None,
**kwargs,
):
"""Compute the ranking loss and the flops loss for a single step.
@@ -98,7 +99,10 @@ def train_colbert(

loss = losses.Ranking()(**scores)

loss.backward()
if accelerator:
accelerator.backward(loss)
else:
loss.backward()
optimizer.step()
optimizer.zero_grad()

6 changes: 5 additions & 1 deletion neural_cherche/train/train_sparse_embed.py
@@ -16,6 +16,7 @@ def train_sparse_embed(
dense_loss_weight: float = 1.0,
in_batch_negatives: bool = False,
threshold_flops: float = 30,
accelerator=None,
**kwargs,
):
"""Compute the ranking loss and the flops loss for a single step.
@@ -147,7 +148,10 @@ def train_sparse_embed(
+ flops_loss_weight * flops_loss
)

loss.backward()
if accelerator:
accelerator.backward(loss)
else:
loss.backward()
optimizer.step()
optimizer.zero_grad()

6 changes: 5 additions & 1 deletion neural_cherche/train/train_splade.py
@@ -13,6 +13,7 @@ def train_splade(
sparse_loss_weight: float = 1.0,
in_batch_negatives: bool = False,
threshold_flops: float = 30,
accelerator=None,
**kwargs,
):
"""Compute the ranking loss and the flops loss for a single step.
@@ -117,7 +118,10 @@ def train_splade(

loss = sparse_loss_weight * sparse_loss + flops_loss_weight * flops_loss

loss.backward()
if accelerator:
accelerator.backward(loss)
else:
loss.backward()
optimizer.step()
optimizer.zero_grad()
