@@ -117,6 +117,61 @@ def _encode_inputs(
        return (batch_size, questions, context, eos_user_tokens, bos_embeds,
                pad_embeds)

+    def _label_input_ids(self, i, label, eos_tokens):
+        label_input_ids = label.input_ids[i][:MAX_NEW_TOKENS]
+        label_input_ids += eos_tokens.input_ids  # Add EOS token.
+        return label_input_ids
+
+    def _input_ids(self, i, context, question, eos_user_tokens):
+        input_ids: List[int] = []
+        if context is not None:
+            input_ids += context.input_ids[i][:MAX_TXT_LEN]
+        input_ids += question.input_ids[i]
+        input_ids += eos_user_tokens.input_ids
+        return input_ids
+
+    def _inputs_embeds(self, i, input_ids, bos_embeds, embedding=None):
+        inputs_embeds = self.word_embedding(
+            torch.tensor(input_ids, device=self.llm_device))
+
+        to_cat = [bos_embeds]
+        if embedding is not None:
+            to_cat.append(embedding[i])
+        to_cat.append(inputs_embeds)
+        inputs_embeds = torch.cat([x.to(self.llm_device) for x in to_cat],
+                                  dim=0)
+        return inputs_embeds
+
+    def append_embeds(self, inputs_embeds, batch_inputs_embeds,
+                      batch_attention_mask, label_input_ids=None,
+                      batch_label_input_ids=None):
+        batch_inputs_embeds.append(inputs_embeds)
+        batch_attention_mask.append([1] * inputs_embeds.size(0))
+        if label_input_ids is not None:
+            # Mask the prompt positions so only the label tokens are scored:
+            label_input_ids = [IGNORE_INDEX] * (
+                inputs_embeds.size(0) - len(label_input_ids)) + label_input_ids
+            batch_label_input_ids.append(label_input_ids)
+        return batch_inputs_embeds, batch_attention_mask, batch_label_input_ids
+
+    def pad_embeds(self, pad_embeds, batch_inputs_embeds, batch_attention_mask,
+                   batch_label_input_ids=None):
+        # Left-pad every sequence in the batch to the maximum length:
+        max_length = max([x.size(0) for x in batch_inputs_embeds])
+        for i in range(len(batch_inputs_embeds)):
+            pad = max_length - batch_inputs_embeds[i].size(0)
+            batch_inputs_embeds[i] = torch.cat([
+                pad_embeds.repeat(pad, 1),
+                batch_inputs_embeds[i],
+            ])
+            batch_attention_mask[i] = [0] * pad + batch_attention_mask[i]
+            if batch_label_input_ids is not None:
+                batch_label_input_ids[i] = ([IGNORE_INDEX] * pad +
+                                            batch_label_input_ids[i])
+        inputs_embeds = torch.stack(batch_inputs_embeds, dim=0)
+        attention_mask = torch.tensor(batch_attention_mask,
+                                      device=self.llm_device)
+        if batch_label_input_ids is not None:
+            label_input_ids = torch.tensor(batch_label_input_ids,
+                                           device=self.llm_device)
+        else:
+            label_input_ids = None
+        return inputs_embeds, attention_mask, label_input_ids
+
    def forward(
        self,
        question: List[str],
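The append/pad helpers above implement left-padding: every sequence in the batch is prefixed with pad-token embeddings up to the length of the longest sequence, and the attention mask is zeroed over that prefix. Below is a minimal standalone sketch of that scheme; the function name `left_pad`, the toy shapes, and the all-zero pad embedding are illustrative assumptions, not part of the diff.

import torch

def left_pad(seqs, pad_embed):
    # seqs: list of [seq_len, hidden_dim] tensors; pad_embed: [1, hidden_dim].
    max_len = max(s.size(0) for s in seqs)
    padded, masks = [], []
    for s in seqs:
        pad = max_len - s.size(0)
        padded.append(torch.cat([pad_embed.repeat(pad, 1), s]))
        masks.append([0] * pad + [1] * s.size(0))
    return torch.stack(padded), torch.tensor(masks)

inputs_embeds, attention_mask = left_pad(
    [torch.randn(3, 4), torch.randn(5, 4)], torch.zeros(1, 4))
print(inputs_embeds.shape)  # torch.Size([2, 5, 4])
print(attention_mask[0])    # tensor([0, 0, 1, 1, 1])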
@@ -150,48 +205,15 @@ def forward(
        batch_attention_mask = []
        batch_label_input_ids = []
        for i in range(batch_size):
-            label_input_ids = label.input_ids[i][:MAX_NEW_TOKENS]
-            label_input_ids += eos_tokens.input_ids  # Add EOS token.
-
-            input_ids: List[int] = []
-            if context is not None:
-                input_ids += context.input_ids[i][:MAX_TXT_LEN]
-            input_ids += question.input_ids[i]
-            input_ids += eos_user_tokens.input_ids
+            label_input_ids = self._label_input_ids(i, label, eos_tokens)
+            input_ids = self._input_ids(i, context, question, eos_user_tokens)
            input_ids += label_input_ids

-            inputs_embeds = self.word_embedding(
-                torch.tensor(input_ids, device=self.llm_device))
-
-            to_cat = [bos_embeds]
-            if embedding is not None:
-                to_cat.append(embedding[i])
-            to_cat.append(inputs_embeds)
-            inputs_embeds = torch.cat(to_cat, dim=0)
-
-            batch_inputs_embeds.append(inputs_embeds)
-            batch_attention_mask.append([1] * inputs_embeds.size(0))
-            label_input_ids = [IGNORE_INDEX] * (
-                inputs_embeds.size(0) - len(label_input_ids)) + label_input_ids
-            batch_label_input_ids.append(label_input_ids)
+            inputs_embeds = self._inputs_embeds(i, input_ids, bos_embeds,
+                                                embedding)

-        # Pad input embeddings:
-        max_length = max([x.size(0) for x in batch_inputs_embeds])
-        for i in range(batch_size):
-            pad = max_length - batch_inputs_embeds[i].size(0)
-            batch_inputs_embeds[i] = torch.cat([
-                pad_embeds.repeat(pad, 1),
-                batch_inputs_embeds[i],
-            ])
-            batch_attention_mask[i] = [0] * pad + batch_attention_mask[i]
-            batch_label_input_ids[i] = ([IGNORE_INDEX] * pad +
-                                        batch_label_input_ids[i])
+            (batch_inputs_embeds, batch_attention_mask,
+             batch_label_input_ids) = self.append_embeds(
+                 inputs_embeds, batch_inputs_embeds, batch_attention_mask,
+                 label_input_ids, batch_label_input_ids)

-        inputs_embeds = torch.stack(batch_inputs_embeds, dim=0)
-        attention_mask = torch.tensor(batch_attention_mask,
-                                      device=self.llm_device)
-        label_input_ids = torch.tensor(batch_label_input_ids,
-                                       device=self.llm_device)
+        inputs_embeds, attention_mask, label_input_ids = self.pad_embeds(
+            pad_embeds, batch_inputs_embeds, batch_attention_mask,
+            batch_label_input_ids)

        with self.autocast_context:
            outputs = self.llm(
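In forward, the label tensor produced by append_embeds and pad_embeds is IGNORE_INDEX everywhere except over the answer tokens, so the language-modeling loss only covers the answer. A small sketch of that masking with a plain cross-entropy call follows; the logits, vocabulary size, and label values are made up, and IGNORE_INDEX is assumed to be -100, the value Hugging Face-style losses ignore.

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100  # Assumed loss-masking value.

logits = torch.randn(1, 6, 10)  # [batch, seq_len, vocab_size]
labels = torch.tensor([[IGNORE_INDEX, IGNORE_INDEX, IGNORE_INDEX, 4, 7, 2]])

# Only the last three positions (the answer tokens) contribute to the loss:
loss = F.cross_entropy(logits.view(-1, 10), labels.view(-1),
                       ignore_index=IGNORE_INDEX)
print(loss)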
@@ -235,37 +257,14 @@ def inference(
        batch_attention_mask = []
        for i in range(batch_size):
            input_ids: List[int] = []
-            if context is not None:
-                input_ids = context.input_ids[i][:MAX_TXT_LEN]
-            input_ids += question.input_ids[i]
-            input_ids += eos_user_tokens.input_ids
-
-            inputs_embeds = self.word_embedding(
-                torch.tensor(input_ids, device=self.llm_device))
+            input_ids = self._input_ids(i, context, question, eos_user_tokens)

-            to_cat = [bos_embeds]
-            if embedding is not None:
-                to_cat.append(embedding[i])
-            to_cat.append(inputs_embeds)
-            inputs_embeds = torch.cat(to_cat, dim=0)
+            inputs_embeds = self._inputs_embeds(i, input_ids, bos_embeds,
+                                                embedding)

-            batch_inputs_embeds.append(inputs_embeds)
-            batch_attention_mask.append([1] * inputs_embeds.size(0))
+            batch_inputs_embeds, batch_attention_mask, _ = self.append_embeds(
+                inputs_embeds, batch_inputs_embeds, batch_attention_mask)

-        # Pad input embeddings:
-        max_length = max([x.size(0) for x in batch_inputs_embeds])
-        for i in range(batch_size):
-            pad = max_length - batch_inputs_embeds[i].size(0)
-            batch_inputs_embeds[i] = torch.cat([
-                pad_embeds.repeat(pad, 1),
-                batch_inputs_embeds[i],
-            ])
-            batch_attention_mask[i] = [0] * pad + batch_attention_mask[i]
-
-        inputs_embeds = torch.stack(batch_inputs_embeds, dim=0)
-        attention_mask = torch.tensor(batch_attention_mask,
-                                      device=self.llm_device)

+        inputs_embeds, attention_mask, _ = self.pad_embeds(
+            pad_embeds, batch_inputs_embeds, batch_attention_mask)
        bos_token = self.tokenizer(
            BOS,
            add_special_tokens=False,
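Both forward and inference now build the per-sample prompt through _input_ids: the optional context is truncated to MAX_TXT_LEN tokens, then the question tokens and the end-of-user-turn tokens are appended. A toy trace of that assembly, with made-up token ids and a made-up MAX_TXT_LEN value:

MAX_TXT_LEN = 4

context_ids = [11, 12, 13, 14, 15, 16]   # truncated to the first 4 ids
question_ids = [21, 22]
eos_user_ids = [99]

input_ids = context_ids[:MAX_TXT_LEN] + question_ids + eos_user_ids
print(input_ids)  # [11, 12, 13, 14, 21, 22, 99]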