Load models which were trained without zeros to always predict non-zero. Also, don't bother training the zero predictor if a dataset has no zeros in it

AngledLuffa · AngledLuffa · commit 604bcf39f7b4 · 2025-10-01T15:03:00.000-07:00
diff --git a/stanza/models/coref/config.py b/stanza/models/coref/config.py
@@ -65,3 +65,5 @@ class Config:  # pylint: disable=too-many-instance-attributes, too-few-public-me
     singletons: bool
     
     max_train_len: int
+    use_zeros: bool
+
diff --git a/stanza/models/coref/coref_config.toml b/stanza/models/coref/coref_config.toml
@@ -119,6 +119,9 @@ conll_log_dir = "data/conll_logs"
 # Skip any documents longer than this length
 max_train_len = 5000
 
+# if this is set to false, the model zeroes the final layer of its zeros_predictor so that it always outputs 0
+use_zeros = true
+
 # =============================================================================
 # Extra keyword arguments to be passed to bert tokenizers of specified models
 [DEFAULT.tokenizer_kwargs]
diff --git a/stanza/models/coref/model.py b/stanza/models/coref/model.py
@@ -478,6 +478,14 @@ def train(self, log=False):
         docs_ids = list(range(len(docs)))
         avg_spans = docs.avg_span
 
+        # for a brand new model, we set the zeros prediction to all 0 if the dataset has no zeros
+        training_has_zeros = any('is_zero' in doc for doc in docs)
+        if not training_has_zeros:
+            logger.info("No zeros found in the dataset.  The zeros predictor will set to 0")
+            if self.epochs_trained == 0:
+                # new model, set it to always predict not-zero
+                self.disable_zeros_predictor()
+
         best_f1 = None
         for epoch in range(self.epochs_trained, self.config.train_epochs):
             self.training = True
@@ -500,7 +508,7 @@ def train(self, log=False):
 
                 res = self.run(doc)
 
-                if res.zero_scores.size(0) == 0:
+                if res.zero_scores.size(0) == 0 or not training_has_zeros:
                     z_loss = 0.0 # since there are no corefs
                 else:
                     is_zero = doc.get("is_zero")
@@ -522,7 +530,7 @@ def train(self, log=False):
 
                 running_c_loss += c_loss.item()
                 running_s_loss += s_loss.item()
-                if res.zero_scores.size(0) != 0:
+                if res.zero_scores.size(0) != 0 and training_has_zeros:
                     running_z_loss += z_loss.item()
 
                 # log every 100 docs
@@ -531,7 +539,7 @@ def train(self, log=False):
                         'train_c_loss': c_loss.item(),
                         'train_s_loss': s_loss.item(),
                     }
-                    if res.zero_scores.size(0) != 0:
+                    if res.zero_scores.size(0) != 0 and training_has_zeros:
                         logged['train_z_loss'] = z_loss.item()
                     wandb.log(logged)
 
@@ -666,6 +674,8 @@ def _build_model(self, foundation_cache):
             nn.ReLU(),
             nn.Linear(bert_emb, 1)
         ).to(self.config.device)
+        if not hasattr(self.config, 'use_zeros') or not self.config.use_zeros:
+            self.disable_zeros_predictor()
 
         self.trainable: Dict[str, torch.nn.Module] = {
             "bert": self.bert, "we": self.we,
@@ -674,6 +684,10 @@ def _build_model(self, foundation_cache):
             "sp": self.sp, "zeros_predictor": self.zeros_predictor
         }
 
+    def disable_zeros_predictor(self):
+        nn.init.zeros_(self.zeros_predictor[-1].weight)
+        nn.init.zeros_(self.zeros_predictor[-1].bias)
+
     def _build_optimizers(self):
         n_docs = len(self._get_docs(self.config.train_data))
         self.optimizers: Dict[str, torch.optim.Optimizer] = {}