From a83727934c6703ec1ab2c0d298ae5c93ce573c21 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 25 Mar 2025 15:49:29 +0000
Subject: [PATCH 1/4] CU-8698f8fgc: Add new test to check that the negative
 sampling indices do not include non-vectored indices

---
 tests/test_vocab.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 5e4f8e25e..8ef6f7874 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -47,7 +47,8 @@ class VocabUnigramTableTests(unittest.TestCase):
     NUM_SAMPLES = 20 # NOTE: 3, 9, 18, and 27 at a time are regular due to context vector sizes
     NUM_TIMES = 200
     # based on the counts on vocab_data.txt and the one set in setUpClass
-    EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845]
+    # EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845]
+    EXPECTED_FREQUENCIES = [0.04875, 0.316, 0.61075, 0.0245]
     TOLERANCE = 0.001
 
     @classmethod
@@ -55,6 +56,8 @@ def setUpClass(cls):
         cls.vocab = Vocab()
         cls.vocab.add_words(cls.EXAMPLE_DATA_PATH)
         cls.vocab.add_word("test", cnt=1310, vec=[1.42, 1.44, 1.55])
+        cls.vocab.add_word("vectorless", cnt=1234, vec=None)
+        cls.vocab.add_word("withvector", cnt=321, vec=[1.3, 1.2, 0.8])
         cls.vocab.make_unigram_table(table_size=cls.UNIGRAM_TABLE_SIZE)
 
     def setUp(self):
@@ -67,7 +70,7 @@ def _get_freqs(cls) -> list[float]:
             got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES)
             c += Counter(got)
         total = sum(c[i] for i in c)
-        got_freqs = [c[i]/total for i in range(len(cls.EXPECTED_FREQUENCIES))]
+        got_freqs = [c[i]/total for i in c]
         return got_freqs
 
     def assert_accurate_enough(self, got_freqs: list[float]):
@@ -75,6 +78,19 @@ def assert_accurate_enough(self, got_freqs: list[float]):
             np.max(np.abs(np.array(got_freqs) - self.EXPECTED_FREQUENCIES)) < self.TOLERANCE
         )
 
+    def test_does_not_include_vectorless_indices(self, num_samples: int = 100):
+        inds = self.vocab.get_negative_samples(num_samples)
+        for index in inds:
+            with self.subTest(f"Index: {index}"):
+                # in the right list
+                self.assertIn(index, self.vocab.vec_index2word)
+                word = self.vocab.vec_index2word[index]
+                info = self.vocab.vocab[word]
+                # the info has vector
+                self.assertIn("vec", info)
+                # the vector is an array or a list
+                self.assertIsInstance(self.vocab.vec(word), (np.ndarray, list),)
+
     def test_negative_sampling(self):
         got_freqs = self._get_freqs()
         self.assert_accurate_enough(got_freqs)

From d8b167e16aa0ddd30a43b7e95ac02f382832a784 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 25 Mar 2025 15:56:20 +0000
Subject: [PATCH 2/4] CU-8698f8fgc: Add fix for negative sampling including
 indices for words without a vector

---
 medcat/vocab.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/medcat/vocab.py b/medcat/vocab.py
index b23b24190..04528fd40 100644
--- a/medcat/vocab.py
+++ b/medcat/vocab.py
@@ -190,8 +190,19 @@ def make_unigram_table(self, table_size: int = -1) -> None:
                            "the creation of a massive array. So therefore, there "
                            "is no need to pass the `table_size` parameter anymore.")
         freqs = []
-        for word in self.vec_index2word.values():
+        # index list maps the slot in which a word index
+        # sits in vec_index2word to the actual index for said word
+        # e.g:
+        #    if we have words indexed 0, 1, and 2
+        #    but only 0, and 2 have corresponding vectors
+        #    then only 0 and 2 will occur in vec_index2word
+        #    and while 0 will be in the 0th position (as expected)
+        #    in the final probability list, 2 will be in 1st position
+        #    so we need to mark that conversion down
+        index_list = []
+        for word_index, word in self.vec_index2word.items():
             freqs.append(self[word])
+            index_list.append(word_index)
 
         # Power and normalize frequencies
         freqs = np.array(freqs) ** (3/4)
@@ -199,6 +210,8 @@ def make_unigram_table(self, table_size: int = -1) -> None:
 
         # Calculate cumulative probabilities
         self.cum_probs = np.cumsum(freqs)
+        # the mapping from vector index order to word indices
+        self._index_list = index_list
 
     def get_negative_samples(self, n: int = 6, ignore_punct_and_num: bool = False) -> List[int]:
         """Get N negative samples.
@@ -216,8 +229,11 @@ def get_negative_samples(self, n: int = 6, ignore_punct_and_num: bool = False) -
         if len(self.cum_probs) == 0:
             self.make_unigram_table()
         random_vals = np.random.rand(n)
-        # NOTE: there's a change in numpy
-        inds = cast(List[int], np.searchsorted(self.cum_probs, random_vals).tolist())
+        # NOTE: These indices are in terms of the cum_probs array
+        #       which only has word data for words with vectors.
+        vec_slots = cast(List[int], np.searchsorted(self.cum_probs, random_vals).tolist())
+        # so we need to translate these back to word indices
+        inds = list(map(self._index_list.__getitem__, vec_slots))
 
         if ignore_punct_and_num:
             # Do not return anything that does not have letters in it

From c42d7b21c3a31bc3062645bb49f894732400aed3 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 26 Mar 2025 10:00:56 +0000
Subject: [PATCH 3/4] CU-8698f8fgc: Update tests to make sure index frequencies
 are respected

---
 tests/test_vocab.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 8ef6f7874..fa40a66cd 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -43,12 +43,16 @@ class VocabUnigramTableTests(unittest.TestCase):
                                      "..", "examples", "vocab_data.txt")
     UNIGRAM_TABLE_SIZE = 10_000
     # found that this seed had the closest frequency at the sample size we're at
-    RANDOM_SEED = 4976
+    RANDOM_SEED = 32
     NUM_SAMPLES = 20 # NOTE: 3, 9, 18, and 27 at a time are regular due to context vector sizes
     NUM_TIMES = 200
-    # based on the counts on vocab_data.txt and the one set in setUpClass
-    # EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845]
-    EXPECTED_FREQUENCIES = [0.04875, 0.316, 0.61075, 0.0245]
+    # based on the counts on vocab_data.txt and the ones set in setUpClass
+    # plus the power of 3/4
+    EXPECTED_FREQUENCIES = {
+        0: 0.61078822, 1: 0.3182886,
+        2: 0.05260281,
+        # NOTE: no 3 since that's got no vectors
+        4: 0.01832037}
     TOLERANCE = 0.001
 
     @classmethod
@@ -64,19 +68,29 @@ def setUp(self):
         np.random.seed(self.RANDOM_SEED)
 
     @classmethod
-    def _get_freqs(cls) -> list[float]:
+    def _get_freqs(cls) -> dict[int, float]:
         c = Counter()
         for _ in range(cls.NUM_TIMES):
             got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES)
             c += Counter(got)
-        total = sum(c[i] for i in c)
-        got_freqs = [c[i]/total for i in c]
+        total = c.total()
+        got_freqs = {index: val/total for index, val in c.items()}
         return got_freqs
 
-    def assert_accurate_enough(self, got_freqs: list[float]):
+    @classmethod
+    def _get_abs_max_diff(cls, dict1: dict[int, float],
+                          dict2: dict[int, float]):
+        assert dict1.keys() == dict2.keys()
+        vals1, vals2 = [], []
+        for index in dict1:
+            vals1.append(dict1[index])
+            vals2.append(dict2[index])
+        return np.max(np.abs(np.array(vals1) - np.array(vals2)))
+
+    def assert_accurate_enough(self, got_freqs: dict[int, float]):
+        self.assertEqual(got_freqs.keys(), self.EXPECTED_FREQUENCIES.keys())
         self.assertTrue(
-            np.max(np.abs(np.array(got_freqs) - self.EXPECTED_FREQUENCIES)) < self.TOLERANCE
-        )
+            self._get_abs_max_diff(self.EXPECTED_FREQUENCIES, got_freqs) < self.TOLERANCE)
 
     def test_does_not_include_vectorless_indices(self, num_samples: int = 100):
         inds = self.vocab.get_negative_samples(num_samples)

From 5a6c2e5309acdd1629d8fc26391931fc94811b35 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 26 Mar 2025 10:34:34 +0000
Subject: [PATCH 4/4] CU-8698f8fgc: Add 3.9-friendly counter totalling method

---
 tests/test_vocab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index fa40a66cd..dd1203ad8 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -73,7 +73,7 @@ def _get_freqs(cls) -> dict[int, float]:
         for _ in range(cls.NUM_TIMES):
             got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES)
             c += Counter(got)
-        total = c.total()
+        total = sum(c.values())
         got_freqs = {index: val/total for index, val in c.items()}
         return got_freqs