From a83727934c6703ec1ab2c0d298ae5c93ce573c21 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Mar 2025 15:49:29 +0000 Subject: [PATCH 1/4] CU-8698f8fgc: Add new test to check that the negative sampling indices do not include non-vectored indices --- tests/test_vocab.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 5e4f8e25e..8ef6f7874 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -47,7 +47,8 @@ class VocabUnigramTableTests(unittest.TestCase): NUM_SAMPLES = 20 # NOTE: 3, 9, 18, and 27 at a time are regular due to context vector sizes NUM_TIMES = 200 # based on the counts on vocab_data.txt and the one set in setUpClass - EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845] + # EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845] + EXPECTED_FREQUENCIES = [0.04875, 0.316, 0.61075, 0.0245] TOLERANCE = 0.001 @classmethod @@ -55,6 +56,8 @@ def setUpClass(cls): cls.vocab = Vocab() cls.vocab.add_words(cls.EXAMPLE_DATA_PATH) cls.vocab.add_word("test", cnt=1310, vec=[1.42, 1.44, 1.55]) + cls.vocab.add_word("vectorless", cnt=1234, vec=None) + cls.vocab.add_word("withvector", cnt=321, vec=[1.3, 1.2, 0.8]) cls.vocab.make_unigram_table(table_size=cls.UNIGRAM_TABLE_SIZE) def setUp(self): @@ -67,7 +70,7 @@ def _get_freqs(cls) -> list[float]: got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES) c += Counter(got) total = sum(c[i] for i in c) - got_freqs = [c[i]/total for i in range(len(cls.EXPECTED_FREQUENCIES))] + got_freqs = [c[i]/total for i in c] return got_freqs def assert_accurate_enough(self, got_freqs: list[float]): @@ -75,6 +78,19 @@ def assert_accurate_enough(self, got_freqs: list[float]): np.max(np.abs(np.array(got_freqs) - self.EXPECTED_FREQUENCIES)) < self.TOLERANCE ) + def test_does_not_include_vectorless_indices(self, num_samples: int = 100): + inds = self.vocab.get_negative_samples(num_samples) + for index in inds: + with self.subTest(f"Index: {index}"): + # in the right list + self.assertIn(index, self.vocab.vec_index2word) + word = self.vocab.vec_index2word[index] + info = self.vocab.vocab[word] + # the info has vector + self.assertIn("vec", info) + # the vector is an array or a list + self.assertIsInstance(self.vocab.vec(word), (np.ndarray, list),) + def test_negative_sampling(self): got_freqs = self._get_freqs() self.assert_accurate_enough(got_freqs) From d8b167e16aa0ddd30a43b7e95ac02f382832a784 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Mar 2025 15:56:20 +0000 Subject: [PATCH 2/4] CU-8698f8fgc: Add fix for negative sampling including indices for words without a vector --- medcat/vocab.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/medcat/vocab.py b/medcat/vocab.py index b23b24190..04528fd40 100644 --- a/medcat/vocab.py +++ b/medcat/vocab.py @@ -190,8 +190,19 @@ def make_unigram_table(self, table_size: int = -1) -> None: "the creation of a massive array. So therefore, there " "is no need to pass the `table_size` parameter anymore.") freqs = [] - for word in self.vec_index2word.values(): + # index list maps the slot in which a word index + # sits in vec_index2word to the actual index for said word + # e.g: + # if we have words indexed 0, 1, and 2 + # but only 0, and 2 have corresponding vectors + # then only 0 and 2 will occur in vec_index2word + # and while 0 will be in the 0th position (as expected) + # in the final probability list, 2 will be in 1st position + # so we need to mark that conversion down + index_list = [] + for word_index, word in self.vec_index2word.items(): freqs.append(self[word]) + index_list.append(word_index) # Power and normalize frequencies freqs = np.array(freqs) ** (3/4) @@ -199,6 +210,8 @@ def make_unigram_table(self, table_size: int = -1) -> None: # Calculate cumulative probabilities self.cum_probs = np.cumsum(freqs) + # the mapping from vector index order to word indices + self._index_list = index_list def get_negative_samples(self, n: int = 6, ignore_punct_and_num: bool = False) -> List[int]: """Get N negative samples. @@ -216,8 +229,11 @@ def get_negative_samples(self, n: int = 6, ignore_punct_and_num: bool = False) - if len(self.cum_probs) == 0: self.make_unigram_table() random_vals = np.random.rand(n) - # NOTE: there's a change in numpy - inds = cast(List[int], np.searchsorted(self.cum_probs, random_vals).tolist()) + # NOTE: These indices are in terms of the cum_probs array + # which only has word data for words with vectors. + vec_slots = cast(List[int], np.searchsorted(self.cum_probs, random_vals).tolist()) + # so we need to translate these back to word indices + inds = list(map(self._index_list.__getitem__, vec_slots)) if ignore_punct_and_num: # Do not return anything that does not have letters in it From c42d7b21c3a31bc3062645bb49f894732400aed3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 26 Mar 2025 10:00:56 +0000 Subject: [PATCH 3/4] CU-8698f8fgc: Update tests to make sure index frequencies are respected --- tests/test_vocab.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 8ef6f7874..fa40a66cd 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -43,12 +43,16 @@ class VocabUnigramTableTests(unittest.TestCase): "..", "examples", "vocab_data.txt") UNIGRAM_TABLE_SIZE = 10_000 # found that this seed had the closest frequency at the sample size we're at - RANDOM_SEED = 4976 + RANDOM_SEED = 32 NUM_SAMPLES = 20 # NOTE: 3, 9, 18, and 27 at a time are regular due to context vector sizes NUM_TIMES = 200 - # based on the counts on vocab_data.txt and the one set in setUpClass - # EXPECTED_FREQUENCIES = [0.62218692, 0.32422858, 0.0535845] - EXPECTED_FREQUENCIES = [0.04875, 0.316, 0.61075, 0.0245] + # based on the counts on vocab_data.txt and the ones set in setUpClass + # plus the power of 3/4 + EXPECTED_FREQUENCIES = { + 0: 0.61078822, 1: 0.3182886, + 2: 0.05260281, + # NOTE: no 3 since that's got no vectors + 4: 0.01832037} TOLERANCE = 0.001 @classmethod @@ -64,19 +68,29 @@ def setUp(self): np.random.seed(self.RANDOM_SEED) @classmethod - def _get_freqs(cls) -> list[float]: + def _get_freqs(cls) -> dict[int, float]: c = Counter() for _ in range(cls.NUM_TIMES): got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES) c += Counter(got) - total = sum(c[i] for i in c) - got_freqs = [c[i]/total for i in c] + total = c.total() + got_freqs = {index: val/total for index, val in c.items()} return got_freqs - def assert_accurate_enough(self, got_freqs: list[float]): + @classmethod + def _get_abs_max_diff(cls, dict1: dict[int, float], + dict2: dict[int, float]): + assert dict1.keys() == dict2.keys() + vals1, vals2 = [], [] + for index in dict1: + vals1.append(dict1[index]) + vals2.append(dict2[index]) + return np.max(np.abs(np.array(vals1) - np.array(vals2))) + + def assert_accurate_enough(self, got_freqs: dict[int, float]): + self.assertEqual(got_freqs.keys(), self.EXPECTED_FREQUENCIES.keys()) self.assertTrue( - np.max(np.abs(np.array(got_freqs) - self.EXPECTED_FREQUENCIES)) < self.TOLERANCE - ) + self._get_abs_max_diff(self.EXPECTED_FREQUENCIES, got_freqs) < self.TOLERANCE) def test_does_not_include_vectorless_indices(self, num_samples: int = 100): inds = self.vocab.get_negative_samples(num_samples) From 5a6c2e5309acdd1629d8fc26391931fc94811b35 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 26 Mar 2025 10:34:34 +0000 Subject: [PATCH 4/4] CU-8698f8fgc: Add 3.9-friendly counter totalling method --- tests/test_vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_vocab.py b/tests/test_vocab.py index fa40a66cd..dd1203ad8 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -73,7 +73,7 @@ def _get_freqs(cls) -> dict[int, float]: for _ in range(cls.NUM_TIMES): got = cls.vocab.get_negative_samples(cls.NUM_SAMPLES) c += Counter(got) - total = c.total() + total = sum(c.values()) got_freqs = {index: val/total for index, val in c.items()} return got_freqs