# From adtl/transformations.py, added by the patch
# "transformations: add wordSubstituteSet (#80)".
def wordSubstituteSet(value, params):
    r"""
    Return the sorted, de-duplicated substitutes whose pattern occurs in value.

    value is a string that can contain several words, and params is a list
    of 2-item tuples or lists of the form

        [(w1, s1), (w2, s2), ... (w_n, s_n)]

    Each w_n is a regular expression. It is wrapped in `\b` word-boundary
    anchors before searching, so a match must begin and end at a word
    boundary, and case is ignored. Whenever w_n is found anywhere in value,
    s_n is included in the result.

    The pattern is enclosed in a non-capturing group so that the anchors
    apply to the whole expression, including a top-level alternation such
    as "w1|w2".

    Returns a sorted list of the distinct substitutes that matched, or None
    when nothing matched or when value is None.

    Raises ValueError if any item of params is not a tuple or list.
    """
    for item in params:
        if not isinstance(item, (tuple, list)):
            raise ValueError("wordSubstituteSet: params item not a tuple or list")
    if value is None:
        # A missing field has nothing to search; report "no substitutions"
        # instead of letting re.search fail with a TypeError.
        return None
    # dict() keeps the last substitute when the same pattern is listed twice.
    sub_map = dict(params)
    matched = {
        subst
        for match, subst in sub_map.items()
        if re.search(r"\b(?:" + match + r")\b", value, re.IGNORECASE)
    }
    return sorted(matched) if matched else None
# From tests/test_transformations.py, added by the patch
# "transformations: add wordSubstituteSet (#80)".
@pytest.mark.parametrize(
    "test_input,expected",
    [
        # Three listed words are all present; substitutes come back sorted.
        (
            (
                "Metilprednisolona - Dexametasona - Fluticasona",
                [
                    ("Metilprednisolona", "Methylprednisolone"),
                    ("Fluticasona", "Fluticasone"),
                    ("Dexametasona", "Dexamethasone"),
                ],
            ),
            ["Dexamethasone", "Fluticasone", "Methylprednisolone"],
        ),
        # Two listed words are both present.
        (
            (
                "Hidrocortisona - Fluticasona",
                [
                    ("Hidrocortisona", "Hydrocortisone"),
                    ("Fluticasona", "Fluticasone"),
                ],
            ),
            ["Fluticasone", "Hydrocortisone"],
        ),
        # "Cortisona" only occurs inside "Hidrocortisona", never as a word.
        (
            ("Hidrocortisona - Fluticasona", [("Cortisona", "Cortisone")]),
            None,
        ),
    ],
)
def test_wordSubstituteSet(test_input, expected):
    """Check the sorted substitutes returned, or None when no word matches."""
    value, params = test_input
    assert transform.wordSubstituteSet(value, params) == expected


def test_wordSubstituteSet_error():
    """Check that params items that are not tuples or lists raise ValueError."""
    bad_params = [20, 30]
    with pytest.raises(ValueError):
        transform.wordSubstituteSet("value", bad_params)