diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 89473bb..d7fa37f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -35,6 +35,7 @@ jobs: - name: Test run: | python -m unittest discover + python -m unittest discover -s medcat/compare_models # TODO - in the future, we might want to add automated tests for notebooks as well # though it's not really possible right now since the notebooks are designed # in a way that assumes interaction (i.e specifying model pack names) diff --git a/medcat/compare_models/cmp_utils.py b/medcat/compare_models/cmp_utils.py new file mode 100644 index 0000000..cfd95b3 --- /dev/null +++ b/medcat/compare_models/cmp_utils.py @@ -0,0 +1,62 @@ +from typing import Type, TypeVar, Generic, Iterable, Callable, Optional + +import sqlite3 +import re +from pydantic import BaseModel + + +T = TypeVar('T', bound=BaseModel) + + +def sanitize_table_name(name, max_length=64): + # Replace any characters not allowed in table names with underscores + name = re.sub(r'[^a-zA-Z0-9_$]', '_', name) + # Truncate the name if it's too long + name = name[:max_length] + return name + + +class SaveOptions(BaseModel): + use_db: bool = False + db_file_name: Optional[str] = None + clean_callback: Optional[Callable[[], None]] = None + + +class DifferenceDatabase(Generic[T]): + + def __init__(self, db_file: str, part: str, model_type: Type[T], + batch_size: int = 100): + self.db_file = db_file + self.part = sanitize_table_name(part) + self.model_type = model_type + self.conn = sqlite3.connect(self.db_file) + self.cursor = self.conn.cursor() + self._create_table() + self._len = 0 + self._batch_size = batch_size + + def _create_table(self): + self.cursor.execute(f'''CREATE TABLE IF NOT EXISTS differences_{self.part} + (id INTEGER PRIMARY KEY, data TEXT)''') + self.conn.commit() + + def append(self, difference: T): + data = difference.json() + self.cursor.execute(f"INSERT INTO differences_{self.part} (data) VALUES (?)", (data,)) 
+ self.conn.commit() + self._len += 1 + + def __iter__(self) -> Iterable[T]: + self.cursor.execute(f"SELECT data FROM differences_{self.part}") + while True: + rows = self.cursor.fetchmany(self._batch_size) + if not rows: + break + for row in rows: + yield self.model_type.parse_raw(row[0]) + + def __len__(self) -> int: + return self._len + + def __del__(self): + self.conn.close() diff --git a/medcat/compare_models/compare.py b/medcat/compare_models/compare.py new file mode 100644 index 0000000..6c04a46 --- /dev/null +++ b/medcat/compare_models/compare.py @@ -0,0 +1,166 @@ +from typing import List, Tuple, Dict, Set, Optional, Union, Iterator +from functools import partial +import glob + +from medcat.cat import CAT + +import pandas as pd +import tqdm +import tempfile +import os + +from compare_cdb import compare as compare_cdbs, CDBCompareResults +from compare_annotations import ResultsTally, PerAnnotationDifferences +from output import parse_and_show +from cmp_utils import SaveOptions +from validation import validate_input + + + +def load_documents(file_name: str) -> Iterator[Tuple[str, str]]: + with open(file_name) as f: + df = pd.read_csv(f, names=["id", "text"]) + if df.iloc[0].id == "id" and df.iloc[0].text == "text": + # removes the header + # but also messes up the index a little + df = df.iloc[1:, :] + yield from df.itertuples(index=False) + + +def do_counting(cat1: CAT, cat2: CAT, + ann_diffs: PerAnnotationDifferences) -> ResultsTally: + def cui2name(cat, cui): + if cui in cat.cdb.cui2preferred_name: + return cat.cdb.cui2preferred_name[cui] + all_names = cat.cdb.cui2names[cui] + # longest anme + return sorted(all_names, key=lambda name: len(name), reverse=True)[0] + res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.make_stats(), + cui2name=partial(cui2name, cat1)) + res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.make_stats(), + cui2name=partial(cui2name, cat2)) + for per_doc in tqdm.tqdm(ann_diffs.per_doc_results.values()): + 
+ res1.count(per_doc.raw1) + res2.count(per_doc.raw2) + return res1, res2 + + +def _get_pt2ch(cat: CAT) -> Optional[Dict]: + return cat.cdb.addl_info.get("pt2ch", None) + + +def get_per_annotation_diffs(cat1: CAT, cat2: CAT, documents: Iterator[Tuple[str, str]], + show_progress: bool = True, + keep_raw: bool = True, + ) -> PerAnnotationDifferences: + pt2ch1: Optional[Dict] = _get_pt2ch(cat1) + pt2ch2: Optional[Dict] = _get_pt2ch(cat2) + temp_file = tempfile.NamedTemporaryFile() + save_opts = SaveOptions(use_db=True, db_file_name=temp_file.name, + clean_callback=temp_file.close) + pad = PerAnnotationDifferences(pt2ch1=pt2ch1, pt2ch2=pt2ch2, + model1_cuis=set(cat1.cdb.cui2names), + model2_cuis=set(cat2.cdb.cui2names), + keep_raw=keep_raw, + save_options=save_opts) + for doc_id, doc in tqdm.tqdm(documents, disable=not show_progress): + pad.look_at_doc(cat1.get_entities(doc), cat2.get_entities(doc), doc_id, doc) + pad.finalise() + return pad + + +def load_cui_filter(filter_file: str) -> Set[str]: + with open(filter_file) as f: + str_list = f.read().split(',') + return set(item.strip() for item in str_list) + + +def _add_all_children(cat: CAT, cui_filter: Set[str], include_children: int) -> None: + if include_children <= 0: + return + if "pt2ch" not in cat.cdb.addl_info: + return + pt2ch = cat.cdb.addl_info["pt2ch"] + children = set(ch for cui in cui_filter for ch in pt2ch.get(cui, [])) + if include_children > 1: + _add_all_children(cat, children, include_children=include_children-1) + cui_filter.update(children) + + +def load_and_train(model_pack_path: str, mct_export_path: str) -> CAT: + cat = CAT.load_model_pack(model_pack_path) + # NOTE: Allowing mct_export_path to contain a wildcard ("*").
+ # And in such a case, iterating over all matching files + if "*" not in mct_export_path: + cat.train_supervised_from_json(mct_export_path) + else: + for file in glob.glob(mct_export_path): + cat.train_supervised_from_json(file) + return cat + + +def get_diffs_for(model_pack_path_1: str, + model_pack_path_2: str, + documents_file: str, + cui_filter: Optional[Union[Set[str], str]] = None, + show_progress: bool = True, + include_children_in_filter: Optional[int] = None, + supervised_train_comparison_model: bool = False, + keep_raw: bool = True, + ) -> Tuple[CDBCompareResults, ResultsTally, ResultsTally, PerAnnotationDifferences]: + validate_input(model_pack_path_1, model_pack_path_2, documents_file, cui_filter, supervised_train_comparison_model) + documents = load_documents(documents_file) + if show_progress: + print("Loading [1]", model_pack_path_1) + cat1 = CAT.load_model_pack(model_pack_path_1) + if show_progress: + print("Loading [2]", model_pack_path_2) + if not supervised_train_comparison_model: + cat2 = CAT.load_model_pack(model_pack_path_2) + else: + if show_progress: + print("Reloading model pack 1", model_pack_path_1) + print("And subsequently training on", model_pack_path_2) + print("This may take a while, depending on the amount of " + "data is being trained on") + cat2 = load_and_train(model_pack_path_1, model_pack_path_2) + if show_progress: + print("Per annotations diff finding") + if cui_filter: + if isinstance(cui_filter, str): + cui_filter = load_cui_filter(cui_filter) + if show_progress: + print("Applying filter to CATs:", len(cui_filter), 'CUIs') + if include_children_in_filter: + if show_progress: + print("Adding all children of", include_children_in_filter, + "or lower level from first model") + _add_all_children(cat1, cui_filter, include_children_in_filter) + if show_progress: + print("After adding children from 1st model have a total of", + len(cui_filter), "CUIs") + _add_all_children(cat2, cui_filter, include_children_in_filter) + if 
show_progress: + print("After adding children from 2nd model have a total of", + len(cui_filter), "CUIs") + cat1.config.linking.filters.cuis = cui_filter + cat2.config.linking.filters.cuis = cui_filter + ann_diffs = get_per_annotation_diffs(cat1, cat2, documents, keep_raw=keep_raw) + if show_progress: + print("Counting [1&2]") + res1, res2 = do_counting(cat1, cat2, ann_diffs) + if show_progress: + print("CDB compare") + cdb_diff = compare_cdbs(cat1.cdb, cat2.cdb) + return cdb_diff, res1, res2, ann_diffs + + +def main(mpn1: str, mpn2: str, documents_file: str): + cdb_diff, res1, res2, ann_diffs = get_diffs_for(mpn1, mpn2, documents_file, show_progress=False) + print("Results:") + parse_and_show(cdb_diff, res1, res2, ann_diffs) + + +if __name__ == "__main__": + import sys + main(*sys.argv[1:]) \ No newline at end of file diff --git a/medcat/compare_models/compare_annotations.py b/medcat/compare_models/compare_annotations.py new file mode 100644 index 0000000..768bbb6 --- /dev/null +++ b/medcat/compare_models/compare_annotations.py @@ -0,0 +1,535 @@ +from typing import List, Tuple, Dict, Set, Callable, Optional, Union, Iterator, Iterable + +from pydantic import BaseModel +from enum import Enum, auto +from copy import deepcopy + +import pandas as pd +import json + +from cmp_utils import SaveOptions, DifferenceDatabase + + +class ResultsTally(BaseModel): + pt2ch: Optional[Dict[str, Set[str]]] + cat_data: dict + cui2name: Callable[[str], str] + total_count = 0 + per_cui_count: Dict[str, int] = {} + per_cui_acc: Dict[str, float] = {} + per_cui_forms: Dict[str, Set[str]] = {} + per_type_counts: Dict[str, int] = {} + + def _count(self, entity: Dict): + cui = entity['cui'] + type_ids = entity['type_ids'] + form = entity['detected_name'] + acc = entity['acc'] + if cui not in self.per_cui_count: + self.per_cui_count[cui] = 0 + self.per_cui_acc[cui] = 0 + self.per_cui_forms[cui] = set() + # update total count + self.total_count += 1 + # update accuracy + prev_cui_cnt = 
self.per_cui_count[cui] + self.per_cui_acc[cui] = (self.per_cui_acc[cui] * prev_cui_cnt + acc) / (prev_cui_cnt + 1) + # update count + self.per_cui_count[cui] = prev_cui_cnt + 1 + # update forms + self.per_cui_forms[cui].add(form) + for type_id in type_ids: + if type_id not in self.per_type_counts: + self.per_type_counts[type_id] = 0 + self.per_type_counts[type_id] += 1 + + def count(self, raw: Dict): + for _, value in raw.items(): + self._count(value) + + def summary(self) -> Dict: + summary = { + "total": self.total_count, + "per-cui": {cui: {"name": self.cui2name(cui), + "count": self.per_cui_count[cui], + "acc": self.per_cui_acc[cui], + "forms": len(self.per_cui_forms[cui])} for cui in self.per_cui_count} + } + return summary + + def _get_for_cui_recusive(self, cui: str, include_children: int = 0 + ) -> Tuple[List[str], List[int], List[float], Set[str]]: + all_names = [self.cui2name(cui), ] + all_counts = [self.per_cui_count.get(cui, 0), ] + all_accuracies = [self.per_cui_acc.get(cui, 0), ] + all_forms = self.per_cui_forms.get(cui, set()) + if include_children == 0 or not self.pt2ch: + return all_names, all_counts, all_accuracies, all_forms + for child in self.pt2ch.get(cui, []): + child_names, child_counts, child_accs, child_forms = self._get_for_cui_recusive(child, include_children-1) + all_names.extend(child_names) + all_counts.extend(child_counts) + all_accuracies.extend(child_accs) + all_forms.update(child_forms) + return all_names, all_counts, all_accuracies, all_forms + + + def get_for_cui(self, cui: str, include_children: int = 0) -> dict: + if cui not in self.per_cui_count: + return {"name": "N/A", "count": "N/A", "acc": "N/A", "forms": "N/A"} + all_names, all_counts, all_accuracies, all_forms = self._get_for_cui_recusive(cui, include_children) + names = f"{all_names[0]}" + nr_of_names = len(all_names) + if 4 > nr_of_names > 1: + names += f" ({', '.join(all_names[1:])})" + elif nr_of_names > 1: + names += f" (and {len(all_names) - 1} children)" + 
counts = sum(all_counts) + accuracies = sum(all_accuracies) + return {"name": names, + "count": counts, + "acc": accuracies, + "forms": len(all_forms)} + + + def _remove_cui(self, cui: str) -> None: + # TODO - this could potentially use all fields that start with `per_cui` + cnt = self.per_cui_count[cui] + self.total_count -= cnt + del self.per_cui_count[cui] + del self.per_cui_acc[cui] + del self.per_cui_forms[cui] + + def filter_cuis(self, cuis: Union[Set[str], List[str]]) -> None: + """Filter the results to only include the CUIs specified. + + Args: + cuis (Union[Set[str], List[str]]): The CUIs to include. + """ + for cui in list(self.per_cui_count): + if cui not in cuis: + self._remove_cui(cui) + +def _check_overlap_internal(start1: int, end1: int, start2: int, end2: int) -> bool: + if end1 < start2: + # 1st ends before 2nd starts + return False + elif end2 < start1: + # 2nd ends before 1st starts + return False + return True + + +class AnnotationComparisonType(Enum): + """Options as I see them + - 1st has annotation, 2nd doesn't + - 2nd has annotation, 1st doesn't + - Both have overlapping annotations + - One larger span, but different concept + - One larger span and same concept + - Identical, but different concept + - Identical and same concept + """ + FIRST_HAS = auto() + SECOND_HAS = auto() + OVERLAPP_1ST_LARGER_DIFF_CONCEPT = auto() + OVERLAPP_2ND_LARGER_DIFF_CONCEPT = auto() + OVERLAPP_1ST_LARGER_SAME_CONCEPT = auto() + OVERLAPP_2ND_LARGER_SAME_CONCEPT = auto() + PARTIAL_OVERLAP_DIFF_CONCEPT = auto() + PARTIAL_OVERLAP_SAME_CONCEPT = auto() + # NOTE: in the following cases we consider the annotated CUI + # so if the first annotates C101 and that does not exist + # in the second, then SAME_SPAN_CONCEPT_NOT_IN_2ND. 
+ # However, when determining this, we will do this after + # determining parents/grandparents so that these will be + # given priority (i.e if the 1st annotates a child that + # does not exist in the 2nd, but the parent does exist + # and is specified, then the parent relationship will + # be determined instead of the missing concept one) + SAME_SPAN_CONCEPT_NOT_IN_1ST = auto() + SAME_SPAN_CONCEPT_NOT_IN_2ND = auto() + SAME_SPAN_DIFF_CONCEPT = auto() + IDENTICAL = auto() + SAME_PARENT = auto() + SAME_GRANDPARENT = auto() + + def in_first(self) -> bool: + return self != AnnotationComparisonType.SECOND_HAS + + def in_second(self) -> bool: + return self != AnnotationComparisonType.FIRST_HAS + + @classmethod + def _determine_parent(cls, cui1: str, cui2: str, + pt2ch: Dict) -> Optional['AnnotationComparisonType']: + for ch in pt2ch.get(cui1, []): + if ch == cui2: + return cls.SAME_PARENT + return None + + @classmethod + def _determine_grandparent(cls, cui1: str, cui2: str, + pt2ch1: Optional[Dict], pt2ch2: Optional[Dict] + ) -> Optional['AnnotationComparisonType']: + if pt2ch1: + for ch in pt2ch1.get(cui1, []): + parent = cls._determine_parent(ch, cui2, pt2ch1) + if parent == cls.SAME_PARENT: + return cls.SAME_GRANDPARENT + if pt2ch2: + for ch in pt2ch2.get(cui2, []): + parent = cls._determine_parent(ch, cui1, pt2ch2) + if parent == cls.SAME_PARENT: + return cls.SAME_GRANDPARENT + return None + + @classmethod + def _determine_same_span(cls, cui1: str, cui2: str, + pt2ch1: Optional[Dict], pt2ch2: Optional[Dict] + ) -> 'AnnotationComparisonType': + if pt2ch1: + # check for children of cui1 in pt2ch1 + parent = cls._determine_parent(cui1, cui2, pt2ch1) + if parent: + return parent + if pt2ch2: + # check for children of cui2 in pt2ch2 + parent = cls._determine_parent(cui2, cui1, pt2ch2) + if parent: + return parent + grandparents = cls._determine_grandparent(cui1, cui2, pt2ch1, pt2ch2) + if grandparents: + return grandparents + return cls.SAME_SPAN_DIFF_CONCEPT + +
@classmethod + def _determine_missing_concept(cls, cui1: str, cui2: str, + model1_cuis: Set[str], + model2_cuis: Set[str] + ) -> 'AnnotationComparisonType': + if cui1 not in model2_cuis: + return cls.SAME_SPAN_CONCEPT_NOT_IN_2ND + elif cui2 not in model1_cuis: + return cls.SAME_SPAN_CONCEPT_NOT_IN_1ST + return cls.SAME_SPAN_DIFF_CONCEPT + + @classmethod + def determine(cls, d1: Optional[dict], d2: Optional[dict], + pt2ch1: Optional[dict], pt2ch2: Optional[dict], + model1_cuis: Set[str], model2_cuis: Set[str], + ) -> 'AnnotationComparisonType': + """Determine the annotated comparison between two annotations. + + Annotated entities are assumed to have the following keys: + ['pretty_name', 'cui', 'type_ids', 'types', 'source_value', 'detected_name', + 'acc', 'context_similarity', 'start', 'end', 'icd10', 'ontologies', + 'snomed', 'id', 'meta_anns'] + + Args: + d1 (Optional[dict]): The entity dict for 1st, or None. + d2 (Optional[dict]): The entity dict for 2nd, or None. + pt2ch1 (Optional[dict]): The parent to child mapping for the 1st. + pt2ch2 (Optional[dict]): The parent to child mapping for the 2nd. + model1_cuis (Set[str]): All CUIs in 1st model. + model2_cuis (Set[str]): All CUIs in 2nd model. 
+ + Returns: + AnnotationComparisonType: The comparison type determined for the two annotations. + """ + if d1 is None: + return cls.SECOND_HAS + if d2 is None: + return cls.FIRST_HAS + + start1, end1 = d1['start'], d1['end'] + start2, end2 = d2['start'], d2['end'] + cui1, cui2 = d1['cui'], d2['cui'] + has_overlap = _check_overlap_internal(start1, end1, start2, end2) + if not has_overlap: + if start1 < start2: + return cls.FIRST_HAS + return cls.SECOND_HAS + if start1 == start2 and end1 == end2: + if cui1 == cui2: + return cls.IDENTICAL + same_span = cls._determine_same_span(cui1, cui2, pt2ch1, pt2ch2) + if same_span != cls.SAME_SPAN_DIFF_CONCEPT: + return same_span + # determine concepts missing in one of the models + return cls._determine_missing_concept(cui1, cui2, model1_cuis, model2_cuis) + # semi-overlapping + len1 = end1 - start1 + len2 = end2 - start2 + if len1 > len2: + # first larger + if cui1 == cui2: + return cls.OVERLAPP_1ST_LARGER_SAME_CONCEPT + return cls.OVERLAPP_1ST_LARGER_DIFF_CONCEPT + if len2 > len1: + # second larger + if cui1 == cui2: + return cls.OVERLAPP_2ND_LARGER_SAME_CONCEPT + return cls.OVERLAPP_2ND_LARGER_DIFF_CONCEPT + # condition shouldn't be necessary + # if len1 == len2: + # same length, but not identical span + if cui1 == cui2: + return cls.PARTIAL_OVERLAP_SAME_CONCEPT + return cls.PARTIAL_OVERLAP_DIFF_CONCEPT + + +class AnnotationPair(BaseModel): + one: Optional[Dict] + two: Optional[Dict] + comparison_type: AnnotationComparisonType + + @classmethod + def iterate_over(cls, raw1: dict, raw2: dict, + pt2ch1: Optional[dict], pt2ch2: Optional[dict], + model1_cuis: Set[str], model2_cuis: Set[str], + ) -> Iterator['AnnotationPair']: + # keep originals + _raw1 = raw1 + _raw2 = raw2 + raw1 = deepcopy(raw1) + raw2 = deepcopy(raw2) + while len(raw1) or len(raw2): + # first key in either dict of entities + if raw1: + k1 = sorted(raw1.keys())[0] + v1 = raw1[k1] + else: + k1 = None + v1 = None + if raw2: + k2 = sorted(raw2.keys())[0] + v2 = raw2[k2] + else: + k2 = None + v2 = None + #
corresponding value in either dict of entities + comp = AnnotationComparisonType.determine(v1, v2, pt2ch1, pt2ch2, + model1_cuis, model2_cuis) + rem_1st = comp.in_first() + rem_2nd = comp.in_second() + if rem_1st: + del raw1[k1] + else: + # no overlap with 1st + v1 = None + if rem_2nd: + del raw2[k2] + else: + # no overlap with 2nd + v2 = None + if not rem_1st and not rem_2nd: + # can't move forward, would be stuck in infinite loop + raise ValueError("Unknown comparison that leaves us" + "in an infinite loop. Happened while" + f"comparing '{k1}' ({v1})" + f"to '{k2}' ({v2})") + # using parts from the original dict instead of + # the copied one for better memory management + # (since the original raw is also being kept in PerDocAnnotationDifferences) + if k1 is not None and v1 is not None: + v1 = _raw1[k1] + if k2 is not None and v2 is not None: + v2 = _raw2[k2] + yield cls(one=v1, two=v2, comparison_type=comp) + + +class PerDocAnnotationDifferences(BaseModel): + nr_of_comparisons: Dict[AnnotationComparisonType, int] = {} + all_annotation_pairs: Union[List[AnnotationPair], Iterable[AnnotationPair]] + raw_text: str + raw1: Dict + raw2: Dict + + @classmethod + def get(cls, doc_id: str, raw_text: str, d1: dict, d2: dict, + pt2ch1: Optional[dict], pt2ch2: Optional[dict], + model1_cuis: Set[str], model2_cuis: Set[str], + save_options: SaveOptions = SaveOptions(), + keep_raw: bool = True, + ) -> 'PerDocAnnotationDifferences': + # creating copies so I can ditch the entries + # that I've already dealt with + raw1 = dict(d1['entities']) + raw2 = dict(d2['entities']) + # now we have {'key': VAL} + # where VAL has keys: + # ['pretty_name', 'cui', 'type_ids', 'types', 'source_value', 'detected_name', + # 'acc', 'context_similarity', 'start', 'end', 'icd10', 'ontologies', + # 'snomed', 'id', 'meta_anns'] + comparisons: Dict[AnnotationComparisonType, int] = {} + if save_options.use_db: + all_annotation_pairs: DifferenceDatabase =
DifferenceDatabase(db_file=save_options.db_file_name, + part=doc_id, + model_type=AnnotationPair) + else: + all_annotation_pairs = [] + for pair in AnnotationPair.iterate_over(raw1, raw2, pt2ch1, pt2ch2, + model1_cuis, model2_cuis): + comp = pair.comparison_type + if comp not in comparisons: + comparisons[comp] = 0 + comparisons[comp] += 1 + all_annotation_pairs.append(pair) + if not keep_raw: + raw_text = '' + return cls(nr_of_comparisons=comparisons, all_annotation_pairs=all_annotation_pairs, + raw1=raw1, raw2=raw2, raw_text=raw_text) + + +class PerAnnotationDifferences(BaseModel): + model1_cuis: Set[str] + model2_cuis: Set[str] + pt2ch1: Optional[Dict] + pt2ch2: Optional[Dict] + save_options: SaveOptions = SaveOptions() + per_doc_results: Dict[str, PerDocAnnotationDifferences] = {} + totals: Optional[Dict[AnnotationComparisonType, int]] = None + keep_raw: bool = True + + def look_at_doc(self, d1: dict, d2: dict, doc_id: str, raw_text: str): + self.per_doc_results[doc_id] = PerDocAnnotationDifferences.get(doc_id, raw_text, d1, d2, + self.pt2ch1, self.pt2ch2, + self.model1_cuis, + self.model2_cuis, + self.save_options, + self.keep_raw) + + def finalise(self): + totals: Dict[AnnotationComparisonType, int] = {} + for value in self.per_doc_results.values(): + for k, v in value.nr_of_comparisons.items(): + if k not in totals: + totals[k] = 0 + totals[k] += v + self.totals = totals + + def iter_ann_pairs(self, + docs: Optional[Iterable[str]] = None, + omit_identical: bool = True) -> Iterator[Tuple[str, AnnotationPair]]: + """ITerate over annotation pairs, potentially only for a specific subset of documents. + + If no document IDs are specified, all documents are used. + Otherwise, only the documents specified are used. + + If the list of documents contains document IDs that have not been looked at + they will be ignored. + + Args: + docs (Optional[Iterable[str]], optional): The document IDs to use. Defaults to None. 
+ omit_identical (bool, optional): Whether to omit identical annotations. Defaults to True. + + Yields: + Iterator[Tuple[str, AnnotationPair]]: An iteration of document name and annotation pair. + """ + targets = [(doc, self.per_doc_results[doc]) for doc in self.per_doc_results + if docs is None or doc in docs] + for doc, pdad in targets: + for pair in pdad.all_annotation_pairs: + if omit_identical and pair.comparison_type == AnnotationComparisonType.IDENTICAL: + continue + yield doc, pair + + def iter_document_annotations(self, docs: Optional[Iterable[str]] = None, + types_filter: Optional[Set[AnnotationComparisonType]] = None, + ) -> Iterator[Tuple[str, str, Optional[Dict], Optional[Dict]]]: + """Iterate over document annotations (including raw text). + + Args: + docs (Optional[Iterable[str]], optional): The documents to iterate over (or all). Defaults to None. + types_filter (Optional[Set[AnnotationComparisonType]], optional): The comparison types to include (or all). Defaults to None. + + Yields: + Iterator[Tuple[str, str, Optional[Dict], Optional[Dict]]]: + The document ID, the raw text, the annotations for model 1, the annotations for model 2 + """ + targets = [(doc, self.per_doc_results[doc]) for doc in self.per_doc_results + if docs is None or doc in docs] + if types_filter is None: + types_filter = set(AnnotationComparisonType) + for doc, pdad in targets: + for pair in pdad.all_annotation_pairs: + if pair.comparison_type not in types_filter: + continue + yield doc, pdad.raw_text, pair.one, pair.two + + def _get_text(self, raw_text: str, span_char_limit: Optional[int], + ann1: Optional[dict], ann2: Optional[dict], + ) -> str: + if span_char_limit is None: + text = raw_text + else: + if ann1: + start1, end1 = ann1['start'], ann1['end'] + else: + start1, end1 = -1, -1 + if ann2: + start2, end2 = ann2['start'], ann2['end'] + if not ann1: + start1, end1 = start2, end2 + else: + start2, end2 = start1, end1 + min_char_nr = max(min(start1, start2) - span_char_limit, 0) + max_char_nr = min(max(end1, end2) +
span_char_limit, len(raw_text) + 1) + text = raw_text[min_char_nr: max_char_nr] + # update start and end chars so that they match the new text + if ann1: + ann1['start'], ann1['end'] = start1 - min_char_nr, end1 - min_char_nr + ann1['start-raw'], ann1['end-raw'] = start1, end1 + if ann2: + ann2['start'], ann2['end'] = start2 - min_char_nr, end2 - min_char_nr + ann2['start-raw'], ann2['end-raw'] = start2, end2 + return text + + def _to_raw(self, docs: Set[str], + types_filter: Set[AnnotationComparisonType], + span_char_limit: Optional[int] = 200, + ) -> List[Tuple[str, str, str, str]]: + data: List[Tuple[str, str, str, str]] = [] + for doc_id, raw_text, ann1, ann2 in self.iter_document_annotations(docs, types_filter): + text = self._get_text(raw_text, span_char_limit=span_char_limit, ann1=ann1, ann2=ann2) + # convert annotation dicts to json + data.append((doc_id, text, json.dumps(ann1), json.dumps(ann2))) + return data + + def to_csv(self, csv_file: str, + docs: Optional[Iterable[str]] = None, + types_filter: Optional[Set[AnnotationComparisonType]] = None, + span_char_limit: Optional[int] = 200) -> None: + """Generates a CSV file based on the results. + + Each annotation pair creates a line in the CSV. + + The CSV file has the following columns: + doc_id: the ID of the document for this annotation + text: the text (`span_char_limit` both ways, or the entire text if None) + ann1: the annotation for model 1 + ann2: the annotation for model 2 + + NOTE: One of the annotations in each line may be None (NaN). + This happens when one of the model did not annotate that span. + + Args: + csv_file (str): The csv file to write to. + docs (Optional[Iterable[str]], optional): The documents to include (or all). Defaults to None. + span_char_limit (Optional[int], optional): The char span limit either side (or all if None). Defaults to 200. 
+ """ + if docs is None: + docs = set(self.per_doc_results) + else: + docs = set(docs) + if types_filter is None: + types_filter = set(AnnotationComparisonType) + data = self._to_raw(docs, types_filter=types_filter, span_char_limit=span_char_limit) + df = pd.DataFrame(data, columns=["doc_id", "text", "ann1", "ann2"]) + df.to_csv(csv_file, index=False) + + + def __del__(self): + if self.save_options.use_db: + self.save_options.clean_callback() diff --git a/medcat/compare_models/compare_cdb.py b/medcat/compare_models/compare_cdb.py new file mode 100644 index 0000000..5f99574 --- /dev/null +++ b/medcat/compare_models/compare_cdb.py @@ -0,0 +1,124 @@ +from typing import Dict, Set, Tuple + +from medcat.cdb import CDB + +import tqdm +from itertools import chain + +from pydantic import BaseModel + + +class DictCompareKeys(BaseModel): + """This is based on the keys.""" + total1: int + """The total number of keys in 1st dict""" + total2: int + """The total number of keys in 2nd dict""" + joint: int + """The total number of keys (intersection)""" + not_in_1: int + """The number of keys in 2nd but not in 1st dict""" + not_in_2: int + """The number of keys in 1st but not in 2nd dict""" + + @classmethod + def get(cls, d1: dict, d2: dict) -> "DictCompareKeys": + # helpers + all1 = set(d1) + all2 = set(d2) + # total keys + total1 = len(all1) + total2 = len(all2) + # non-common keys + joint = len(all1 & all2) + all_combined = len(all1 | all2) + not_in_1 = all_combined - total1 + not_in_2 = all_combined - total2 + return cls(total1=total1, total2=total2, joint=joint, + not_in_1=not_in_1, not_in_2=not_in_2) + + +class DictCompareValues(BaseModel): + """This is based on the notion of the values being sets. 
+ + With respect to the difference between `not_in_1` and `unique_in_2`: + - If we have {"1": {"a", "b"}} and {"2": {"a", "b"}} + - The values are identical overall (`unique_in_1==unique_in_2==0`) + - However, the values are under different keys + - So `not_in_1==not_in_2==2` (since this is per key) + """ + total1: int + """The total number of values in 1st dict""" + total2: int + """The total number of values in 2nd dict""" + not_in_1: int + """The number of values in 2nd, but not in 1st (per key)""" + not_in_2: int + """The number of values in 1st, but not in 2nd (per key)""" + joint: int + """Total number of values in both 1st and 2nd dict (overall)""" + unique_in_1: int + """The number of unique values in 1nd (overall)""" + unique_in_2: int + """The number of unique values in 2nd (overall)""" + + @classmethod + def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictCompareValues": + # helpers + all_keys = set(d1) | set(d2) + vals_in_1 = set(chain.from_iterable(d1.values())) + vals_in_2 = set(chain.from_iterable(d2.values())) + # total names + total1 = sum(len(v) for v in d1.values()) + total2 = sum(len(v) for v in d2.values()) + # names ... 
+ not_in_1 = 0 + not_in_2 = 0 + for key in tqdm.tqdm(all_keys, desc="keys", disable=not progress): + n1 = d1.get(key, set()) + n2 = d2.get(key, set()) + all_vals4key = len(n1 | n2) + not_in_1 += all_vals4key - len(n1) + not_in_2 += all_vals4key - len(n2) + # names in common + joint = len(vals_in_1 & vals_in_2) + # names unique to one of the two + vals_in_one_but_not_both = vals_in_1 ^ vals_in_2 + unique_in_1 = len(vals_in_one_but_not_both & vals_in_1) + unique_in_2 = len(vals_in_one_but_not_both & vals_in_2) + return cls(total1=total1, total2=total2, not_in_1=not_in_1, + not_in_2=not_in_2, joint=joint, + unique_in_1=unique_in_1, unique_in_2=unique_in_2) + + +class DictComparisonResults(BaseModel): + keys: DictCompareKeys + values: DictCompareValues + + @classmethod + def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictComparisonResults": + return cls(keys=DictCompareKeys.get(d1, d2), + values=DictCompareValues.get(d1, d2, progress=progress)) + + +class CDBCompareResults(BaseModel): + names: DictComparisonResults + snames: DictComparisonResults + + +def compare(cdb1: CDB, + cdb2: CDB, + show_progress: bool = True) -> CDBCompareResults: + """Compare the `cui2names` and `cui2snames` mappings of two CDBs. + + Args: + cdb1 (CDB): The first CDB. + cdb2 (CDB): The second CDB. + show_progress (bool, optional): Whether to show progress bars while comparing values. Defaults to True. + + Returns: + CDBCompareResults: The key/value comparison results for names and snames. + """ + reg = DictComparisonResults.get(cdb1.cui2names, cdb2.cui2names, progress=show_progress) + snames = DictComparisonResults.get(cdb1.cui2snames, cdb2.cui2snames, progress=show_progress) + return CDBCompareResults(names=reg, snames=snames) diff --git a/medcat/compare_models/data/some_synthetic_data.csv b/medcat/compare_models/data/some_synthetic_data.csv new file mode 100644 index 0000000..6df8f84 --- /dev/null +++ b/medcat/compare_models/data/some_synthetic_data.csv @@ -0,0 +1,978 @@ +"id","text" +doc_0," +Patient Name: John Smith +Address: 15 Maple Avenue +City: New York +CC: Chronic back pain + +HX: Mr.
Smith is a 52-year-old male who has been experiencing chronic back pain for the past six months. The pain initially started after a lifting incident at work. He describes the pain as a dull ache in the lower back, which worsens with prolonged sitting or standing. He has tried over-the-counter pain medications with limited relief. Mr. Smith decided to seek medical attention due to the persistent nature of his symptoms. + +FHX: No significant family history of back pain or spinal conditions. + +SHX: Office worker. Non-smoker. Occasional alcohol consumption. + +Physical examination revealed tenderness over the lumbar spine with no signs of neurological deficit. X-rays performed on 6/10/2023 showed degenerative changes in the lumbar spine, consistent with spondylosis. + +Seen by Dr. R. Johnson on 6/15/2023. + +" +doc_1," +Patient Name: Emily Davis +Address: 22 Willow Lane +City: Los Angeles +CC: Allergic rhinitis + +HX: Miss Davis is a 28-year-old female who presents with symptoms of allergic rhinitis. She complains of frequent sneezing, nasal congestion, and itchy eyes, which have been bothering her for the past two years. Symptoms are worse during the spring and fall seasons and improve with over-the-counter antihistamines. Miss Davis seeks medical advice to explore other treatment options. + +FHX: No significant family history of allergic rhinitis or other allergic conditions. + +SHX: Office administrator. Non-smoker. No alcohol or drug use. + +Nasal examination revealed pale, boggy nasal mucosa with clear nasal discharge. Skin prick testing conducted on 6/12/2023 demonstrated positive reactions to grass pollen and dust mites. + +Seen by Dr. S. Patel on 6/17/2023. + +" +doc_2," +Patient Name: Michael Johnson +Address: 10 Oak Street +City: Chicago +CC: Acute bronchitis + +HX: Mr. Johnson is a 42-year-old male who presents with symptoms of acute bronchitis. 
He reports a cough productive of yellowish sputum, mild chest discomfort, and low-grade fever for the past five days. He denies any shortness of breath or wheezing. Mr. Johnson sought medical attention due to the persistence of symptoms and concern about the nature of his illness. + +FHX: No significant family history of respiratory conditions or chronic lung diseases. + +SHX: Construction worker. Non-smoker. Occasional alcohol consumption. + +Pulmonary examination revealed scattered coarse breath sounds with no signs of consolidation. Chest X-ray performed on 6/13/2023 showed no evidence of pneumonia. + +Seen by Dr. L. Anderson on 6/16/2023. + +" +doc_3," +Patient Name: Sarah Thompson +Address: 5 Elm Street +City: San Francisco +CC: Migraine headaches + +HX: Miss Thompson is a 30-year-old female who complains of recurrent migraine headaches. She describes the headaches as pulsating, moderate to severe in intensity, lasting for several hours to a day. The headaches are usually accompanied by nausea, vomiting, and sensitivity to light and sound. Miss Thompson reports experiencing these episodes once or twice a month for the past two years. She seeks medical advice to explore treatment options and alleviate her symptoms. + +FHX: Maternal aunt had a history of migraines. No other significant family history of neurological conditions. + +SHX: Graphic designer. Non-smoker. Rare alcohol consumption. + +Neurological examination revealed no focal deficits. Miss Thompson's headache characteristics and frequency are consistent with a diagnosis of migraines. + +Seen by Dr. K. Roberts on 6/19/2023. + +" +doc_4," +Patient Name: David Wilson +Address: 3 Pine Street +City: Houston +CC: Gastroesophageal reflux disease (GERD) + +HX: Mr. Wilson is a 48-year-old male who presents with symptoms of gastroesophageal reflux disease. He complains of frequent heartburn, regurgitation, and a bitter taste in his mouth, particularly after meals. 
Symptoms have been bothering him for the past six months, and he has noticed a decrease in his appetite and unintentional weight loss. Mr. Wilson seeks medical advice to manage his symptoms and address the weight loss. + +FHX: No significant family history of gastrointestinal conditions. + +SHX: Accountant. Non-smoker. Occasional alcohol consumption. + +Abdominal examination revealed epigastric tenderness. Upper endoscopy performed on 6/16/2023 demonstrated evidence of esophagitis and hiatal hernia. + +Seen by Dr. J. Anderson on 6/21/2023. + +" +doc_5," +Patient Name: Olivia Martinez +Address: 12 Rose Lane +City: Miami +CC: Depression + +HX: Miss Martinez is a 36-year-old female who presents with symptoms of depression. She reports feeling persistent sadness, loss of interest in activities, decreased energy, changes in appetite and sleep patterns, and difficulty concentrating for the past six months. These symptoms have significantly affected her daily functioning and overall quality of life. Miss Martinez seeks medical assistance to address her depressive symptoms. + +FHX: No significant family history of mood disorders. + +SHX: Teacher. Non-smoker. No alcohol or drug use. + +Psychiatric evaluation revealed a depressed mood, anhedonia, and impaired concentration. Based on the clinical presentation, Miss Martinez meets the criteria for major depressive disorder. + +Seen by Dr. A. Ramirez on 6/23/2023. + +" +doc_6," +Patient Name: Daniel Lee +Address: 8 Maple Street +City: Seattle +CC: Hypertension + +HX: Mr. Lee is a 58-year-old male who presents with elevated blood pressure readings during routine check-ups. He has a family history of hypertension and is concerned about his cardiovascular health. Mr. Lee has no associated symptoms but seeks medical advice to manage his blood pressure and reduce the risk of complications. + +FHX: Father and paternal grandfather had hypertension. No other significant family history of cardiovascular diseases. + +SHX: Engineer. 
Non-smoker. Occasional alcohol consumption. + +Physical examination revealed blood pressure consistently above the normal range. Further investigations, including 24-hour ambulatory blood pressure monitoring, confirmed the diagnosis of essential hypertension. + +Seen by Dr. H. Johnson on 6/25/2023. + +" +doc_7," +Patient Name: Sophia Adams +Address: 18 Cedar Avenue +City: Boston +CC: Urinary tract infection (UTI) + +HX: Miss Adams is a 24-year-old female who complains of urinary frequency, urgency, and a burning sensation during urination. Symptoms started two days ago and have progressively worsened. She denies any hematuria or fever. Miss Adams seeks medical attention due to the persistence of symptoms and concern about a possible urinary tract infection. + +FHX: No significant family history of urinary tract infections. + +SHX: Marketing executive. Non-smoker. No alcohol or drug use. + +Urinalysis revealed pyuria and positive leukocyte esterase, indicating a urinary tract infection. A midstream urine culture confirmed the presence of Escherichia coli. + +Seen by Dr. M. Patel on 6/28/2023. + +" +doc_8," +Patient Name: Benjamin Thompson +Address: 25 Oak Street +City: Chicago +CC: Seasonal allergies + +HX: Mr. Thompson is a 40-year-old male who presents with symptoms of seasonal allergies. He reports sneezing, itching, and a runny nose, particularly during the spring and summer months. Symptoms significantly interfere with his daily activities and sleep. Mr. Thompson seeks medical advice to manage his allergic symptoms. + +FHX: Mother had a history of seasonal allergies. No other significant family history of allergic conditions. + +SHX: IT specialist. Non-smoker. No alcohol or drug use. + +Allergy testing conducted on 6/26/2023 demonstrated positive reactions to grass pollen and tree pollen. + +Seen by Dr. E. Anderson on 6/30/2023. 
+ +" +doc_9," +Patient Name: Emma Davis +Address: 6 Willow Lane +City: Los Angeles +CC: Anxiety + +HX: Miss Davis is a 32-year-old female who presents with symptoms of anxiety. She reports excessive worrying, restlessness, irritability, muscle tension, and difficulty concentrating. These symptoms have been present for the past six months and have + + +" +doc_10," +Patient Name: Alexander Johnson +Address: 9 Elm Street +City: San Francisco +CC: Asthma + +HX: Mr. Johnson is a 28-year-old male who presents with symptoms of asthma. He complains of recurrent episodes of wheezing, shortness of breath, and chest tightness, particularly during physical activity and exposure to triggers such as dust and pollen. Symptoms have been present since childhood and have recently worsened. Mr. Johnson seeks medical assistance to manage his asthma symptoms and improve his quality of life. + +FHX: Mother and paternal uncle have a history of asthma. No other significant family history of respiratory conditions. + +SHX: Sales representative. Non-smoker. No alcohol or drug use. + +Pulmonary function tests revealed airflow obstruction with significant reversibility after bronchodilator administration, confirming the diagnosis of asthma. + +Seen by Dr. N. Patel on 7/2/2023. + +" +doc_11," +Patient Name: Lily Wilson +Address: 4 Pine Street +City: Houston +CC: Gastroenteritis + +HX: Miss Wilson is a 22-year-old female who presents with symptoms of gastroenteritis. She reports diarrhea, abdominal cramping, nausea, and vomiting, which started after consuming a meal at a local restaurant. Symptoms have been ongoing for the past 24 hours, and she is concerned about dehydration and the persistence of symptoms. Miss Wilson seeks medical advice for symptom relief and to ensure appropriate management. + +FHX: No significant family history of gastrointestinal conditions. + +SHX: Student. Non-smoker. No alcohol or drug use. 
+ +Physical examination revealed mild abdominal tenderness with no signs of peritonitis. Based on the clinical presentation and recent food exposure, the diagnosis of gastroenteritis is likely. + +Seen by Dr. K. Roberts on 7/5/2023. + +" +doc_12," +Patient Name: Noah Thompson +Address: 19 Cedar Avenue +City: Boston +CC: Insomnia + +HX: Mr. Thompson is a 45-year-old male who complains of difficulty falling asleep and maintaining sleep. He reports frequent awakenings during the night and feeling unrefreshed upon waking up. These symptoms have been present for the past three months and significantly affect his daytime functioning. Mr. Thompson seeks medical assistance to address his insomnia and improve his sleep quality. + +FHX: No significant family history of sleep disorders. + +SHX: Financial analyst. Non-smoker. Occasional alcohol consumption. + +Sleep diary records revealed prolonged sleep latency and frequent awakenings during the night. Based on the clinical presentation, Mr. Thompson meets the criteria for chronic insomnia disorder. + +Seen by Dr. S. Ramirez on 7/8/2023. + +" +doc_13," +Patient Name: Chloe Adams +Address: 14 Cedar Avenue +City: Boston +CC: Sinusitis + +HX: Miss Adams is a 26-year-old female who presents with symptoms of sinusitis. She reports nasal congestion, facial pressure, headache, and thick nasal discharge, which have been bothering her for the past week. Miss Adams tried over-the-counter nasal decongestants with minimal relief. She seeks medical assistance to manage her symptoms and prevent complications. + +FHX: No significant family history of sinusitis or chronic sinus conditions. + +SHX: Graphic designer. Non-smoker. No alcohol or drug use. + +Nasal examination revealed erythematous nasal mucosa with purulent discharge. Based on the clinical presentation, Miss Adams is diagnosed with acute sinusitis. + +Seen by Dr. L. Anderson on 7/11/2023. 
+ +" +doc_14," +Patient Name: Grace Turner +Address: 11 Maple Avenue +City: New York +CC: Rheumatoid arthritis + +HX: Miss Turner +" +doc_15," +Patient Name: Ethan Harris +Address: 16 Pine Street +City: Houston +CC: Gout + +HX: Mr. Harris is a 55-year-old male who presents with symptoms of gout. He reports sudden and severe joint pain, swelling, and redness in his right big toe. The symptoms started yesterday, and he has a history of similar episodes in the past. Mr. Harris seeks medical assistance to manage his acute gout attack and prevent future flares. + +FHX: No significant family history of gout or other rheumatic conditions. + +SHX: Retired. Non-smoker. Occasional alcohol consumption. + +Physical examination revealed warmth, tenderness, and erythema in the affected joint. Based on the clinical presentation and history of recurrent episodes, Mr. Harris is diagnosed with acute gouty arthritis. + +Seen by Dr. M. Johnson on 7/14/2023. + +" +doc_16," +Patient Name: Mia Clark +Address: 7 Willow Lane +City: Los Angeles +CC: Urinary incontinence + +HX: Miss Clark is a 62-year-old female who complains of urinary incontinence. She reports involuntary urine leakage, particularly with coughing, sneezing, and physical exertion. Symptoms have been present for the past six months and have progressively worsened. Miss Clark seeks medical advice to address her urinary incontinence and improve her quality of life. + +FHX: No significant family history of urinary incontinence or pelvic floor disorders. + +SHX: Retired. Non-smoker. No alcohol or drug use. + +Pelvic examination revealed weakened pelvic floor muscles. Based on the clinical presentation, Miss Clark is diagnosed with stress urinary incontinence. + +Seen by Dr. E. Patel on 7/17/2023. + +" +doc_17," +Patient Name: Samuel Wright +Address: 20 Oak Street +City: Chicago +CC: Osteoarthritis + +HX: Mr. Wright is a 70-year-old male who presents with symptoms of osteoarthritis. 
He reports joint pain, stiffness, and reduced range of motion in his knees and hands. Symptoms have been progressively worsening over the past year and significantly affect his daily activities. Mr. Wright seeks medical assistance to manage his osteoarthritis symptoms and improve his functional ability. + +FHX: No significant family history of musculoskeletal conditions. + +SHX: Retired. Non-smoker. No alcohol or drug use. + +Physical examination revealed crepitus, bony enlargement, and limited range of motion in the affected joints. Based on the clinical presentation and imaging findings, Mr. Wright is diagnosed with osteoarthritis. + +Seen by Dr. R. Anderson on 7/20/2023. + +" +doc_18," +Patient Name: Harper Turner +Address: 13 Maple Avenue +City: New York +CC: Hypothyroidism + +HX: Miss Turner is a 30-year-old female who presents with symptoms of hypothyroidism. She reports fatigue, weight gain, cold intolerance, constipation, and dry skin. These symptoms have been present for the past six months and have gradually worsened. Miss Turner seeks medical assistance to evaluate her thyroid function and explore appropriate treatment options. + +FHX: No significant family history of thyroid disorders. + +SHX: Office manager. Non-smoker. No alcohol or drug use. + +Laboratory tests revealed elevated thyroid-stimulating hormone (TSH) levels and decreased free thyroxine (T4) levels, confirming the diagnosis of primary hypothyroidism. + +Seen by Dr. S. Johnson on 7/23/2023. + +" +doc_19," +Patient Name: Ava Lewis +Address: 10 Pine Street +City: Houston + +" +doc_20," +Patient Name: Henry Adams +Address: 5 Elm Street +City: San Francisco +CC: Type 2 diabetes mellitus + +HX: Mr. Adams is a 50-year-old male who presents with symptoms of increased thirst, frequent urination, and unintentional weight loss. He reports feeling fatigued and has a family history of diabetes. 
Laboratory tests revealed elevated fasting blood glucose levels and HbA1c levels, indicating poor glycemic control. Mr. Adams seeks medical assistance to manage his diabetes and prevent complications. + +FHX: Father and two siblings have a history of type 2 diabetes. + +SHX: Teacher. Non-smoker. No alcohol or drug use. + +Based on the clinical presentation and laboratory findings, Mr. Adams is diagnosed with type 2 diabetes mellitus. + +Seen by Dr. N. Patel on 7/26/2023. + +" +doc_21," +Patient Name: Emily Wright +Address: 21 Oak Street +City: Chicago +CC: Migraine headaches + +HX: Miss Wright is a 25-year-old female who presents with recurrent episodes of severe headache accompanied by nausea, vomiting, and sensitivity to light and sound. She reports experiencing these symptoms since adolescence and seeks medical assistance to manage her migraines and improve her quality of life. + +FHX: Mother has a history of migraines. + +SHX: Graphic designer. Non-smoker. Occasional alcohol consumption. + +The clinical presentation and symptom pattern are consistent with a diagnosis of migraine headaches. + +Seen by Dr. E. Anderson on 7/29/2023. + +" +doc_22," +Patient Name: Oliver Mitchell +Address: 15 Cedar Avenue +City: Boston +CC: Plantar fasciitis + +HX: Mr. Mitchell is a 42-year-old male who presents with heel pain that is worse in the morning and improves with activity. He reports experiencing pain for the past three months, particularly after prolonged periods of standing or walking. Mr. Mitchell seeks medical assistance to alleviate his foot pain and restore his normal daily activities. + +FHX: No significant family history of foot or musculoskeletal conditions. + +SHX: IT specialist. Non-smoker. No alcohol or drug use. + +Physical examination revealed tenderness and pain along the plantar fascia. Based on the clinical presentation, Mr. Mitchell is diagnosed with plantar fasciitis. + +Seen by Dr. L. Patel on 8/2/2023. 
+ +" +doc_23," +Patient Name: Victoria Turner +Address: 17 Maple Avenue +City: New York +CC: Chronic obstructive pulmonary disease (COPD) + +HX: Miss Turner is a 60-year-old female who presents with symptoms of chronic cough, sputum production, and shortness of breath, particularly during physical exertion. She reports a history of smoking for 30 years. Pulmonary function tests revealed airflow limitation and reduced forced expiratory volume. Miss Turner seeks medical assistance to manage her COPD symptoms and optimize her respiratory function. + +FHX: No significant family history of respiratory conditions. + +SHX: Retired. Former smoker. No alcohol or drug use. + +Based on the clinical presentation, smoking history, and pulmonary function test results, Miss Turner is diagnosed with chronic obstructive pulmonary disease. + +Seen by Dr. S. Johnson on 8/5/2023. + +" +doc_24," +Patient Name: Oliver Parker +Address: 22 Oak Street +City: Chicago +CC: Allergic rhinitis + +HX: Mr. Parker is a 32-year-old male who presents with symptoms of allergic rhinitis. He reports sneezing, nasal congestion, itching, and a runny nose, particularly during the spring and fall seasons. Symptoms significantly interfere with his daily activities + +" +doc_25," +Patient Name: Isabella Cooper +Address: 12 Willow Lane +City: Los Angeles +CC: Anxiety disorder + +HX: Miss Cooper is a 27-year-old female who presents with symptoms of anxiety. She reports excessive worry, restlessness, irritability, muscle tension, and difficulty sleeping. These symptoms have been present for the past year and have progressively worsened. Miss Cooper seeks medical assistance to address her anxiety symptoms and improve her overall well-being. + +FHX: No significant family history of anxiety disorders. + +SHX: Accountant. Non-smoker. Occasional alcohol consumption. + +Psychiatric evaluation revealed symptoms consistent with generalized anxiety disorder. 
Miss Cooper is experiencing significant distress and impairment in multiple areas of her life. + +Seen by Dr. E. Ramirez on 8/8/2023. + +" +doc_26," +Patient Name: Jacob Martinez +Address: 18 Elm Street +City: San Francisco +CC: Hypertensive crisis + +HX: Mr. Martinez is a 60-year-old male with a known history of hypertension. He presents with severe headache, chest pain, and shortness of breath. He reports missing his antihypertensive medication for the past three days. Upon measurement, his blood pressure is significantly elevated. Mr. Martinez seeks urgent medical attention to manage his hypertensive crisis. + +FHX: Father had a history of hypertension and stroke. + +SHX: Retired. Non-smoker. Occasional alcohol consumption. + +Physical examination and blood pressure measurements confirm the diagnosis of hypertensive crisis. Immediate interventions are initiated to lower blood pressure and prevent complications. + +Seen by Dr. H. Johnson on 8/11/2023. + +" +doc_27," +Patient Name: Ava Foster +Address: 14 Pine Street +City: Houston +CC: Peptic ulcer disease + +HX: Miss Foster is a 35-year-old female who presents with symptoms of abdominal pain, particularly in the upper abdomen. She reports a burning sensation and occasional nausea. Symptoms worsen after meals. Miss Foster seeks medical assistance to evaluate her abdominal pain and determine the underlying cause. + +FHX: No significant family history of gastrointestinal conditions. + +SHX: Marketing executive. Non-smoker. Occasional alcohol consumption. + +Gastroscopy reveals a duodenal ulcer. Helicobacter pylori testing is performed, and the results confirm the presence of H. pylori infection. + +Seen by Dr. M. Johnson on 8/14/2023. + +" +doc_28," +Patient Name: William Turner +Address: 11 Cedar Avenue +City: Boston +CC: Major depressive disorder + +HX: Mr. Turner is a 38-year-old male who presents with symptoms of depression. 
He reports a persistent depressed mood, loss of interest in activities, feelings of worthlessness, changes in appetite, and difficulty concentrating. These symptoms have been present for the past six months and significantly impair his daily functioning. Mr. Turner seeks medical assistance to address his depressive symptoms. + +FHX: No significant family history of mood disorders. + +SHX: Software engineer. Non-smoker. No alcohol or drug use. + +Psychiatric evaluation reveals symptoms consistent with major depressive disorder. Mr. Turner exhibits significant distress and impairment in multiple areas of his life. + +Seen by Dr. L. Anderson on 8/17/2023. + +" +doc_29," +Patient Name: Sophia Reed +Address: 9 Willow Lane +City: Los Angeles +CC: Iron-deficiency anemia + +HX: Miss Reed is a 29-year-old female who presents with symptoms of fatigue, weakness, and shortness of breath. She reports heavy menstrual bleeding and follows a vegetarian diet. Miss Reed seeks medical assistance to evaluate her symptoms and determine the cause of her anemia. + +FHX: No significant + +" +doc_30," +Name: Olivia Davis +Address: 12 Elm Street +City: Springfield +CC: Chronic back pain. + +HX: Ms. Davis is a 45-year-old female who presents with chronic lower back pain for the past six months. The pain is described as dull and aching, primarily localized to the lumbar region. It worsens with prolonged sitting or physical activity. She has tried over-the-counter pain medications with limited relief. + +FHX: No family history of chronic back pain or spinal disorders. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +On examination, there is tenderness on palpation over the lumbar spine. Range of motion is slightly restricted. No neurological deficits are noted. + +Seen by Dr. R. Martinez on 10/15/2023. + +" +doc_31," +Name: Ethan Thompson +Address: 18 Oak Avenue +City: Riverside +CC: Abdominal pain. + +HX: Mr. 
Thompson is a 32-year-old male presenting with intermittent abdominal pain for the past two weeks. The pain is localized to the right lower quadrant and is associated with occasional nausea. It is not aggravated by food intake. No changes in bowel movements or urinary symptoms. + +FHX: No significant family history of abdominal disorders. + +SHX: Office worker. Non-smoker. Occasional alcohol consumption. + +Abdominal examination reveals tenderness and mild guarding in the right lower quadrant. No rebound tenderness or palpable masses are noted. + +Seen by Dr. S. Reynolds on 10/18/2023. + +" +doc_32," +Name: Sophia Walker +Address: 9 Maple Lane +City: Willowville +CC: Fatigue and weakness. + +HX: Ms. Walker is a 52-year-old female who presents with complaints of persistent fatigue and weakness for the past two months. She reports feeling tired even after a good night's sleep and experiences difficulty in performing routine tasks. No specific triggers or alleviating factors identified. + +FHX: No family history of chronic fatigue or neuromuscular disorders. + +SHX: Homemaker. Non-smoker. No alcohol consumption. + +Physical examination reveals generalized weakness without focal neurological deficits. No abnormal findings on cardiovascular or respiratory examination. + +Seen by Dr. L. Carter on 10/21/2023. + +" +doc_33," +Name: Benjamin Harris +Address: 5 Pine Street +City: Meadowville +CC: Headache and dizziness. + +HX: Mr. Harris is a 38-year-old male presenting with recurrent headaches and dizziness for the past month. The headaches are described as throbbing in nature and occur mostly in the afternoon. Dizziness is experienced upon standing up quickly or with sudden head movements. + +FHX: No significant family history of migraines or vestibular disorders. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +Neurological examination is unremarkable. No abnormal findings on visual acuity, coordination, or gait. + +Seen by Dr. M. Rodriguez on 10/24/2023. 
+ +" +doc_34," +Name: Lily Green +Address: 23 Cedar Road +City: Woodville +CC: Allergic rhinitis. + +HX: Ms. Green is a 28-year-old female presenting with symptoms of sneezing, nasal congestion, and itchy, watery eyes for the past few weeks. Symptoms are worse in the morning and improve throughout the day. She reports a history of seasonal allergies. + +FHX: No significant family history of allergic rhinitis or respiratory disorders. + +SHX: Teacher. Non-smoker. No alcohol consumption. + +Physical examination reveals clear nasal discharge, congestion, and allergic shiners. No signs of respiratory distress. + +Seen by Dr. K. Mitchell on + +" +doc_35," +Name: Henry Foster +Address: 14 Willow Street +City: Meadowville +CC: Cough and shortness of breath. + +HX: Mr. Foster is a 62-year-old male presenting with a persistent cough and shortness of breath for the past two weeks. The cough is productive of yellowish sputum. He reports feeling breathless even with minimal exertion. No fever or chest pain. + +FHX: No significant family history of respiratory disorders. + +SHX: Retired. Former smoker (quit 10 years ago). No alcohol consumption. + +Chest auscultation reveals decreased breath sounds and scattered crackles. No wheezing or dullness on percussion. + +Seen by Dr. S. Adams on 10/27/2023. + +" +doc_36," +Name: Emily Evans +Address: 8 Cherry Lane +City: Riverside +CC: Sleep disturbances. + +HX: Ms. Evans is a 35-year-old female presenting with complaints of sleep disturbances for the past month. She reports difficulty falling asleep and frequent awakenings during the night. No daytime sleepiness or snoring. No significant life stressors identified. + +FHX: No family history of sleep disorders or psychiatric conditions. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +No significant findings on physical examination. Normal mental status and intact concentration. + +Seen by Dr. L. Carter on 10/30/2023. 
+ +" +doc_37," +Name: Samuel Hayes +Address: 11 Elm Street +City: Springfield +CC: Abnormal mole. + +HX: Mr. Hayes is a 42-year-old male who noticed an abnormal mole on his back. The mole has increased in size and has an irregular border. He reports occasional itching but no pain or bleeding. + +FHX: No significant family history of skin cancer or melanoma. + +SHX: Construction worker. Non-smoker. Occasional alcohol consumption. + +Skin examination reveals a dark, asymmetrical mole with irregular borders and uneven coloration. No palpable lymph nodes in the surrounding area. + +Seen by Dr. R. Martinez on 11/2/2023. + +" +doc_38," +Name: Isabella Simmons +Address: 6 Oak Avenue +City: Willowville +CC: Joint pain and swelling. + +HX: Ms. Simmons is a 55-year-old female presenting with joint pain and swelling in her hands and knees for the past three months. The pain is worse in the morning and improves with movement. No history of trauma or recent infections. + +FHX: No significant family history of autoimmune disorders or arthritis. + +SHX: Teacher. Non-smoker. Rare alcohol consumption. + +Joint examination reveals swelling and tenderness in the proximal and distal interphalangeal joints and knees. No erythema or warmth. + +Seen by Dr. S. Reynolds on 11/5/2023. + +" +doc_39," +Name: Daniel Thompson +Address: 9 Maple Lane +City: Willowville +CC: Epigastric pain and heartburn. + +HX: Mr. Thompson is a 48-year-old male presenting with epigastric pain and heartburn for the past two weeks. The pain is described as a burning sensation and is aggravated by spicy foods and lying down after meals. No vomiting or black, tarry stools. + +FHX: No significant family history of gastrointestinal disorders. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +Abdominal examination reveals epigastric tenderness on palpation. No rebound tenderness or organomegaly. + +Seen by Dr. L. Carter on 11/8/2023. 
+ +" +doc_40," +Name: Emily Turner +Address: 15 Pine Street +City: Meadowville +CC: Fatigue and weight gain. + +HX: Ms. Turner is a 30-year-old female presenting with persistent fatigue and unexplained weight gain over the past six months. She reports feeling tired despite getting adequate sleep and has noticed a significant increase in her weight without changes in her diet or exercise routine. + +FHX: No significant family history of endocrine disorders or autoimmune conditions. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +Physical examination reveals no specific abnormalities. No edema or thyroid enlargement palpable. + +Seen by Dr. M. Rodriguez on 11/11/2023. + +" +doc_41," +Name: Oliver Clark +Address: 7 Cedar Road +City: Woodville +CC: Swollen lymph nodes. + +HX: Mr. Clark is a 44-year-old male presenting with enlarged lymph nodes in his neck and groin for the past two weeks. The lymph nodes are painless and progressively increasing in size. No fever or night sweats reported. + +FHX: No significant family history of lymphatic disorders or malignancies. + +SHX: Teacher. Non-smoker. Rare alcohol consumption. + +Lymph node examination reveals palpable, enlarged lymph nodes in the neck and groin regions. No other abnormal findings. + +Seen by Dr. K. Mitchell on 11/14/2023. + +" +doc_42," +Name: Ava Patterson +Address: 13 Willow Street +City: Meadowville +CC: Irregular menstrual cycles. + +HX: Ms. Patterson is a 27-year-old female presenting with irregular menstrual cycles for the past six months. She reports unpredictable timing, varying durations, and occasional heavy bleeding during her periods. No significant pain or other associated symptoms. + +FHX: No significant family history of gynecological disorders or hormonal imbalances. + +SHX: Office worker. Non-smoker. No alcohol consumption. + +Pelvic examination reveals no palpable masses or tenderness. Normal external genitalia and vaginal walls. + +Seen by Dr. S. Adams on 11/17/2023. 
+ +" +doc_43," +Name: Noah Turner +Address: 16 Oak Avenue +City: Riverside +CC: Frequent urination and increased thirst. + +HX: Mr. Turner is a 58-year-old male presenting with frequent urination and increased thirst for the past month. He reports waking up multiple times during the night to urinate and feeling constantly thirsty throughout the day. No significant weight changes or other urinary symptoms. + +FHX: No significant family history of diabetes or renal disorders. + +SHX: Retired. Non-smoker. Occasional alcohol consumption. + +No specific findings on physical examination. No edema or signs of dehydration. + +Seen by Dr. L. Carter on 11/20/2023. + +" +doc_44," +Name: Mia Mitchell +Address: 10 Cherry Lane +City: Riverside +CC: Skin rash and itching. + +HX: Ms. Mitchell is a 36-year-old female presenting with a skin rash and intense itching for the past week. The rash is characterized by red, raised bumps and appears primarily on her arms and legs. It worsens at night and with exposure to heat. + +FHX: No significant family history of skin conditions or allergies. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +Skin examination reveals multiple erythematous papules and plaques with excoriation marks. No signs of infection. + +Seen by Dr. R. Martinez on 11/23/2023. + +" +doc_45," +Name: Ethan Johnson +Address: 11 Maple Lane +City: Willowville +CC: Abdominal bloating and constipation. + +HX: Mr. Johnson is a 50-year-old male presenting with complaints of abdominal bloating and constipation for the past two months. He reports feeling full quickly after eating and experiences infrequent bowel movements. No significant changes in diet or exercise. + +FHX: No significant family history of gastrointestinal disorders. + +SHX: Construction worker. Non-smoker. Occasional alcohol consumption. + +Abdominal examination reveals distension and mild tenderness on palpation. No masses or organomegaly appreciated. + +Seen by Dr. S. Reynolds on 11/26/2023. 
+ +" +doc_46," +Name: Sophia Nelson +Address: 17 Elm Street +City: Springfield +CC: Anxiety and panic attacks. + +HX: Ms. Nelson is a 33-year-old female presenting with symptoms of anxiety and recurrent panic attacks for the past six months. She describes episodes of sudden fear, rapid heartbeat, shortness of breath, and sweating. No specific triggers identified. + +FHX: No significant family history of anxiety or psychiatric disorders. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +Normal findings on physical examination. No signs of distress during the evaluation. + +Seen by Dr. M. Rodriguez on 11/29/2023. + +" +doc_47," +Name: Olivia Clark +Address: 9 Cedar Road +City: Woodville +CC: Knee pain and swelling. + +HX: Ms. Clark is a 42-year-old female presenting with pain and swelling in her right knee for the past month. She reports that the symptoms started gradually and worsen with prolonged activity or climbing stairs. No history of trauma or previous knee issues. + +FHX: No significant family history of joint disorders or arthritis. + +SHX: Teacher. Non-smoker. No alcohol consumption. + +On examination, there is swelling and tenderness in the right knee joint. Limited range of motion due to pain. + +Seen by Dr. K. Mitchell on 12/2/2023. + +" +doc_48," +Name: Benjamin Anderson +Address: 12 Oak Avenue +City: Riverside +CC: Sore throat and difficulty swallowing. + +HX: Mr. Anderson is a 28-year-old male presenting with a sore throat and difficulty swallowing for the past week. He reports pain and discomfort with swallowing, especially with solid foods. No fever, cough, or other respiratory symptoms. + +FHX: No significant family history of throat infections or inflammatory conditions. + +SHX: Office worker. Non-smoker. Occasional alcohol consumption. + +Throat examination reveals erythema and swelling of the posterior pharynx. No tonsillar enlargement or exudate. + +Seen by Dr. L. Carter on 12/5/2023. 
+ +" +doc_49," +Name: Lily Cooper +Address: 14 Cherry Lane +City: Riverside +CC: Frequent headaches. + +HX: Ms. Cooper is a 25-year-old female presenting with recurrent headaches for the past three months. The headaches occur several times a week and are described as throbbing in nature. No specific triggers or associated symptoms identified. + +FHX: No significant family history of migraines or neurological disorders. + +SHX: Teacher. Non-smoker. Rare alcohol consumption. + +Normal neurological examination. No focal deficits or abnormalities. + +Seen by Dr. R. Martinez on 12/8/2023. + +" +doc_50," +Name: Sophia Williams +Address: 15 Elm Street +City: Springfield +CC: Fatigue and muscle weakness. + +HX: Ms. Williams is a 42-year-old female presenting with persistent fatigue and muscle weakness for the past two months. She reports feeling tired even after getting sufficient rest and experiences difficulty performing daily activities. No significant weight changes or other associated symptoms. + +FHX: No significant family history of muscular disorders or autoimmune conditions. + +SHX: Office worker. Non-smoker. Occasional alcohol consumption. + +Physical examination reveals decreased muscle strength and generalized weakness. No specific findings on neurological evaluation. + +Seen by Dr. S. Adams on 12/11/2023. + +" +doc_51," +Name: Benjamin Turner +Address: 16 Pine Street +City: Meadowville +CC: Chest pain and shortness of breath. + +HX: Mr. Turner is a 52-year-old male presenting with complaints of chest pain and shortness of breath for the past week. The chest pain is described as a squeezing sensation and is accompanied by breathlessness during exertion. No associated symptoms of dizziness or palpitations. + +FHX: No significant family history of cardiac disorders or cardiovascular conditions. + +SHX: Retired. Non-smoker. Rare alcohol consumption. + +Cardiovascular examination reveals regular heart sounds and no murmurs. No signs of respiratory distress. 
+ +Seen by Dr. M. Rodriguez on 12/14/2023. + +" +doc_52," +Name: Chloe Parker +Address: 13 Cedar Road +City: Woodville +CC: Frequent urination and burning sensation. + +HX: Ms. Parker is a 30-year-old female presenting with frequent urination and a burning sensation during urination for the past week. She reports a sense of urgency to urinate and occasional lower abdominal discomfort. No fever or back pain. + +FHX: No significant family history of urinary tract infections or urological conditions. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +No specific findings on physical examination. No costovertebral angle tenderness. + +Seen by Dr. K. Mitchell on 12/17/2023. + +" +doc_53," +Name: Oliver Lewis +Address: 10 Maple Lane +City: Willowville +CC: Vision changes and eye pain. + +HX: Mr. Lewis is a 60-year-old male presenting with vision changes and intermittent eye pain in his right eye for the past month. He reports blurred vision and the sensation of pressure in the eye. No redness or discharge noted. + +FHX: No significant family history of eye disorders or ocular conditions. + +SHX: Construction worker. Non-smoker. Occasional alcohol consumption. + +Visual acuity testing reveals decreased vision in the right eye. No external abnormalities or conjunctival injection. + +Seen by Dr. S. Reynolds on 12/20/2023. + +" +doc_54," +Name: Emma Peterson +Address: 17 Oak Avenue +City: Riverside +CC: Abdominal pain and diarrhea. + +HX: Ms. Peterson is a 38-year-old female presenting with abdominal pain and frequent episodes of diarrhea for the past week. The abdominal pain is crampy in nature and is associated with loose, watery stools. No blood or mucus in the stool. + +FHX: No significant family history of gastrointestinal disorders. + +SHX: Teacher. Non-smoker. No alcohol consumption. + +Abdominal examination reveals tenderness in the lower abdomen. No rebound tenderness or palpable masses. + +Seen by Dr. L. Carter on 12/23/2023. 
+ +" +doc_55," +Name: Amelia Adams +Address: 11 Willow Street +City: Meadowville +CC: Depression and loss of interest. + +HX: Ms. Adams is a 35-year-old female presenting with symptoms of depression and loss of interest in activities for the past six months. She reports feeling sad, hopeless, and having a decreased motivation to engage in previously enjoyed hobbies. No suicidal thoughts or changes in appetite. + +FHX: No significant family history of mood disorders or psychiatric conditions. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. + +No specific findings on physical examination. No signs of distress during the evaluation. + +Seen by Dr. R. Martinez on 12/26/2023. + +" +doc_56," +Name: Henry Turner +Address: 12 Pine Street +City: Meadowville +CC: Joint pain and stiffness. + +HX: Mr. Turner is a 60-year-old male presenting with joint pain and stiffness in his hands and knees for the past three months. He reports difficulty with movements, especially in the mornings, and occasional swelling in the affected joints. No history of trauma or previous joint disorders. + +FHX: No significant family history of arthritis or rheumatic conditions. + +SHX: Retired. Non-smoker. Rare alcohol consumption. + +On examination, there is tenderness, warmth, and swelling in the affected joints. Limited range of motion due to pain. + +Seen by Dr. M. Rodriguez on 12/29/2023. + +" +doc_57," +Name: Harper Mitchell +Address: 14 Cedar Road +City: Woodville +CC: Allergic rhinitis and nasal congestion. + +HX: Ms. Mitchell is a 28-year-old female presenting with symptoms of allergic rhinitis, including nasal congestion, sneezing, and itchy eyes, for the past two weeks. She reports these symptoms are worse in the morning and in certain environments. No history of sinus infections or nasal polyps. + +FHX: No significant family history of allergies or respiratory conditions. + +SHX: Office worker. Non-smoker. Rare alcohol consumption. 
+ +Nasal examination reveals nasal congestion, clear rhinorrhea, and pale, boggy nasal mucosa. No signs of septal deviation or polyps. + +Seen by Dr. K. Mitchell on 1/2/2024. + +" +doc_58," +Name: Jackson Turner +Address: 9 Maple Lane +City: Willowville +CC: Sleep disturbances and daytime sleepiness. + +HX: Mr. Turner is a 45-year-old male presenting with complaints of sleep disturbances and excessive daytime sleepiness for the past three months. He reports difficulty falling asleep, frequent awakenings during the night, and feeling tired during the day despite sufficient hours of sleep. + +FHX: No significant family history of sleep disorders or neurological conditions. + +SHX: Construction worker. Non-smoker. Occasional alcohol consumption. + +No specific findings on physical examination. No signs of respiratory disorders. + +Seen by Dr. S. Reynolds on 1/5/2024. + +" +doc_59," +Name: Penelope Walker +Address: 13 Oak Avenue +City: Riverside +CC: Nausea and vomiting. + +HX: Ms. Walker is a 42-year-old female presenting with symptoms of nausea and vomiting for the past two days. She reports episodes of sudden, uncontrollable vomiting and a persistent feeling of queasiness. No abdominal pain or changes in bowel movements. + +FHX: No significant family history of gastrointestinal disorders. + +SHX: Teacher. Non-smoker. No alcohol consumption. + +Abdominal examination reveals no tenderness or palpable masses. No signs of dehydration. + +Seen by Dr. L. Carter on 1/8/2024." diff --git a/medcat/compare_models/model_comparison.ipynb b/medcat/compare_models/model_comparison.ipynb new file mode 100644 index 0000000..0a723d9 --- /dev/null +++ b/medcat/compare_models/model_comparison.ipynb @@ -0,0 +1,1504 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This notebook will attempt to show how to compare two models\n", + "\n", + "We often arrive in a situation where we've got multiple different models. 
Yet we're not sure which one we should focus on or start from for a particular task.\n", + "This notebook aims to introduce some tools that (hopefully) help us do that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initial input - models and data\n", + "\n", + "There are two different workflows this notebook can handle:\n", + "1. Compare two different model packs\n", + " - Provide 2 model pack paths\n", + " - Provide a documents file\n", + "2. Compare model pack with and without supervised training\n", + " - Provide 1 model pack path\n", + " - Provide a file path to a MedCATtrainer (MCT) export\n", + " - Provide a documents file\n", + "\n", + "The model packs can be either the `.zip` file (which will be automatically unzipped) or the folder.\n", + "\n", + "The documents file is expected in a `.csv` format with two columns (`id`, and `text`).\n", + "\n", + "The MCT export is expected in the format given by MedCATtrainer.\n", + "\n", + "For the two approaches, there is a slightly different internal workflow.\n", + "But other than ticking the checkbox, the process should be identical to the user."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "af259b92622d4fd2b04ab64549e17715", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FileChooser(path='/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/models/modelpack', …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c9edb3221d464cb7a2968d42595188fa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FileChooser(path='/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/models/modelpack', …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "75fd5d8c615149ff8bc94f0fdacde303", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FileChooser(path='/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/medcat/compare_mode…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipyfilechooser import FileChooser\n", + "from ipywidgets import widgets\n", + "import os\n", + "_def_path = '../../models/modelpack'\n", + "_def_path = _def_path if os.path.exists(_def_path) else '.'\n", + "model1_chooser = FileChooser(_def_path)\n", + "model2_chooser = FileChooser(_def_path)\n", + "documents_chooser = FileChooser(\".\")\n", + "display(model1_chooser)\n", + "display(model2_chooser)\n", + "display(documents_chooser)\n", + "ckbox = widgets.Checkbox(description=\"MCT export compare\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CUI filter settings\n", + "\n", + "These are optional.\n", + "\n", + "If you wish to filter based on CUIs (i.e only run the comparison for some CUIs), you can do so.\n", + "You can either list the CUIs (separated by 
comma) or provide a file that lists them (separated by comma).\n", + "\n", + "You can also include the children of the selected CUIs. The default is not to do so.\n", + "But you can opt to include children of a certain order (i.e `1` means direct children only, `2` means children of children as well, and so on)." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "59dd2ef908544289b722f01df6bb2cd3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FileChooser(path='/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/medcat/compare_mode…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "94b9caa5b32548bc8d0a64a36ebcff6b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Textarea(value='', description='CUI list')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6e7cbc6e4b4d4b05b9eb9c731811afc1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntText(value=-1, description='Children')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipywidgets import widgets\n", + "cui_filter_chooser = FileChooser(\".\", description=\"The CUI filter file\")\n", + "cui_filter_box = widgets.Textarea(description=\"CUI list\")\n", + "cui_children = widgets.IntText(description=\"Children\", value=-1)\n", + "display(cui_filter_chooser)\n", + "display(cui_filter_box)\n", + "display(cui_children)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For models, selected:\n", + "Model1: 
/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/models/modelpack/KCH2024_snomed_no_enrichment.zip\n", + "Model2: /Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/models/modelpack/SNOMED2024_UK_FINAL_0c0de303b6dc0020.zip\n", + "Documents: /Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/medcat/compare_models/data/some_synthetic_data.csv\n", + "For CUI filter, selected:\n", + "Filter: None\n", + "Children: None\n" + ] + } + ], + "source": [ + "model_path_1 = model1_chooser.selected\n", + "model_path_2 = model2_chooser.selected\n", + "documents_file = documents_chooser.selected\n", + "is_mct_export_compare = ckbox.value\n", + "if not is_mct_export_compare:\n", + " print(f\"For models, selected:\\nModel1: {model_path_1}\\nModel2: {model_path_2}\"\n", + " f\"\\nDocuments: {documents_file}\")\n", + "else:\n", + " print(f\"Selected:\\nModel: {model_path_1}\\nMCT export: {model_path_2}\"\n", + " f\"\\nDocuments: {documents_file}\")\n", + "# CUI filter\n", + "cui_filter = None\n", + "filter_children = None\n", + "if cui_filter_chooser.selected:\n", + " cui_filter = cui_filter_chooser.selected\n", + "elif cui_filter_box.value:\n", + " cui_filter = cui_filter_box.value\n", + "if cui_children.value and cui_children.value > 0:\n", + " filter_children = cui_children.value\n", + "print(f\"For CUI filter, selected:\\nFilter: {cui_filter}\\nChildren: {filter_children}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Running the difference finder\n", + "\n", + "Now that we've got the input data, we need to figure out how they work and what their differences are.\n", + "We use the `get_diffs_for` method that loads both models, runs `CAT.get_entities` on each document for either model, and then returns some results.\n", + "\n", + "These results describe the difference in the raw CDB (i.e the number of concepts (joint and unique), amount of training, and so on), the total 
differences in the entities extracted (i.e the number of recognitions and forms per CUI) as well as per document differences (i.e the number of identical as well as different entity recognitions found).\n", + "\n", + "We will look into the details later." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading [1] ../../../MedCAT/temp/model_packs/20230227__kch_gstt_trained_model_494c3717f637bb89.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/martratas/Documents/CogStack/.MedCAT.nosync/working_with_cogstack/venv310/lib/python3.10/site-packages/spacy/util.py:877: UserWarning: [W095] Model 'en_core_web_md' (3.1.0) was trained with spaCy v3.1 and may not be 100% compatible with the current version (3.4.4). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. 
For more details and available updates, run: python -m spacy validate\n", + " warnings.warn(warn_msg)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading [2] ../../../MedCAT/temp/model_packs/snomed2024_kch_trained_d4092ab9f5360973.zip\n", + "Per annotations diff finding\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 60/60 [00:09<00:00, 6.53it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counting [1&2]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 60/60 [00:00<00:00, 10632.40it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CDB compare\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "keys: 100%|██████████| 794151/794151 [00:01<00:00, 557600.58it/s]\n", + "keys: 100%|██████████| 794151/794151 [00:02<00:00, 308384.42it/s]\n" + ] + } + ], + "source": [ + "from compare import get_diffs_for\n", + "from output import parse_and_show, show_dict_deep, compare_dicts\n", + "\n", + "cdb_comp, tally1, tally2, ann_diffs = get_diffs_for(model_path_1, model_path_2, documents_file, cui_filter=cui_filter, include_children_in_filter=filter_children,\n", + " supervised_train_comparison_model=is_mct_export_compare)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For now, we'll use the common parser/display method to display an overview of the results.\n", + "We can later look at more granular details as well."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CDB overall differences:\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| names.keys.joint | 752042 | |\n", + "| names.keys.total | 760283 | 785910 |\n", + "| names.keys.not_in_ | 33868 | 8241 |\n", + "| names.values.joint | 2327941 | |\n", + "| names.values.total | 3149859 | 2510372 |\n", + "| names.values.unique_in_ | 752906 | 152108 |\n", + "| names.values.not_in_ | 170834 | 810321 |\n", + "| snames.keys.joint | 752042 | |\n", + "| snames.keys.total | 760283 | 785910 |\n", + "| snames.keys.not_in_ | 33868 | 8241 |\n", + "| snames.values.joint | 5094031 | |\n", + "| snames.values.total | 13486640 | 11958247 |\n", + "| snames.values.unique_in_ | 1565939 | 349022 |\n", + "| snames.values.not_in_ | 670099 | 2198492 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now tally differences\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pt2ch (Dict[str, Set]) | 352226 keys (mean 2.0 values per key) | 147466 keys (mean 2.0 values per key) |\n", + "| cat_data | {'Number of concepts': 760283, 'Number of names': 3080845, 'Number of concepts that received training': 38460, 'Number of seen training examples in total': 153875883, 'Average training examples per concept': 4000.932995319813} | {'Number of concepts': 785910, 'Number of names': 2480049, 'Number of concepts that received training': 373727, 'Number of seen training examples in total': 1474910653, 'Average training examples per concept': 3946.492099848285} |\n", + "| per_cui_count (Dict[str, int]) | 621 keys (total 2220 in value) | 584 keys (total 2162 in value) |\n", + "| per_cui_acc (Dict[str, 
float]) | 621 keys (mean 0.9029113037725474 in value) | 584 keys (mean 0.963999005716541 in value) |\n", + "| per_cui_forms (Dict[str, Set]) | 621 keys (mean 2.0 values per key) | 584 keys (mean 2.0 values per key) |\n", + "| per_type_counts (Dict[str, int]) | 25 keys (total 2220 in value) | 24 keys (total 2162 in value) |\n", + "| total_count | 2220 | 2162 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now per-annotation differences:\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 1406 | |\n", + "| FIRST_HAS | 419 | |\n", + "| SECOND_HAS | 361 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 183 | |\n", + "| SAME_GRANDPARENT | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 129 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 18 | |\n", + "| SAME_PARENT | 38 | |\n", + "| OVERLAPP_2ND_LARGER_DIFF_CONCEPT | 14 | |\n", + "| OVERLAPP_1ST_LARGER_SAME_CONCEPT | 9 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_1ST | 3 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# show results\n", + "parse_and_show(cdb_comp, tally1, tally2, ann_diffs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More granular details (per document view)\n", + "\n", + "The above does not give us all the information we need.\n", + "For instance, we may also want to compare the performance across some documents.\n", + "We can do so as follows."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_0 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 41 | |\n", + "| FIRST_HAS | 6 | |\n", + "| SECOND_HAS | 6 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 3 | |\n", + "| SAME_GRANDPARENT | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 4 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_1 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 28 | |\n", + "| FIRST_HAS | 10 | |\n", + "| SECOND_HAS | 5 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 3 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 3 | |\n", + "| SAME_PARENT | 2 | |\n", + "| OVERLAPP_2ND_LARGER_DIFF_CONCEPT | 1 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 33 | |\n", + "| FIRST_HAS | 6 | |\n", + "| SECOND_HAS | 6 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 2 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 2 | |\n", + "| SAME_PARENT | 1 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_3 \n", + "====================\n" + ] + }, + { + 
"data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 26 | |\n", + "| FIRST_HAS | 5 | |\n", + "| OVERLAPP_1ST_LARGER_SAME_CONCEPT | 2 | |\n", + "| SECOND_HAS | 10 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 1 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 10 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 3 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_4 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 25 | |\n", + "| FIRST_HAS | 6 | |\n", + "| OVERLAPP_1ST_LARGER_SAME_CONCEPT | 1 | |\n", + "| SECOND_HAS | 6 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 1 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_1ST | 1 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 2 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 3 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_5 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 29 | |\n", + "| FIRST_HAS | 6 | |\n", + "| SECOND_HAS | 6 | |\n", + "| OVERLAPP_2ND_LARGER_DIFF_CONCEPT | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 1 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 2 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_6 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 28 | |\n", + 
"| FIRST_HAS | 9 | |\n", + "| SECOND_HAS | 6 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 2 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 1 | |\n", + "| SAME_PARENT | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 1 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_7 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 28 | |\n", + "| FIRST_HAS | 7 | |\n", + "| OVERLAPP_1ST_LARGER_SAME_CONCEPT | 1 | |\n", + "| SECOND_HAS | 4 | |\n", + "| SAME_SPAN_CONCEPT_NOT_IN_2ND | 1 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 1 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 2 | |\n", + "| OVERLAPP_2ND_LARGER_DIFF_CONCEPT | 1 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_8 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 24 | |\n", + "| FIRST_HAS | 6 | |\n", + "| SECOND_HAS | 7 | |\n", + "| OVERLAPP_1ST_LARGER_DIFF_CONCEPT | 2 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 3 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_9 \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | Value | [Optional] Comparison |\n", + "| ----- | ----- | ----- |\n", + "| IDENTICAL | 12 | |\n", + "| FIRST_HAS | 4 | |\n", + "| SECOND_HAS | 6 | |\n", + "| SAME_SPAN_DIFF_CONCEPT | 1 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# you can 
play with individual parts as well.\n", + "# for example, isolate a specific document\n", + "ann_diffs.per_doc_results.keys()\n", + "\n", + "for key in list(ann_diffs.per_doc_results.keys())[0:10]:\n", + " print('='*20,f'\\n{key}', f'\\n{\"=\"*20}')\n", + " show_dict_deep(ann_diffs.per_doc_results[key].nr_of_comparisons)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving annotation output to CSV file\n", + "You can also save the annotation output to a .csv file. That file includes the following columns:\n", + "```\n", + "doc_id text ann1 ann2\n", + "```\n", + "where `doc_id` refers to the ID of the document in question, `text` is the relevant text around the specific annotation, `ann1` is the annotation json for model 1 (if present), and `ann2` is the annotation json for model 2 (if present).\n", + "\n", + "*Note:* One of the annotations may not be present. This is the case if one of the models did not annotate that specific span." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "ann_diffs.to_csv(\"23vs24_annotations.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More granular details (per cui view)\n", + "\n", + "We may also want to look at how we did for a specific CUI.\n", + "This is how we can do that."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| name | Headache (and 76 children) | Headache (and 96 children) |\n", + "| count | 12 | 18 |\n", + "| acc | 3.0 | 3.0 |\n", + "| forms | 3 | 3 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# cui = '37151006' # Erythromelalgia\n", + "cui = '25064002' # headache\n", + "per_cui1 = tally1.get_for_cui(cui, include_children=2)\n", + "per_cui2 = tally2.get_for_cui(cui, include_children=2)\n", + "compare_dicts(per_cui1, per_cui2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More granular details (per annotation view)\n", + "Sometimes we may want to look at things on a per annotation basis as well.\n", + "That is, we want to look at some annotations and compare them between the two models." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Genus Quercus | |\n", + "| cui | 53347009 | |\n", + "| type_ids | ['81102976'] | |\n", + "| types | [''] | |\n", + "| source_value | Oak | |\n", + "| detected_name | oak | |\n", + "| acc | 0.6368384509248382 | |\n", + "| context_similarity | 0.6368384509248382 | |\n", + "| start | 43 | |\n", + "| end | 46 | |\n", + "| icd10 | [] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 3 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 43 | |\n", + "| end-raw | 46 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Milliliter | |\n", + "| cui | 258773002 | |\n", + "| type_ids | ['7882689'] | |\n", + "| types | [''] | |\n", + "| source_value | CC | |\n", + "| detected_name | cc | |\n", + "| acc | 0.5504460208011586 | |\n", + "| context_similarity | 0.5504460208011586 | |\n", + "| start | 68 | |\n", + "| end | 70 | |\n", + "| icd10 | [] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 5 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 68 | |\n", + "| end-raw | 70 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | Acute bronchitis |\n", + "| cui | | 10509002 |\n", + "| type_ids | | ['9090192'] |\n", + "| types | | ['disorder'] |\n", + "| source_value | | Acute bronchitis |\n", + "| detected_name | | acute~bronchitis |\n", + "| acc | | 1.0 |\n", + "| context_similarity | | 1.0 |\n", + "| start | | 72 |\n", + "| end | | 88 |\n", + "| icd10 | | ['J205', 'J206', 'J208', 'J202', 'J207', 'J200', 'J201', 'J700', 'J209', 'J203', 'J204', 'J680'] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 6 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 72 |\n", + "| end-raw | | 88 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | History of (contextual qualifier) | |\n", + "| cui | 392521001 | |\n", + "| type_ids | ['7882689'] | |\n", + "| types | [''] | |\n", + "| source_value | HX | |\n", + "| detected_name | hx | |\n", + "| acc | 1.0 | |\n", + "| context_similarity | 1.0 | |\n", + "| start | 90 | |\n", + "| end | 92 | |\n", + "| icd10 | [] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 9 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 90 | |\n", + "| end-raw | 92 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | year |\n", + "| cui | | 258707000 |\n", + "| type_ids | | ['7882689'] |\n", + "| types | | ['qualifier value'] |\n", + "| source_value | | year |\n", + "| detected_name | | year |\n", + "| acc | | 0.99 |\n", + "| context_similarity | | 0.99 |\n", + "| start | | 114 |\n", + "| end | | 118 |\n", + "| icd10 | | [] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 9 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 114 |\n", + "| end-raw | | 118 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + 
"data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | Old episode |\n", + "| cui | | 272131007 |\n", + "| type_ids | | ['7882689'] |\n", + "| types | | ['qualifier value'] |\n", + "| source_value | | old |\n", + "| detected_name | | old |\n", + "| acc | | 0.9644956622075471 |\n", + "| context_similarity | | 0.9644956622075471 |\n", + "| start | | 119 |\n", + "| end | | 122 |\n", + "| icd10 | | [] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 10 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 119 |\n", + "| end-raw | | 122 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | Male |\n", + "| cui | | 248153007 |\n", + "| type_ids | | ['67667581'] |\n", + "| types | | ['finding'] |\n", + "| source_value | | male |\n", + "| detected_name | | male |\n", + "| acc | | 0.99 |\n", + "| context_similarity | | 0.99 |\n", + "| start | | 123 |\n", + "| end | | 127 |\n", + "| icd10 | | ['#NC'] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 11 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 123 |\n", + "| end-raw | | 127 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Presentation | |\n", + "| cui | 246105001 | |\n", + 
"| type_ids | ['43039974'] | |\n", + "| types | [''] | |\n", + "| source_value | presents | |\n", + "| detected_name | present | |\n", + "| acc | 0.4530222896013254 | |\n", + "| context_similarity | 0.4530222896013254 | |\n", + "| start | 132 | |\n", + "| end | 140 | |\n", + "| icd10 | [] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 15 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 132 | |\n", + "| end-raw | 140 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | Report |\n", + "| cui | | 229059009 |\n", + "| type_ids | | ['90170645'] |\n", + "| types | | ['record artifact'] |\n", + "| source_value | | reports |\n", + "| detected_name | | report |\n", + "| acc | | 1.0 |\n", + "| context_similarity | | 1.0 |\n", + "| start | | 179 |\n", + "| end | | 186 |\n", + "| icd10 | | [] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 15 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 179 |\n", + "| end-raw | | 186 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.OVERLAPP_1ST_LARGER_DIFF_CONCEPT) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Productive cough | Cough |\n", + "| cui | 28743005 | 49727002 |\n", + "| type_ids | ['67667581'] | ['67667581'] |\n", + "| types | [''] | ['finding'] |\n", + "| 
source_value | cough productive | cough |\n", + "| detected_name | cough~productive | cough |\n", + "| acc | 1.0 | 1.0 |\n", + "| context_similarity | 1.0 | 1.0 |\n", + "| start | 189 | 189 |\n", + "| end | 205 | 194 |\n", + "| icd10 | ['R05'] | ['R05X', 'J410', 'J111', 'F453', 'R042'] |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | ['SNOMED-CT'] |\n", + "| snomed | [] | [] |\n", + "| id | 21 | 16 |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 189 | 189 |\n", + "| end-raw | 205 | 194 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SAME_SPAN_DIFF_CONCEPT) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | In the past | Past |\n", + "| cui | 410513005 | 716861000000108 |\n", + "| type_ids | ['7882689'] | ['90170645'] |\n", + "| types | [''] | ['record artifact'] |\n", + "| source_value | past | past |\n", + "| detected_name | past | past |\n", + "| acc | 0.915658888260423 | 0.99 |\n", + "| context_similarity | 0.915658888260423 | 0.99 |\n", + "| start | 200 | 200 |\n", + "| end | 204 | 204 |\n", + "| icd10 | [] | [] |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | ['SNOMED-CT'] |\n", + "| snomed | [] | [] |\n", + "| id | 31 | 26 |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 278 | 278 |\n", + "| end-raw | 282 | 282 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Mitral 
valve regurgitation | |\n", + "| cui | 48724000 | |\n", + "| type_ids | ['9090192'] | |\n", + "| types | [''] | |\n", + "| source_value | Mr | |\n", + "| detected_name | mr | |\n", + "| acc | 0.3131859075574164 | |\n", + "| context_similarity | 0.3131859075574164 | |\n", + "| start | 200 | |\n", + "| end | 202 | |\n", + "| icd10 | ['I34.0'] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 36 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 341 | |\n", + "| end-raw | 343 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SECOND_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | | Medical |\n", + "| cui | | 74188005 |\n", + "| type_ids | | ['7882689'] |\n", + "| types | | ['qualifier value'] |\n", + "| source_value | | medical |\n", + "| detected_name | | medical |\n", + "| acc | | 1.0 |\n", + "| context_similarity | | 1.0 |\n", + "| start | | 200 |\n", + "| end | | 207 |\n", + "| icd10 | | [] |\n", + "| ontologies | | ['SNOMED-CT'] |\n", + "| snomed | | [] |\n", + "| id | | 31 |\n", + "| meta_anns (Dict[str, dict]) | 0 | 0 |\n", + "| start-raw | | 360 |\n", + "| end-raw | | 367 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.FIRST_HAS) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Respiratory rate | |\n", + "| cui | 86290005 | |\n", + "| type_ids | ['2680757'] | |\n", + "| types | [''] | |\n", + "| source_value | 
respiratory | |\n", + "| detected_name | respiratory | |\n", + "| acc | 0.370196740030693 | |\n", + "| context_similarity | 0.370196740030693 | |\n", + "| start | 200 | |\n", + "| end | 211 | |\n", + "| icd10 | [] | |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | |\n", + "| snomed | [] | |\n", + "| id | 50 | |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 497 | |\n", + "| end-raw | 508 | |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SAME_SPAN_DIFF_CONCEPT) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Disease | Condition |\n", + "| cui | 64572001 | 260905004 |\n", + "| type_ids | ['9090192'] | ['43039974'] |\n", + "| types | [''] | ['attribute'] |\n", + "| source_value | conditions | conditions |\n", + "| detected_name | condition | condition |\n", + "| acc | 0.5839914477394028 | 1.0 |\n", + "| context_similarity | 0.5839914477394028 | 1.0 |\n", + "| start | 200 | 200 |\n", + "| end | 210 | 210 |\n", + "| icd10 | [''] | [] |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | ['SNOMED-CT'] |\n", + "| snomed | [] | [] |\n", + "| id | 51 | 44 |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 509 | 509 |\n", + "| end-raw | 519 | 519 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.OVERLAPP_1ST_LARGER_DIFF_CONCEPT) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Alcoholic beverage intake | Substance with alcohol structure |\n", + "| cui 
| 897148007 | 53041004 |\n", + "| type_ids | ['2680757'] | ['91187746'] |\n", + "| types | [''] | ['substance'] |\n", + "| source_value | alcohol consumption | alcohol |\n", + "| detected_name | alcohol~consumption | alcohol |\n", + "| acc | 1.0 | 1.0 |\n", + "| context_similarity | 1.0 | 1.0 |\n", + "| start | 200 | 200 |\n", + "| end | 219 | 207 |\n", + "| icd10 | [] | [] |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | ['SNOMED-CT'] |\n", + "| snomed | [] | [] |\n", + "| id | 64 | 55 |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 596 | 596 |\n", + "| end-raw | 615 | 603 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================== \n", + "doc_2 (AnnotationComparisonType.SAME_PARENT) \n", + "====================\n" + ] + }, + { + "data": { + "text/markdown": [ + "| Path | First | Second |\n", + "| ----- | ----- | ----- |\n", + "| pretty_name | Physical examination procedure | Examination - action |\n", + "| cui | 5880005 | 302199004 |\n", + "| type_ids | ['28321150'] | ['7882689'] |\n", + "| types | [''] | ['qualifier value'] |\n", + "| source_value | examination | examination |\n", + "| detected_name | examination | examination |\n", + "| acc | 0.99 | 1.0 |\n", + "| context_similarity | 0.99 | 1.0 |\n", + "| start | 200 | 200 |\n", + "| end | 211 | 211 |\n", + "| icd10 | [] | [] |\n", + "| ontologies | ['20220803_SNOMED_UK_CLINICAL_EXT'] | ['SNOMED-CT'] |\n", + "| snomed | [] | [] |\n", + "| id | 66 | 57 |\n", + "| meta_anns (Dict[str, dict]) | 3 | 0 |\n", + "| start-raw | 628 | 628 |\n", + "| end-raw | 639 | 639 |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can iterate over annotation pairs.\n", + "# we may optionally specify the documents we wish to look at\n", + "# we will specify one document here so as to not generate too much 
# ---- medcat/compare_models/output.py ----
# Helpers for rendering model-comparison results either as plain text
# (terminal) or as Markdown tables (Jupyter notebook).


def is_notebook() -> bool:
    """Report whether the code is running inside a Jupyter notebook kernel.

    Returns:
        bool: True only when the active shell class is
            'ZMQInteractiveShell' (Jupyter notebook or qtconsole).
            False for terminal IPython, any other shell type, or when
            `get_ipython` is not defined at all.
    """
    try:
        shell = get_ipython().__class__.__name__
    except NameError:
        # `get_ipython` is not defined - probably the standard interpreter
        return False
    # 'TerminalInteractiveShell' (terminal IPython) and every other shell
    # type are treated as "not a notebook"
    return shell == 'ZMQInteractiveShell'


def _get_other_key(key: str) -> str:
    """Get the corresponding paired key.

    This expects the last character of the string to be a number.
    It is designed to work for 1 and 2; and will turn the former into
    the latter and vice versa.

    Args:
        key (str): The input key.

    Returns:
        str: The output paired key.
    """
    # "1" -> False or "2" -> True
    helper = bool(int(key[-1:]) - 1)
    other_nr = "1" if helper else "2"
    return f"{key[:-1]}{other_nr}"


def _has_paired_key(d: dict, key: str) -> bool:
    """Checks whether the key has a paired key in the dict.

    Args:
        d (dict): The dict to look in.
        key (str): The key in question.

    Returns:
        bool: Whether or not the paired key exists in the dict.
    """
    if not isinstance(key, str):
        return False
    if not key.endswith(("1", "2")):
        return False
    return _get_other_key(key) in d


def default_formatter(path: str, v1: str, v2: Optional[str] = None) -> str:
    """Format one row as three tab-separated, 40-character wide columns.

    Args:
        path (str): The path (row label).
        v1 (str): The first value.
        v2 (Optional[str]): The comparison value. Only `None` is rendered
            as an empty column; falsy values such as 0 are shown as-is.

    Returns:
        str: The formatted row.
    """
    second = '' if v2 is None else str(v2)
    return f"{path:40s}\t{str(v1):40s}\t{second:40s}"


def markdown_formatter(path: str, v1: str, v2: Optional[str] = None) -> str:
    """Format one row as three ' | ' separated, 40-character wide columns.

    The leading and trailing table markers are added by the caller.

    Args:
        path (str): The path (row label).
        v1 (str): The first value.
        v2 (Optional[str]): The comparison value. Only `None` is rendered
            as an empty column; falsy values such as 0 are shown as-is.

    Returns:
        str: The formatted row.
    """
    second = '' if v2 is None else str(v2)
    return f"{path:40s} | {str(v1):40s} | {second:40s}"


def show_dict_deep(d: dict, path: str = '',
                   auto_output: bool = True,
                   output_formatter: Callable[[str, str, Optional[str]], str] = default_formatter,
                   notebook_output: bool = False, do_show: bool = True) -> str:
    """Shows the key-value pairs of a dict depthwise.

    It will show each specific value in the (potentially) nested dict.
    I.e for top level dict the path will be its key, but for
    dicts within there, the paths will be the keys to get there
    joined by dots. E.g root.key1.key2.

    Keys that only differ by a trailing "1" / "2" are treated as a pair
    and shown on one row under their common prefix.

    Args:
        d (dict): The input (potentially nested) dict.
        path (str, optional): The current path. Defaults to ''.
        auto_output (bool): Whether to automatically determine output.
            This will prefer regular print statements for a terminal and
            markdown for a notebook. If set to `True`, other formatting
            options will be ignored. Defaults to True.
        output_formatter (Callable[[str, str, Optional[str]], str], optional): The output formatter.
            Defaults to default_formatter.
        notebook_output (bool): Whether to use notebook-specific output. Defaults to False.
        do_show (bool): Whether to show the output. Defaults to True.

    Returns:
        str: The formatted text (including the table column markers
            if it was shown as notebook output).
    """
    if auto_output:
        notebook_output = is_notebook()
        output_formatter = markdown_formatter if notebook_output else default_formatter
    paired_keys = {key for key in d if _has_paired_key(d, key)}
    # iterate the dict (not the set) so the order of the pairs is deterministic
    key_pairs = [(key1, _get_other_key(key1)) for key1 in d
                 if key1 in paired_keys and key1 < _get_other_key(key1)]
    total_out = []
    for key, value in d.items():
        if key in paired_keys:
            continue
        if path:
            total_path = f"{path}.{key}"
        elif isinstance(key, Enum):
            total_path = key.name
        else:
            total_path = key
        if isinstance(value, dict):
            # the formatter has already been resolved above, so the nested
            # call must not re-run the automatic detection
            cur_out = show_dict_deep(value, path=total_path, auto_output=False,
                                     output_formatter=output_formatter,
                                     notebook_output=notebook_output, do_show=False)
            total_out.append(cur_out)
            continue
        total_out.append(output_formatter(total_path, value, None))
    # for paired keys
    for key1, key2 in key_pairs:
        common_key = key1[:-1]
        total_path = f"{path}.{common_key}" if path else common_key
        total_out.append(output_formatter(total_path, d[key1], d[key2]))
    all_text = '\n'.join(total_out)
    if do_show:
        if notebook_output:
            # add column markers
            all_text = "| " + all_text.replace("\n", " |\n| ") + " |"
            header = '| Path | Value | [Optional] Comparison |\n| ----- | ----- | ----- |\n'
            display(Markdown(header + all_text))
        else:
            print(all_text)
    return all_text


def _empty_values_recursively(d: dict, cur_depth: int = 0, max_depth: int = 2) -> None:
    """Replace the values of a dict in place with "empty" placeholders.

    Nested dicts are descended into until `max_depth` is reached; beyond
    that they are replaced with an empty dict. Strings become '',
    numbers become 0 and values of any other type become ''.

    Args:
        d (dict): The dict to modify in place.
        cur_depth (int): The current depth. Defaults to 0.
        max_depth (int): The depth up to which nested dicts are kept. Defaults to 2.
    """
    # only values are replaced, but iterate over a copy of the keys anyway
    for k in list(d):
        v = d[k]
        if isinstance(v, dict):
            if cur_depth < max_depth:
                _empty_values_recursively(v, cur_depth=cur_depth + 1, max_depth=max_depth)
            else:
                d[k] = {}
        elif isinstance(v, str):
            d[k] = ''
        elif isinstance(v, numbers.Number):
            d[k] = 0
        else:
            # unknown
            d[k] = ''


def _get_nulled_copy(d: dict, depth: int = 0) -> dict:
    """Get a deep copy of the dict with all of its values emptied.

    Args:
        d (dict): The dict to copy.
        depth (int): The depth up to which nested dicts are kept. Defaults to 0.

    Returns:
        dict: The copy with placeholder values.
    """
    d2 = deepcopy(d)
    _empty_values_recursively(d2, cur_depth=0, max_depth=depth)
    return d2


def _summarise_dict_values(key: Any, v1: dict, v2: dict) -> Tuple[str, str, str]:
    """Build the row label and the two print values for dict-valued entries."""
    nr_of_keys1, nr_of_keys2 = len(v1), len(v2)
    # the first value of the first dict decides how the values are summarised;
    # an empty dict ends up in the generic (key count only) branch
    v0 = next(iter(v1.values()), v1)
    key_type = type(key).__name__
    if isinstance(v0, int):
        total1 = sum(v1.values())
        total2 = sum(v2.values())
        return (f"{key} (Dict[{key_type}, int])",
                f"{nr_of_keys1} keys (total {total1} in value)",
                f"{nr_of_keys2} keys (total {total2} in value)")
    if isinstance(v0, float):
        mean1 = sum(v1.values()) / nr_of_keys1 if nr_of_keys1 else 0.0
        mean2 = sum(v2.values()) / nr_of_keys2 if nr_of_keys2 else 0.0
        return (f"{key} (Dict[{key_type}, float])",
                f"{nr_of_keys1} keys (mean {mean1} in value)",
                f"{nr_of_keys2} keys (mean {mean2} in value)")
    if isinstance(v0, set):
        # count the elements of the sets themselves (not the (key, value) items)
        total1 = sum(len(v) for v in v1.values())
        total2 = sum(len(v) for v in v2.values())
        mean1 = total1 / nr_of_keys1 if nr_of_keys1 else 0.0
        mean2 = total2 / nr_of_keys2 if nr_of_keys2 else 0.0
        return (f"{key} (Dict[{key_type}, Set])",
                f"{nr_of_keys1} keys (mean {mean1} values per key)",
                f"{nr_of_keys2} keys (mean {mean2} values per key)")
    # just number of items
    return (f"{key} (Dict[{key_type}, {type(v0).__name__}])",
            str(nr_of_keys1), str(nr_of_keys2))


def compare_dicts(d1: Optional[dict], d2: Optional[dict],
                  auto_output: bool = True,
                  output_formatter: Callable[[str, str, Optional[str]], str] = default_formatter,
                  ignore_callables: bool = True,
                  custom_printval_gens: Optional[Dict[str, Callable[[Any], str]]] = None,
                  notebook_output: bool = False):
    """Compares two dicts with identical schemas to one another.

    This will attempt to unravel dict values in the following way
    - The number of keys will be used
    - For the values
      - If the dict maps to integers, the total value is counted (e.g train counts)
      - If the dict maps to floats, the average value is measured (e.g accuracy)
      - If the dict maps to sets the mean number of elements is measured (e.g per-cui forms)

    If one of the two dicts is None, an emptied copy of the other one
    is used in its place.

    Args:
        d1 (Optional[dict]): The first dict.
        d2 (Optional[dict]): The second dict.
        auto_output (bool): Whether to automatically determine output.
            This will prefer regular print statements for a terminal and
            markdown for a notebook. If set to `True`, other formatting
            options will be ignored. Defaults to True.
        output_formatter (Callable[[str, str, Optional[str]], str], optional): The output formatter.
            Defaults to default_formatter.
        ignore_callables (bool): Whether to ignore callable values. Defaults to True.
        custom_printval_gens (Optional[Dict[str, Callable[[Any], str]]]):
            The keys are ones that have a custom print value generator.
            And the values are the corresponding custom print value generators.
            Defaults to None (or an empty dict).
        notebook_output (bool): Whether to use notebook output. Defaults to False.

    Raises:
        ValueError: If both of the dicts are None.
        AssertionError: If the keys of the two dicts differ; or if value types mismatch.
    """
    if auto_output:
        notebook_output = is_notebook()
        output_formatter = markdown_formatter if notebook_output else default_formatter
    if d1 is None and d2 is None:
        raise ValueError("At least one of the two dicts needs to be non-None")
    if d1 is None:
        d1 = _get_nulled_copy(d2)  # type: ignore
    elif d2 is None:
        d2 = _get_nulled_copy(d1)
    # these are now both non-None
    assert d1.keys() == d2.keys()  # type: ignore
    all_out = []
    for key in d1:
        v1 = d1[key]
        v2 = d2[key]  # type: ignore
        if custom_printval_gens and key in custom_printval_gens:
            printval1 = custom_printval_gens[key](v1)
            printval2 = custom_printval_gens[key](v2)
        elif callable(v1):
            if ignore_callables:
                continue
            printval1 = str(v1)
            printval2 = str(v2)
        elif isinstance(v1, dict):
            assert isinstance(v2, dict)
            key, printval1, printval2 = _summarise_dict_values(key, v1, v2)
        else:
            printval1 = str(v1)
            printval2 = str(v2)
        all_out.append(output_formatter(key, printval1, printval2))
    all_text = "\n".join(all_out)
    if notebook_output:
        # add column markers
        all_text = "| " + all_text.replace("\n", " |\n| ") + " |"
        header = '| Path | First | Second |\n| ----- | ----- | ----- |\n'
        display(Markdown(header + all_text))
    else:
        print(all_text)


def parse_and_show(cdb_diff: "CDBCompareResults", tally1: "ResultsTally", tally2: "ResultsTally",
                   ann_diffs: "PerAnnotationDifferences",
                   output_formatter: Callable[[str, str, Optional[str]], str] = default_formatter,
                   notebook_output: bool = False):
    """Show the CDB, tally and per-annotation differences one after another.

    NOTE: `show_dict_deep` and `compare_dicts` are called with their default
          `auto_output=True`, so they pick the formatter and output type
          themselves; `output_formatter` and `notebook_output` therefore only
          control how the section headers below are shown.

    Args:
        cdb_diff (CDBCompareResults): The CDB comparison results.
        tally1 (ResultsTally): The tally for the first model.
        tally2 (ResultsTally): The tally for the second model.
        ann_diffs (PerAnnotationDifferences): The per-annotation differences.
        output_formatter (Callable[[str, str, Optional[str]], str], optional): The output formatter.
            Defaults to default_formatter.
        notebook_output (bool): Whether to use notebook output. Defaults to False.
    """
    def show_header(markdown_text: str, plain_text: str) -> None:
        if notebook_output:
            display(Markdown(markdown_text))
        else:
            print(plain_text)

    show_header("# CDB overall differences", "CDB overall differences:")
    show_dict_deep(cdb_diff.dict(), output_formatter=output_formatter, notebook_output=notebook_output)
    show_header("# Now tally differences", "Now tally differences")
    gens = {"cat_data": str}
    compare_dicts(tally1.dict(), tally2.dict(), output_formatter=output_formatter, custom_printval_gens=gens,
                  notebook_output=notebook_output)
    show_header("# Now per-annotation differences:", "Now per-annotation differences:")
    show_dict_deep(ann_diffs.totals, output_formatter=output_formatter, notebook_output=notebook_output)
file mode 100644 index 0000000..9c55620 --- /dev/null +++ b/medcat/compare_models/tests/resources/docs/not_real.csv @@ -0,0 +1,3 @@ +"id","text" +"-1","Not real text. Just Virus and Virus Z" +"-2","Really not real Virus text" \ No newline at end of file diff --git a/medcat/compare_models/tests/resources/mct_export/medcat_trainer_expoert2.json b/medcat/compare_models/tests/resources/mct_export/medcat_trainer_expoert2.json new file mode 100644 index 0000000..b56a849 --- /dev/null +++ b/medcat/compare_models/tests/resources/mct_export/medcat_trainer_expoert2.json @@ -0,0 +1,21 @@ +{"projects": + [ + { + "name": "SAMPLE FAKE PROJECT", + "id": -2, + "cuis": "", + "tuis": "", + "documents": [ + { + "id": -2, + "name": "FAKE-TEXT", + "text": "FAKE TEXT WITH fake concepts, i.e Virus Z, and Virus.", + "annotations": [ + {"id": -3, "user": "fake", "cui": "C0000139", "value": "gastroesophageal reflux", "start": 34, "end": 41, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2024-04-16 11:54:00.00000+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []}, + {"id": -4, "user": "fake", "cui": "C0000039", "value": "hypertension", "start": 47, "end": 52, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2020-04-01 22:06:30.394941+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []} + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/medcat/compare_models/tests/resources/mct_export/medcat_trainer_export.json b/medcat/compare_models/tests/resources/mct_export/medcat_trainer_export.json new file mode 100644 index 0000000..a25b865 --- /dev/null +++ b/medcat/compare_models/tests/resources/mct_export/medcat_trainer_export.json @@ -0,0 +1,21 @@ +{"projects": + [ + { + "name": "SAMPLE FAKE PROJECT", + "id": -1, + "cuis": "", + "tuis": "", + "documents": [ + { + "id": -1, + "name": "FAKE-TEXT", + "text": "FAKE TEXT WITH fake concepts, 
# ---- medcat/compare_models/tests/test_compare.py ----


class FakeCDBWithPt2Ch:
    """Minimal CDB stand-in that only carries the parent-to-children map."""

    def __init__(self, pt2ch: dict) -> None:
        self.pt2ch = pt2ch
        self.addl_info = {"pt2ch": self.pt2ch}


class FakeCATWithCDBAndPt2Ch:
    """Minimal CAT stand-in whose `cdb` is a `FakeCDBWithPt2Ch`."""

    def __init__(self, pt2ch: dict) -> None:
        self.cdb = FakeCDBWithPt2Ch(pt2ch)


_PT2CH = {
    "C1": ["C11", "C12", "C13"],
    "C2": ["C21"],
    # grandchildren
    "C11": ["C111", "C112", "C113"],
    "C13": ["C131", "C132"],
    # great grandchildren
    "C132": ["C1321", "C1322"],
}


class AddAllChildrenTests(unittest.TestCase):
    """Checks how many generations of children `_add_all_children` adds."""
    pt2ch = _PT2CH
    fake_cat = FakeCATWithCDBAndPt2Ch(pt2ch)

    _cui_filter = {'C1', 'C2'}
    children_1st_order = set(ch for cui in _cui_filter for ch in _PT2CH.get(cui, []))
    children_2nd_order = set(gch for ch in children_1st_order for gch in _PT2CH.get(ch, []))

    @property
    def cui_filter(self) -> set:
        # a fresh copy on every access since the tests modify it in place
        return set(self._cui_filter)

    def test_adds_no_children_with_0(self):
        f = self.cui_filter  # copy
        _add_all_children(self.fake_cat, f, include_children=0)
        self.assertEqual(f, self.cui_filter)

    def test_add_first_children_with_1(self):
        f = self.cui_filter
        _add_all_children(self.fake_cat, f, include_children=1)
        self.assertGreater(f, self.cui_filter)
        self.assertEqual(f, self.cui_filter | self.children_1st_order)
        # no grandchildren
        self.assertFalse(f & self.children_2nd_order)

    def test_add_grandchildren_with_2(self):
        f = self.cui_filter
        _add_all_children(self.fake_cat, f, include_children=2)
        self.assertGreater(f, self.cui_filter)
        self.assertGreater(f, self.cui_filter | self.children_1st_order)
        self.assertEqual(f, self.cui_filter | self.children_1st_order | self.children_2nd_order)


class TrainAndCompareTests(unittest.TestCase):
    """Runs `get_diffs_for` with (mocked) supervised training and checks the result types."""
    _file_dir = os.path.dirname(__file__)
    _resources_path = os.path.join(_file_dir, "resources")
    cat_path = os.path.join(_resources_path, "model_pack")
    mct_export_path_1 = os.path.join(_resources_path, "mct_export", "medcat_trainer_export.json")
    # NOTE: the second resource added alongside this test is named
    #       'medcat_trainer_expoert2.json', which this pattern does not match
    mct_export_path_glob = os.path.join(_resources_path, "mct_export", "medcat_trainer_export*.json")
    docs_file = os.path.join(_resources_path, "docs", "not_real.csv")

    # this tests that the training is called
    @classmethod
    @unittest.mock.patch("medcat.cat.CAT.train_supervised_from_json")
    def _get_diffs(cls, mct_export_path: str, method):
        diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file,
                              supervised_train_comparison_model=True)
        # there is no TestCase instance in a classmethod, so the instance
        # assert methods cannot be used here - raise explicitly instead
        if not method.called:
            raise AssertionError("Expected supervised training to have been called")
        return diffs

    @classmethod
    def setUpClass(cls) -> None:
        ann_diffs1 = cls._get_diffs(cls.mct_export_path_1)
        cls.cdb_comp1, cls.tally1_1, cls.tally1_2, cls.ann_diffs1 = ann_diffs1
        ann_diffs_many = cls._get_diffs(cls.mct_export_path_glob)
        cls.cdb_comp_many, cls.tally_many_1, cls.tally_many_2, cls.ann_diffs_many = ann_diffs_many

    def test_compares_with_one_file(self):
        self.assertIsInstance(self.cdb_comp1, CDBCompareResults)
        self.assertIsInstance(self.tally1_1, ResultsTally)
        self.assertIsInstance(self.tally1_2, ResultsTally)
        self.assertIsInstance(self.ann_diffs1, PerAnnotationDifferences)

    def test_compares_with_multiple_file(self):
        self.assertIsInstance(self.cdb_comp_many, CDBCompareResults)
        self.assertIsInstance(self.tally_many_1, ResultsTally)
        self.assertIsInstance(self.tally_many_2, ResultsTally)
        self.assertIsInstance(self.ann_diffs_many, PerAnnotationDifferences)


# ---- medcat/compare_models/tests/test_compare_annotations.py ----


# helper class for substituting @classmethod and @property
# this is needed because this functionality is deprecated
# in python3.11 and will be removed in 3.13
class classproperty:

    def __init__(self, func):
        self.fget = func

    def __get__(self, instance, owner):
        # always evaluate against the owning class, even on instance access
        return self.fget(owner)


class ResultsTallyTests(unittest.TestCase):
    """Checks CUI filtering on a `ResultsTally` filled with two fake documents."""
    common = {"type_ids": ["T1"], "detected_name": "NOT IMPORTANT", 'acc': 1.0}
    cui2name = {"C1": "Concept 1",
                "C2": "Concept 2"}
    entities = [
        {"entities":
            {"0": {"start": 10, "end": 15, "cui": "C1"},
             "1": {"start": 20, "end": 35, "cui": "C2"}}},
        {"entities":
            {"0": {"start": 5, "end": 15, "cui": "C2"},
             "1": {"start": 25, "end": 30, "cui": "C1"}}}
    ]

    @classmethod
    def setUpClass(cls) -> None:
        # add the shared fields to every entity
        for doc in cls.entities:
            for ent in doc['entities'].values():
                ent.update(cls.common)

    def _cui2name(self, cui: str) -> str:
        return self.cui2name[cui]

    def setUp(self) -> None:
        self.res = compare_annotations.ResultsTally(cat_data={"stats": "don't matter"},
                                                    cui2name=self._cui2name)
        for entities in self.entities:
            self.res.count(entities['entities'])

    def test_filter_works(self, cuis=None):
        # a fresh set per call instead of a shared mutable default argument
        if cuis is None:
            cuis = {"C1"}
        self.res.filter_cuis(cuis)
        for cui in cuis:
            with self.subTest(cui):
                self.assertIn(cui, self.res.per_cui_count)
        for cui in self.cui2name:
            if cui in cuis:
                continue
            with self.subTest(cui):
                per_cui = self.res.get_for_cui(cui)
                per_cui_values = set(per_cui.values())
                self.assertEqual(len(per_cui_values), len(self.cui2name) - 1)
                self.assertEqual(per_cui_values, {"N/A"})
        self.assertEqual(set(self.res.per_cui_count), cuis)


class EntityOverlapIdenticalTests(unittest.TestCase):

    def test_identical_overlap(self, start=10, end=15):
        self.assertTrue(compare_annotations._check_overlap_internal(start, end, start, end))


class EntityOverlapFarAwayTests(unittest.TestCase):
    start1 = 10
    end1 = 15
    one = (start1, end1)
    start2 = 20
    end2 = 25
    two = (start2, end2)

    def test_no_overlap_12(self):
        self.assertFalse(compare_annotations._check_overlap_internal(*self.one, *self.two))

    def test_no_overlap_21(self):
        self.assertFalse(compare_annotations._check_overlap_internal(*self.two, *self.one))
+ self.assertTrue(compare_annotations._check_overlap_internal(*self.one, *self.two)) + + def test_21(self): + self.assertTrue(compare_annotations._check_overlap_internal(*self.two, *self.one)) + + +class IdenticalStartOverlapTests(unittest.TestCase): + start = 10 + end1 = 20 + end2 = 25 + one = (start, end1) + two = (start, end2) + + def test_12(self): + self.assertTrue(compare_annotations._check_overlap_internal(*self.one, *self.two)) + + def test_21(self): + self.assertTrue(compare_annotations._check_overlap_internal(*self.two, *self.one)) + + +class IdenticalEndOverlapTests(unittest.TestCase): + start1 = 10 + start2 = 5 + end = 20 + one = (start1, end) + two = (start2, end) + + def test_12(self): + self.assertTrue(compare_annotations._check_overlap_internal(*self.one, *self.two)) + + def test_21(self): + self.assertTrue(compare_annotations._check_overlap_internal(*self.two, *self.one)) + + +# START the annotation comparison + + +def _find_cuis(d: dict, target: str = "cui") -> set: + results = set() + for k, v in d.items(): + if k == target: + results.add(v) + elif isinstance(v, dict): + results.update(_find_cuis(v, target)) # recurse with the caller's target key, not the default + return results + + +def _get_cuis(cls, start_char: str = "d") -> set: + attr_names = [attr for attr in dir(cls) if attr.startswith(start_char)] + all_cuis = set() + for attr in attr_names: + dict_value = getattr(cls, attr) + if not isinstance(dict_value, dict): + # NOTE: most of those will be methods + # in the base unittest.TestCase class, + # e.g doClassCleanups + continue + # find recursively all "cui" values in dict + all_cuis.update(_find_cuis(dict_value)) + return all_cuis + + +class NoOverlapFarAwaySameCUITests(unittest.TestCase): + FIRST = compare_annotations.AnnotationComparisonType.FIRST_HAS + SECOND = compare_annotations.AnnotationComparisonType.SECOND_HAS + d1 = {"start": 10, "end": 15, "cui": 'C1'} + d2 = {"start": 20, "end": 25, "cui": 'C1'} + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def setUp(self) -> None: 
+ self.c12 = compare_annotations.AnnotationComparisonType.determine(self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.c21 = compare_annotations.AnnotationComparisonType.determine(self.d2, self.d1, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + + def test_1st_has_12(self): + self.assertIs(self.c12, self.FIRST) + + def test_2nd_has_21(self): + self.assertIs(self.c21, self.SECOND) + + +class NoOverlapFarAwayDiffCUITests(NoOverlapFarAwaySameCUITests): + + @classmethod + def setUpClass(cls) -> None: + cls.d1 = dict(cls.d1, cui='C2') + cls.d2 = dict(cls.d2, cui='C3') + + +class PartialOverlapSameCUITests(unittest.TestCase): + DIFF_CONCEPT = compare_annotations.AnnotationComparisonType.PARTIAL_OVERLAP_DIFF_CONCEPT + SAME_CONCEPT = compare_annotations.AnnotationComparisonType.PARTIAL_OVERLAP_SAME_CONCEPT + d1 = {"start": 10, "end": 20, "cui": 'C1'} + d2 = {"start": 15, "end": 25, "cui": 'C1'} + expect_identical_cui = True + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def setUp(self) -> None: + self.c12 = compare_annotations.AnnotationComparisonType.determine(self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.c21 = compare_annotations.AnnotationComparisonType.determine(self.d2, self.d1, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.expected = self.SAME_CONCEPT if self.expect_identical_cui else self.DIFF_CONCEPT + + def test_partial_12(self): + self.assertIs(self.c12, self.expected) + + def test_partial_21(self): + self.assertIs(self.c21, self.expected) + + +class PartialOverlapDiffCUITests(PartialOverlapSameCUITests): + + @classmethod + def setUpClass(cls) -> None: + cls.d1 = dict(cls.d1, cui='C2') + cls.d2 = dict(cls.d2, cui='C3') + cls.expect_identical_cui = False + + +class IdenticalOverlapSameCUITests(unittest.TestCase): + SAME = 
compare_annotations.AnnotationComparisonType.IDENTICAL + DIFF = compare_annotations.AnnotationComparisonType.SAME_SPAN_DIFF_CONCEPT + d1 = {"start": 10, "end": 20, "cui": 'C1'} + d2 = {"start": 10, "end": 20, "cui": 'C1'} + expect_identical_cui = True + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def setUp(self) -> None: + self.c12 = compare_annotations.AnnotationComparisonType.determine(self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.c21 = compare_annotations.AnnotationComparisonType.determine(self.d2, self.d1, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.expected = self.SAME if self.expect_identical_cui else self.DIFF + + def test_identical_12(self): + self.assertIs(self.c12, self.expected) + + def test_identical_21(self): + self.assertIs(self.c21, self.expected) + + +class IdenticalOverlapDiffCUITests(IdenticalOverlapSameCUITests): + + @classmethod + def setUpClass(cls) -> None: + cls.d1 = dict(cls.d1, cui='C2') + cls.d2 = dict(cls.d2, cui='C3') + cls.expect_identical_cui = False + + +class OverLapOneLargerSameConceptTests(unittest.TestCase): + L1_DC = compare_annotations.AnnotationComparisonType.OVERLAPP_1ST_LARGER_DIFF_CONCEPT + L2_DC = compare_annotations.AnnotationComparisonType.OVERLAPP_2ND_LARGER_DIFF_CONCEPT + L1_SC = compare_annotations.AnnotationComparisonType.OVERLAPP_1ST_LARGER_SAME_CONCEPT + L2_SC = compare_annotations.AnnotationComparisonType.OVERLAPP_2ND_LARGER_SAME_CONCEPT + d1 = {"start": 10, "end": 25, "cui": 'C1'} + d2 = {"start": 10, "end": 20, "cui": 'C1'} + expect_identical_cui = True + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def setUp(self) -> None: + self.c12 = compare_annotations.AnnotationComparisonType.determine(self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.c21 = compare_annotations.AnnotationComparisonType.determine(self.d2, self.d1, 
+ pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.expected_12 = self.L1_SC if self.expect_identical_cui else self.L1_DC + self.expected_21 = self.L2_SC if self.expect_identical_cui else self.L2_DC + + def test_12(self): + self.assertIs(self.c12, self.expected_12) # assertTrue(x, y) only used y as the failure message + + def test_21(self): + self.assertIs(self.c21, self.expected_21) # compare the reversed-order result, not c12 again + + +class OverLapOneLargerDiffConceptTests(OverLapOneLargerSameConceptTests): + + @classmethod + def setUpClass(cls) -> None: + cls.d1 = dict(cls.d1, cui='C2') + cls.d2 = dict(cls.d2, cui='C3') + cls.expect_identical_cui = False + + +# per document tests + + +class PerDocAnnotationSameTests(unittest.TestCase): + entities = {"0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'}} + d = {"entities": entities} + d1 = d + d2 = d + expected = compare_annotations.AnnotationComparisonType.IDENTICAL + TEXT = "Some TEXT" + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def setUp(self) -> None: + self.pdad = compare_annotations.PerDocAnnotationDifferences.get("doc0", self.TEXT, self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + + def test_all_same(self): + pdad = self.pdad + self.assertEqual(len(pdad.nr_of_comparisons), 1) + for el in compare_annotations.AnnotationComparisonType: + with self.subTest(f"{el}"): + if el is self.expected: + self.assertIn(self.expected, pdad.nr_of_comparisons) + else: + self.assertNotIn(el, pdad.nr_of_comparisons) + + def test_def_keeps_raw(self): + pdad = self.pdad + self.assertNotEqual(pdad.raw_text, '') + + def test_can_omit_raw(self): + pdad = compare_annotations.PerDocAnnotationDifferences.get("doc0", self.TEXT, self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis, + keep_raw=False) + self.assertEqual(pdad.raw_text, '') + + +class PerDocAnnotationSameSpanDiffCUITests(PerDocAnnotationSameTests): + + @classmethod + def 
setUpClass(cls) -> None: + super().setUpClass() + cls.entities2 = {k0: {k1: v1 if k1 != "cui" else f"{v1}0" for k1, v1 in v0.items()} + for k0, v0 in cls.entities.items()} + cls.d2 = {"entities": cls.entities2} + cls.expected = compare_annotations.AnnotationComparisonType.SAME_SPAN_DIFF_CONCEPT + + +class PerDocAnnotationLargerSpan2ndSameCUITests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.entities2 = {k0: {k1: v1 if k1 != "end" else v1 + 10 for k1, v1 in v0.items()} + for k0, v0 in cls.entities.items()} + cls.d2 = {"entities": cls.entities2} + cls.expected = compare_annotations.AnnotationComparisonType.OVERLAPP_2ND_LARGER_SAME_CONCEPT + + +class PerDocAnnotationLargerSpan2ndDiffCUITests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.entities2 = {k0: {k1: v1 if k1 != "end" else v1 + 10 for k1, v1 in v0.items()} + for k0, v0 in cls.entities.items()} + # change cuis + for ent in cls.entities2.values(): + ent['cui'] += "C" + cls.d2 = {"entities": cls.entities2} + cls.expected = compare_annotations.AnnotationComparisonType.OVERLAPP_2ND_LARGER_DIFF_CONCEPT + + +class PerDocAnnotationLargerSpan1stSameCUITests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.entities2 = {k0: {k1: v1 if k1 != "end" else v1 - 2 for k1, v1 in v0.items()} + for k0, v0 in cls.entities.items()} + cls.d2 = {"entities": cls.entities2} + cls.expected = compare_annotations.AnnotationComparisonType.OVERLAPP_1ST_LARGER_SAME_CONCEPT + + +class PerDocAnnotationLargerSpan1stDiffCUITests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.entities2 = {k0: {k1: v1 if k1 != "end" else v1 - 2 for k1, v1 in v0.items()} + for k0, v0 in cls.entities.items()} + # change cuis + for ent in cls.entities2.values(): + ent['cui'] += "C" + cls.d2 = {"entities": cls.entities2} + cls.expected = 
compare_annotations.AnnotationComparisonType.OVERLAPP_1ST_LARGER_DIFF_CONCEPT + + +class PerDocAnnotatingUneventLengthsAll1stTests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + # empty d2 + cls.d2 = {"entities": {}} + cls.expected = compare_annotations.AnnotationComparisonType.FIRST_HAS + + +class PerDocAnnotatingUneventLengthsAll2ndTests(PerDocAnnotationSameTests): + + @classmethod + def setUpClass(cls) -> None: + # empty d2 + cls.d1, cls.d2 = {"entities": {}}, cls.d1 + cls.expected = compare_annotations.AnnotationComparisonType.SECOND_HAS + + +class PerDocAnnotatingUnevenLengthsComplicatedTests(unittest.TestCase): + entities1 = {"0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'}} + d1 = {"entities": entities1} + entities2 = dict(entities1, + **{"2": {"start": 50, "end": 60, "cui": 'C3'}, + "3": {"start": 65, "end": 70, "cui": 'C2'}}) + d2 = {"entities": entities2} + expected12 = {compare_annotations.AnnotationComparisonType.IDENTICAL: 2, + compare_annotations.AnnotationComparisonType.SECOND_HAS: 2} + expected21 = {compare_annotations.AnnotationComparisonType.IDENTICAL: 2, + compare_annotations.AnnotationComparisonType.FIRST_HAS: 2} + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls) + + def test_has_expected_comparison_12(self): + pdad = compare_annotations.PerDocAnnotationDifferences.get("doc0", "", self.d1, self.d2, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.assertEqual(pdad.nr_of_comparisons, self.expected12) + + def test_has_expected_comparison_21(self): + pdad = compare_annotations.PerDocAnnotationDifferences.get("doc0", "", self.d2, self.d1, + pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + self.assertEqual(pdad.nr_of_comparisons, self.expected21) + + +# now for PerAnnotationDifferences + + +class PerAnnotationSameDifferencesIdenticalTests(unittest.TestCase): + annotations = [ + # doc1 + {"entities": { 
+ "0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'} + }}, + # doc2 + {"entities": { + "0": {"start": 12, "end": 22, "cui": 'C1'}, + "1": {"start": 42, "end": 52, "cui": 'C2'} + }}, + ] + expected_totals = {compare_annotations.AnnotationComparisonType.IDENTICAL: 4} + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls, start_char="annotations") + + def setUp(self): + self.pad = compare_annotations.PerAnnotationDifferences(pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + for nr, ann in enumerate(self.annotations): + self.pad.look_at_doc(ann, ann, f"{nr}", "") + self.pad.finalise() + + def test_identical(self): + self.assertEqual(self.pad.totals, self.expected_totals) + + +class PerAnnotationSomeDifferencesIdenticalTests(unittest.TestCase): + annotations1 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'} + }}, + # doc2 + {"entities": { + "0": {"start": 12, "end": 22, "cui": 'C1'}, + "1": {"start": 42, "end": 52, "cui": 'C2'} + }}, + ] + annotations2 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'} + }}, + # doc2 + {"entities": { + "0": {"start": 80, "end": 88, "cui": 'C3'}, + }}, + ] + expected_totals = {compare_annotations.AnnotationComparisonType.IDENTICAL: 2, + compare_annotations.AnnotationComparisonType.FIRST_HAS: 2, + compare_annotations.AnnotationComparisonType.SECOND_HAS: 1, + } + expected_pair_order = [ + ("0", compare_annotations.AnnotationPair(one=annotations1[0]['entities']['0'], + two=annotations2[0]['entities']['0'], + comparison_type=compare_annotations.AnnotationComparisonType.IDENTICAL)), + ("0", compare_annotations.AnnotationPair(one=annotations1[0]['entities']['1'], + two=annotations2[0]['entities']['1'], + comparison_type=compare_annotations.AnnotationComparisonType.IDENTICAL)), + ("1", 
compare_annotations.AnnotationPair(one=annotations1[1]['entities']['0'], + two=None, + comparison_type=compare_annotations.AnnotationComparisonType.FIRST_HAS)), + ("1", compare_annotations.AnnotationPair(one=annotations1[1]['entities']['1'], + two=None, + comparison_type=compare_annotations.AnnotationComparisonType.FIRST_HAS)), + ("1", compare_annotations.AnnotationPair(one=None, + two=annotations2[1]['entities']['0'], + comparison_type=compare_annotations.AnnotationComparisonType.SECOND_HAS)), + ] + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls, start_char="annotations") + + def setUp(self): + self.pad = compare_annotations.PerAnnotationDifferences(pt2ch1=None, pt2ch2=None, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + for nr, (ann1, ann2) in enumerate(zip(self.annotations1, self.annotations2)): + self.pad.look_at_doc(ann1, ann2, f"{nr}", "") + self.pad.finalise() + + def test_identical(self): + self.assertEqual(self.pad.totals, self.expected_totals) + + def assertCorrectPairs(self, list_of_pairs: list): + self.assertEqual(len(list_of_pairs), len(self.expected_pair_order)) + for nr, (pair, expected_pair) in enumerate(zip(list_of_pairs, self.expected_pair_order)): + with self.subTest(f"{nr}"): + self.assertEqual(pair, expected_pair) + + def test_iteration(self): + list_of_pairs = list(self.pad.iter_ann_pairs(omit_identical=False)) + self.assertCorrectPairs(list_of_pairs) + + def test_iteration_omit_identical(self): + list_of_pairs = list(self.pad.iter_ann_pairs(omit_identical=True)) + # check with pop + for expected in self.expected_pair_order: + if expected[1].comparison_type == compare_annotations.AnnotationComparisonType.IDENTICAL: + continue + with self.subTest(f"{expected}"): + self.assertEqual(list_of_pairs.pop(0), expected) + + def test_iteration_filter_all(self, doc_list = ['0', '1']): + list_of_pairs = list(self.pad.iter_ann_pairs(docs=doc_list, omit_identical=False)) + self.assertCorrectPairs(list_of_pairs) + + def 
test_iteration_filter_none(self, docs=[]): + list_of_pairs = list(self.pad.iter_ann_pairs(docs=docs, omit_identical=False)) + self.assertEqual(list_of_pairs, docs) + + def test_iteration_filter_some(self, doc='1'): + list_of_pairs = list(self.pad.iter_ann_pairs(docs=[doc], omit_identical=False)) + # check has only this document + doc_numbers = set([pair[0] for pair in list_of_pairs]) + self.assertEqual(doc_numbers, {doc}) + # check that is has an entry for each annotation in doc + expected_annotations = [ann for ann in self.expected_pair_order if ann[0] == doc] + self.assertEqual(len(expected_annotations), len(list_of_pairs)) + # check has correct pairs + # pop the first off list every time + for doc_name, expected_pair in self.expected_pair_order: + if doc_name != doc: + continue + with self.subTest(f"{doc_name}: {expected_pair}"): + self.assertEqual(list_of_pairs.pop(0), (doc_name, expected_pair)) + + +class FindsParentsTest(unittest.TestCase): + pt2ch = { + # children + 'c1': ['c10', 'c11'], + 'c2': ['c20', 'c21', 'c22'], + 'c3': ['c30'], + # grandchildren + 'c10': ['c100', 'c101'], + 'c30': ['c300'], + # great grandchildren + 'c300': ['c3001', 'c3002', 'c3003'] + } + annotations = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'c1'} + }}, + ] + annotations_child = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'c10'}, + }}, + ] + annotations_grandchild = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'c101'}, + }}, + ] + annotations_ggc1 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'c3'}, + }}, + ] + annotations_ggc2 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'c3003'}, + }}, + ] + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls, start_char="annotations") + + def _set_up_for(self, anns1: list, anns2: list + ) -> compare_annotations.PerAnnotationDifferences: + pad = compare_annotations.PerAnnotationDifferences(pt2ch1=self.pt2ch, + 
pt2ch2=self.pt2ch, + model1_cuis=self.cuis, + model2_cuis=self.cuis) + for nr, (ann1, ann2) in enumerate(zip(anns1, anns2)): + pad.look_at_doc(ann1, ann2, f"{nr}", "") + pad.finalise() + return pad + + def setUp(self): + # reg->child + self.reg_child = self._set_up_for(self.annotations, self.annotations_child) + # reg->grandchild + self.reg_grandchild = self._set_up_for(self.annotations, self.annotations_grandchild) + # child->grandchild + self.child_grandchild = self._set_up_for(self.annotations_child, self.annotations_grandchild) + # the opposite direction + # child->reg + self.child_reg = self._set_up_for(self.annotations_child, self.annotations) + # grandchild->reg + self.grandchild_reg = self._set_up_for(self.annotations_grandchild, self.annotations) + # grandchild->child + self.grandchild_child = self._set_up_for(self.annotations_grandchild, self.annotations_child) + # great-granchild->reg + self.ggc_reg = self._set_up_for(self.annotations_ggc1, self.annotations_ggc2) + + def assertCorrectRecognition(self, pad: compare_annotations.PerAnnotationDifferences, + exp_type: compare_annotations.AnnotationComparisonType): + self.assertEqual(len(pad.totals), 1) + self.assertIn(exp_type, pad.totals) + + def test_child_recognised(self): + self.assertCorrectRecognition(self.reg_child, + compare_annotations.AnnotationComparisonType.SAME_PARENT) + + def test_child_recognised_reverse(self): + self.assertCorrectRecognition(self.child_reg, + compare_annotations.AnnotationComparisonType.SAME_PARENT) + + def test_grandchild_recognised_from_child(self): + self.assertCorrectRecognition(self.child_grandchild, + compare_annotations.AnnotationComparisonType.SAME_PARENT) + + def test_grandchild_recognised_from_child_reverse(self): + self.assertCorrectRecognition(self.grandchild_child, + compare_annotations.AnnotationComparisonType.SAME_PARENT) + + def test_grandchild_recognised(self): + self.assertCorrectRecognition(self.reg_grandchild, + 
compare_annotations.AnnotationComparisonType.SAME_GRANDPARENT) + + def test_grandchild_recognised_reverse(self): + self.assertCorrectRecognition(self.grandchild_reg, + compare_annotations.AnnotationComparisonType.SAME_GRANDPARENT) + + def test_great_grandchildren_not_recognised(self): + self.assertCorrectRecognition(self.ggc_reg, + compare_annotations.AnnotationComparisonType.SAME_SPAN_CONCEPT_NOT_IN_2ND) + + +class PerAnnotationCSVTests(unittest.TestCase): + docs = [ + # doc1 10 ... 25 + "Some doc. C1 C1 and some text " + #40 ... 55 + "C2 C2 and some more", + # doc2 12 ... 22 + "Some docum C1 C1 and some text and " + # 40 ... 52 + " C2 C2 and some more " + #80 ...88 + "C3 C3 anf final" + ] + annotations1 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'} + }}, + # doc2 + {"entities": { + "0": {"start": 12, "end": 22, "cui": 'C1'}, + "1": {"start": 42, "end": 52, "cui": 'C2'} + }}, + ] + annotations2 = [ + # doc1 + {"entities": { + "0": {"start": 10, "end": 25, "cui": 'C1'}, + "1": {"start": 40, "end": 55, "cui": 'C2'} + }}, + # doc2 + {"entities": { + "0": {"start": 80, "end": 88, "cui": 'C3'}, + }}, + ] + temp_folder = tempfile.TemporaryDirectory() + file_name = 'temp_out.csv' + file = os.path.join(temp_folder.name, file_name) + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls, start_char="annotations") + + @classmethod + def setUpClass(cls) -> None: + cls.pad = compare_annotations.PerAnnotationDifferences(pt2ch1=None, + pt2ch2=None, + model1_cuis=cls.cuis, + model2_cuis=cls.cuis) + for doc_nr, (doc, ents1, ents2) in enumerate(zip(cls.docs, cls.annotations1, cls.annotations2)): + cls.pad.look_at_doc(ents1, ents2, f"doc_{doc_nr}", doc) + cls.pad.finalise() + + def setUp(self) -> None: + self.pad.to_csv(self.file) + + def tearDown(self) -> None: + os.remove(self.file) + + @classmethod + def tearDownClass(cls) -> None: + cls.temp_folder.cleanup() + + def test_creates_csv(self): + 
self.assertTrue(os.path.exists(self.file)) + + def test_file_can_be_read(self): + df = pd.read_csv(self.file) + self.assertIsInstance(df, pd.DataFrame) + + def test_file_has_columns(self, + columns = ["doc_id", "text", "ann1", "ann2"]): + df = pd.read_csv(self.file) + self.assertEqual(len(columns), len(df.columns)) + for col in columns: + with self.subTest(f"Column: {col}"): + self.assertIn(col, df.columns) + + def test_file_has_annotations(self, exp_total = 5): + df = pd.read_csv(self.file) + self.assertEqual(len(df.index), exp_total) + + def assert_can_recreate_dicts(self, df: pd.DataFrame, column: str): + with self.subTest(f"Col: {column}"): + series = df[column] + for _, val in series.items(): + if not val or val != val: + # ingore NaN / None + continue + d = eval(val) + self.assertIsInstance(d, dict) + + def test_can_recreate_dicts(self): + df = pd.read_csv(self.file) + self.assert_can_recreate_dicts(df, "ann1") + self.assert_can_recreate_dicts(df, "ann2") + + def assert_annotations_remain_same(self, df: pd.DataFrame, column: str, + expected = list): + expected = [value for part in expected for value in part["entities"].values()] + series = df[column] + anns = [v for _, v in series[series.notnull()].items() if v == v] + anns = [eval(v) for v in anns] + # remove raw starts (additions) + # NOTE: this only works so far since the span is greater + # than the document length. 
+ # Otherwise, I'd need to read the `-raw` parts and + # write them to the correspoding spot + for v in anns: + del v['start-raw'] + del v['end-raw'] + self.assertEqual(len(anns), len(expected)) + for nr, (got, expect) in enumerate(zip(anns, expected)): + with self.subTest(f"Nr: {nr}"): + self.assertEqual(got, expect) + + def test_annotations_remain_same(self): + df = pd.read_csv(self.file) + self.assert_annotations_remain_same(df, 'ann1', self.annotations1) + self.assert_annotations_remain_same(df, 'ann2', self.annotations2) + + +class DocumentIterationTests(unittest.TestCase): + docs = PerAnnotationCSVTests.docs + annotations1 = PerAnnotationCSVTests.annotations1 + annotations2 = PerAnnotationCSVTests.annotations2 + expected_all = { + compare_annotations.AnnotationComparisonType.IDENTICAL: 2, + compare_annotations.AnnotationComparisonType.FIRST_HAS: 2, + compare_annotations.AnnotationComparisonType.SECOND_HAS: 1 + } + + @classproperty + def cuis(cls) -> set: + return _get_cuis(cls, start_char="annotations") + + @classmethod + def setUpClass(cls) -> None: + cls.pad = compare_annotations.PerAnnotationDifferences(pt2ch1=None, + pt2ch2=None, + model1_cuis=cls.cuis, + model2_cuis=cls.cuis) + for doc_nr, (doc, ents1, ents2) in enumerate(zip(cls.docs, cls.annotations1, cls.annotations2)): + cls.pad.look_at_doc(ents1, ents2, f"doc_{doc_nr}", doc) + cls.pad.finalise() + + def assert_filters1(self, comp_type: compare_annotations.AnnotationComparisonType): + docs = list( + self.pad.iter_document_annotations(types_filter={comp_type}) + ) + self.assertEqual(len(docs), self.expected_all[comp_type]) + + def assert_filters_many(self, *comp_types: compare_annotations.AnnotationComparisonType): + docs = list( + self.pad.iter_document_annotations(types_filter=comp_types) + ) + expected_sum = sum(self.expected_all[comp_type] for comp_type in comp_types) + self.assertEqual(len(docs), expected_sum) + + def test_all_has_all(self): + self.assert_filters_many(*tuple(self.expected_all)) + 
+ def test_filters_identical(self): + self.assert_filters1(compare_annotations.AnnotationComparisonType.IDENTICAL) + + def test_filters_first_has(self): + self.assert_filters1(compare_annotations.AnnotationComparisonType.FIRST_HAS) + + def test_filters_second_has(self): + self.assert_filters1(compare_annotations.AnnotationComparisonType.SECOND_HAS) + + def test_filters_problematic(self): + self.assert_filters_many(compare_annotations.AnnotationComparisonType.FIRST_HAS, + compare_annotations.AnnotationComparisonType.SECOND_HAS) diff --git a/medcat/compare_models/tests/test_compare_cdb.py b/medcat/compare_models/tests/test_compare_cdb.py new file mode 100644 index 0000000..e2e2e44 --- /dev/null +++ b/medcat/compare_models/tests/test_compare_cdb.py @@ -0,0 +1,70 @@ +import compare_cdb + +import unittest +EXAMPLE1 = { + "C0": {"n01", "n02", "n03"}, # 1 non-unique (#2 CS) + "C1": {"n11", "n12" }, + + "C3": {"n31", "n33"}, # adds 1 CUI, 2 names + + "C5": { "n53"}, # adds 1 CUI, 1 name + } +EXAMPLE2 = { + "C0": {"n01", "n02", "n03"}, # 1 non-unique (CS) + "C1": {"n11", "n12", "n13"}, # adds 1 name + "C2": {"n21", "n23"}, # adds 1 CUI, 2 names + + "C4": {"n41", "n42", "n43"}, # adds 1 CUI, 3 names; 1 non-unique (CS) + + "CS": {"n01", "n42", }, # adds 1 CUI, no names + } +# this should be equivalent to the above +EXPECTED_VALUES_MAN = compare_cdb.DictCompareValues(total1=8, + total2=13, + not_in_1=8, # n13, n21, n23, n41, n42, n43, "n01", "n42" + not_in_2=3, # n31, n33, n53 + joint=5, # n01, n02, n03, n11, n12 + unique_in_1=3, # overall unique in 1st + unique_in_2=6, # overall unique in 2nd + ) + +keys1 = set(EXAMPLE1.keys()) +keys2 = set(EXAMPLE2.keys()) +EXPECTED_KEYS = compare_cdb.DictCompareKeys(total1=len(keys1), + total2=len(keys2), + joint=len(keys1 & keys2), + not_in_1=(len(keys1 | keys2)) - len(keys1), + not_in_2=(len(keys1 | keys2)) - len(keys2),) +# this should be equivalent to the above +EXPECTED_KEYS_MAN = compare_cdb.DictCompareKeys(total1=4, # C0, C1, C3, C5 
+ total2=5, # C0, C1, C2, C4, CS + joint=2, # C0, C1 + not_in_1=3, # C2, C4, CS + not_in_2=2, # C3, C5 + ) +vals1 = set(e for v in EXAMPLE1.values() for e in v) +total1 = sum(len(v) for v in EXAMPLE1.values()) +vals2 = set(e for v in EXAMPLE2.values() for e in v) +total2 = sum(len(v) for v in EXAMPLE2.values()) +EXPECTED_VALUES = compare_cdb.DictCompareValues(total1=total1, + total2=total2, + not_in_1=8, # the new/misplaced CUIs in 2nd + not_in_2=3, # the new/misplaced CUIs in 1st + joint=len(vals1 & vals2), + unique_in_1=3, # overall unique in 1st + unique_in_2=6, # overall unique in 2nd + ) + + +class CompareDictTests(unittest.TestCase): + + def test_compare_keys_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_KEYS, exp_man=EXPECTED_KEYS_MAN): + res = compare_cdb.DictCompareKeys.get(d1, d2) + self.assertEqual(res.dict(), exp.dict()) + self.assertEqual(res.dict(), exp_man.dict()) + + def test_compare_values_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_VALUES, exp_man=EXPECTED_VALUES_MAN): + res = compare_cdb.DictCompareValues.get(d1, d2, progress=False) + self.assertEqual(res.dict(), exp.dict()) + self.assertEqual(res.dict(), exp_man.dict()) + diff --git a/medcat/compare_models/tests/test_output.py b/medcat/compare_models/tests/test_output.py new file mode 100644 index 0000000..31a9c48 --- /dev/null +++ b/medcat/compare_models/tests/test_output.py @@ -0,0 +1,77 @@ +import output + +import contextlib +import io +import sys + +import unittest + + +@contextlib.contextmanager +def nostdout(): + """Silence stdout for the duration of the with-body.""" + # redirect_stdout puts the previous sys.stdout back even when the body raises + with contextlib.redirect_stdout(io.StringIO()): + yield + + +class CompareDictTests(unittest.TestCase): + example_dict = {"k1": "v1", + "k2": "v2", + "k3": {"sk1": 1.0}} + example_dict2 = {'pretty_name': 'Genus Quercus', + 'cui': '53347009', + 'type_ids': ['81102976'], + 'types': [''], + 'source_value': 'Oak', + 'detected_name': 'oak', + 'acc': 0.6368384509248382, + 'context_similarity': 0.6368384509248382, + 'start': 43, + 'end': 46, + 
def _is_mct_export(file_path: str) -> bool:
    """Check whether ``file_path`` refers to at least one MCT export.

    A path containing ``*`` is treated as a glob pattern and is accepted
    when it matches at least one existing path. Any other path has to exist
    and end with ``.json``.

    Args:
        file_path: A concrete path or a glob pattern.

    Returns:
        True if the path (or pattern) points at a usable export.
    """
    if "*" in file_path:
        # Stop at the first match instead of materialising every hit.
        return any(True for _ in glob.iglob(file_path))
    return os.path.exists(file_path) and file_path.endswith(".json")


def validate_input(model_path1: str, model_path2: str, documents_file: str,
                   cui_filter: Optional[Union[Set[str], str]],
                   supevised_train_comp: bool) -> None:
    """Validate the paths passed to a model comparison.

    Args:
        model_path1: Path of the first model; must exist and pass
            ``is_medcat_model``.
        model_path2: Path of the second model or, when
            ``supevised_train_comp`` is set, an MCT export (a ``.json`` file
            or a ``*`` glob pattern matching at least one path).
        documents_file: Path of the documents file; must exist and have a
            ``.csv`` extension (case-insensitive).
        cui_filter: Optional CUI filter. When given as a string it is
            treated as a file path that must exist; other values are not
            checked here.
        supevised_train_comp: Whether ``model_path2`` is an MCT export
            rather than a model.

    Raises:
        ValueError: If any of the checks above fails.
    """
    if not os.path.exists(model_path1):
        raise ValueError(f"No model found at specified path (1st model): {model_path1}")
    if not is_medcat_model(model_path1):
        raise ValueError(f"Not a medcat model: {model_path1}")
    if not os.path.exists(model_path2):
        # A glob pattern does not exist as a path itself, so a missing
        # path is tolerated only when it resolves to an MCT export.
        if supevised_train_comp and not _is_mct_export(model_path2):
            raise ValueError(f"No matching MCT export found for: {model_path2}")
        elif not supevised_train_comp:
            raise ValueError(f"No file found at specified path (2nd model): {model_path2}")
    if supevised_train_comp:
        if not os.path.isfile(model_path2) and not _is_mct_export(model_path2):
            raise ValueError(f"MCT export provided should be a file not a folder: {model_path2}")
        if not model_path2.lower().endswith(".json"):
            raise ValueError(f"MCT export expected in .json format, Got: {model_path2}")
    elif not is_medcat_model(model_path2):
        raise ValueError(f"Not a medcat model: {model_path2}")
    if cui_filter is not None:
        if isinstance(cui_filter, str):
            if not os.path.exists(cui_filter):
                raise ValueError(f"File passed as CUI filter does not exist: {cui_filter}")
    if not os.path.exists(documents_file):
        raise ValueError(f"No documents file found: {documents_file}")
    if not documents_file.lower().endswith(".csv"):
        raise ValueError(f"Expected a .csv file for documnets, got: {documents_file}")


def _is_medcat_model_folder(model_folder: str) -> bool:
    """Return True if ``model_folder`` holds both ``cdb.dat`` and ``vocab.dat`` files."""
    # needs to have CDB and vocab
    cdb_path = os.path.join(model_folder, 'cdb.dat')
    vocab_path = os.path.join(model_folder, "vocab.dat")
    # ``isfile`` is already False for a missing path, so no ``exists`` check.
    return os.path.isfile(cdb_path) and os.path.isfile(vocab_path)


def is_medcat_model(model_path: str) -> bool:
    """Check whether ``model_path`` looks like a MedCAT model.

    A directory is accepted when it contains the CDB and vocab files. Any
    other path has to end with ``.zip``; if a folder named like the archive
    without its extension exists next to it, that folder is checked instead.

    Args:
        model_path: Path to a model folder or a ``.zip`` model pack.

    Returns:
        True if the path passes the checks described above.
    """
    if os.path.isdir(model_path):
        return _is_medcat_model_folder(model_path)
    if not model_path.endswith(".zip"):
        # Only strip the suffix from real ``.zip`` paths; chopping four
        # characters off an arbitrary path could land on an unrelated folder.
        return False
    model_folder = model_path[:-len(".zip")]
    if os.path.exists(model_folder):
        # NOTE: if the model folder doesn't exist, it will
        #       be extracted upon loading the model
        return _is_medcat_model_folder(model_folder)
    # NOTE: this does not actually guarantee that it's a model pack
    #       but it would be outside the scope of this method
    #       to try and extract or list the contents
    return True