Add support for disease queries, remove phenotype factories.

P2GX · Mar 26, 2024 · 73b785d · 73b785d
1 parent f0046b9
commit 73b785d
Show file tree

Hide file tree

Showing 13 changed files with 541 additions and 475 deletions.
diff --git a/src/genophenocorr/analysis/_api.py b/src/genophenocorr/analysis/_api.py
@@ -6,7 +6,8 @@
 import pandas as pd
 
 from genophenocorr.model import VariantEffect, Patient, FeatureType
-from .predicate import PatientCategory
+from .predicate import PolyPredicate, PatientCategory
+from .predicate.phenotype import P
 
 PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo'])
 
@@ -16,18 +17,20 @@ class GenotypePhenotypeAnalysisResult:
     `GenotypePhenotypeAnalysisResult` summarizes results of genotype-phenotype correlation analysis of a cohort.
     """
 
-    def __init__(self, n_usable: pd.Series,
-                 all_counts: pd.DataFrame,
-                 pvals: pd.Series,
-                 corrected_pvals: typing.Optional[pd.Series],
-                 phenotype_categories: typing.Iterable[PatientCategory],
-                 question: str):
+    def __init__(
+            self, n_usable: pd.Series,
+            all_counts: typing.Mapping[P, pd.DataFrame],
+            pvals: pd.Series,
+            corrected_pvals: typing.Optional[pd.Series],
+            phenotype_categories: typing.Iterable[PatientCategory],
+            geno_predicate: PolyPredicate,
+    ):
         self._n_usable = n_usable
         self._all_counts = all_counts
         self._pvals = pvals
         self._corrected_pvals = corrected_pvals
         self._phenotype_categories = tuple(phenotype_categories)
-        self._question = question
+        self._geno_predicate = geno_predicate
 
     @property
     def n_usable(self) -> pd.Series:
@@ -38,23 +41,22 @@ def n_usable(self) -> pd.Series:
         return self._n_usable
 
     @property
-    def all_counts(self) -> pd.DataFrame:
+    def all_counts(self) -> typing.Mapping[P, pd.DataFrame]:
         """
-        Get a :class:`pandas.DataFrame` with counts of patients in genotype and phenotype groups.
+        Get a mapping from the phenotype item to :class:`pandas.DataFrame` with counts of patients
+        in genotype and phenotype groups.
 
         An example for a genotype predicate that bins into two categories (`Yes` and `No`) based on presence
-        of a missense variant in transcript, and phenotype predicate that checks presence/absence of a phenotype term::
-
-                                   Has MISSENSE_VARIANT in NM_123456.7
-                                   No       Yes
-            HPO         Present
-            HP:0001166  Yes        1        13
-            HP:0001166  No         7        5
-            HP:0003011  Yes        3        11
-            HP:0003011  No         7        16
-            HP:0012638  Yes        9        15
-            HP:0012638  No         3        4
+        of a missense variant in transcript `NM_123456.7`, and phenotype predicate that checks
+        presence/absence of `HP:0001166` (a phenotype term)::
 
+                       Has MISSENSE_VARIANT in NM_123456.7
+                       No       Yes
+            Present
+            Yes        1        13
+            No         7        5
+
+        The rows correspond to the phenotype categories, and the columns represent the genotype categories.
         """
         return self._all_counts
 
@@ -79,70 +81,50 @@ def phenotype_categories(self) -> typing.Sequence[PatientCategory]:
         """
         return self._phenotype_categories
 
-    def summarize(self, hpo: hpotk.MinimalOntology,
-                  phenotype_category: PatientCategory) -> pd.DataFrame:
+    def summarize(
+            self, hpo: hpotk.MinimalOntology,
+            category: PatientCategory,
+    ) -> pd.DataFrame:
         """
         Create a data frame with summary of the genotype phenotype analysis.
 
         The *rows* of the frame correspond to the analyzed HPO terms.
 
         The columns of the data frame have `Count` and `Percentage` per used genotype predicate.
-        For instance, if we used :class:`genophenocorr.analysis.predicate.genotype.VariantEffectPredicate`
-        which compares phenotype with and without variants with certain :class:`genophenocorr.model.VariantEffect`,
-        we will have the following columns:
 
-        =====  =======  =====  =======
-        Yes             No
-        --------------  --------------
-        Count  Percent  Count  Percent
-        =====  =======  =====  =======
+        **Example**
 
-        The final data frame may look something like this::
+        If we use :class:`genophenocorr.analysis.predicate.genotype.VariantEffectPredicate`
+        which can compare phenotype with and without a missense variant, we will have a data frame
+        that looks like this::
 
             MISSENSE_VARIANT on `NM_1234.5`                       No                  Yes
                                                                   Count    Percent    Count    Percent   p value   Corrected p value
-            Arachnodactyly [HP:0001166]                           1/14       7.14%    13/14     92.86%   0.000781  0.020299
-            Abnormality of the musculature [HP:0003011]           6/17      35.29%    11/17     64.71%   1.000000  1.000000
-            Abnormal nervous system physiology [HP:0012638]       9/24      37.50%    15/24     62.50%   1.000000  1.000000
+            Arachnodactyly [HP:0001166]                           1/10         10%    13/16        81%   0.000781  0.020299
+            Abnormality of the musculature [HP:0003011]           6/6         100%    11/11       100%   1.000000  1.000000
+            Abnormal nervous system physiology [HP:0012638]       9/9         100%    15/15       100%   1.000000  1.000000
             ...                                                   ...      ...        ...      ...       ...       ...
         """
-        if phenotype_category not in self._phenotype_categories:
-            raise ValueError(f'Unknown phenotype category: {phenotype_category}. Use one of {self._phenotype_categories}')
+        if category not in self._phenotype_categories:
+            raise ValueError(f'Unknown phenotype category: {category}. Use one of {self._phenotype_categories}')
 
         # Row index: a list of tested HPO terms
         pheno_idx = pd.Index(self._n_usable.index)
         # Column index: multiindex of counts and percentages for all genotype predicate groups
         geno_idx = pd.MultiIndex.from_product(
-            iterables=(self._all_counts.columns, ('Count', 'Percent')),
-            names=(self._question, None))
+            iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')),
+            names=(self._geno_predicate.get_question(), None),
+        )
 
         # We'll fill this frame with data
         df = pd.DataFrame(index=pheno_idx, columns=geno_idx)
 
-        # We must choose the counts with the `phenotype_category` of interest.
-        # This depends on the used phenotype predicate.
-        # Usually, we are interested patients who HAVE a feature,
-        # but we can also investigate those who do NOT have the feature.
-        # Hence, we need `phenotype_category`.
-        counts = self._all_counts.loc[(slice(None), phenotype_category), :]
-        counts.set_index(pheno_idx, inplace=True)
-
-        # We must find the total count of all samples in each genotype category
-        # stack turns all indexes into columns. swaplevel switches the pheno_category with geno_category.
-        # unstack puts pheno_category as the column and geno_category as the multi-index.
-        # sum(axis=1) adds across the geno_category, giving us the total patients in that category, removing pheno_category.
-        # unstack puts the geno_category back as the column name. reindex_like(counts) formats it like counts, making the 
-        # following for loop work.
-        totals = self._all_counts.stack().swaplevel().unstack().sum(axis=1).unstack().reindex_like(counts)
-
-        # Fill the frame cells
-        for col in geno_idx.levels[0]:
-            cnt = counts[col]
-            tot = totals[col]
-            # Format `Count` as `N/M` string, where `N` is the sample count for the genotype category
-            # within that phenotype category and `M` is the total count of all samples in that genotype
-            df[col, 'Count'] = cnt.map(str) + '/' + tot.map(str)
-            df[col, 'Percent'] = (cnt * 100 / tot).round(decimals=2).map(str) + '%'
+        for pf, count in self._all_counts.items():
+            total = count.sum()  # Sum across the phenotype categories (collapse the rows).
+            for gt_cat in count.columns:
+                cnt = count.loc[category, gt_cat]
+                df.loc[pf, (gt_cat, 'Count')] = f'{cnt}/{total[gt_cat]}'
+                df.loc[pf, (gt_cat, 'Percent')] = f'{round(cnt * 100 / total[gt_cat])}%'
 
         # Add columns with p values and corrected p values (if present)
         df.insert(df.shape[1], ('', self._pvals.name), self._pvals)

diff --git a/src/genophenocorr/analysis/_config.py b/src/genophenocorr/analysis/_config.py
@@ -11,7 +11,6 @@
 from ._filter import SimplePhenotypeFilter
 from ._gp_analysis import FisherExactAnalyzer
 from ._gp_impl import GpCohortAnalysis
-from .predicate.phenotype import PropagatingPhenotypeBooleanPredicateFactory
 
 P_VAL_OPTIONS = (
     'bonferroni', 'b',
@@ -196,14 +195,8 @@ def configure_cohort_analysis(cohort: Cohort,
         config.min_perc_patients_w_hpo,
     )
 
-    # Configure the nuts and bolts of the G/P analysis.
-    phenotype_predicate_factory = PropagatingPhenotypeBooleanPredicateFactory(
-        hpo=hpo,
-        missing_implies_excluded=config.missing_implies_excluded,
-    )
     # Choosing a simple Fisher's exact test for now.
     gp_analyzer = FisherExactAnalyzer(
-        pheno_predicate_factory=phenotype_predicate_factory,
         p_val_correction=config.pval_correction,
         mtc_alpha=config.mtc_alpha,
     )
@@ -214,6 +207,7 @@ def configure_cohort_analysis(cohort: Cohort,
         protein_service=protein_metadata_service,
         phenotype_filter=phenotype_filter,
         gp_analyzer=gp_analyzer,
+        missing_implies_excluded=config.missing_implies_excluded,
         include_sv=config.include_sv,
     )
 

diff --git a/src/genophenocorr/analysis/_gp_analysis.py b/src/genophenocorr/analysis/_gp_analysis.py
@@ -1,38 +1,31 @@
 import abc
 import typing
 
-import hpotk
-
 import pandas as pd
 
 from statsmodels.stats import multitest
 
 from genophenocorr.model import Patient
 
-from .predicate import PolyPredicate
-from .predicate.phenotype import PhenotypePredicateFactory
+from .predicate import PolyPredicate, PatientCategory
+from .predicate.phenotype import PhenotypePolyPredicate, P
 
 from ._api import GenotypePhenotypeAnalysisResult
 from ._stats import run_fisher_exact, run_recessive_fisher_exact
 
 
-class GPAnalyzer(metaclass=abc.ABCMeta):
+class GPAnalyzer(typing.Generic[P], metaclass=abc.ABCMeta):
     """
     `GPAnalyzer` calculates p values for genotype-phenotype correlation of phenotypic features of interest.
     """
 
-    def __init__(self,
-                 pheno_predicate_factory: PhenotypePredicateFactory,
-                 ):
-        self._pheno_predicate_factory = hpotk.util.validate_instance(
-            pheno_predicate_factory, PhenotypePredicateFactory, 'pheno_predicate_factory')
-
     @abc.abstractmethod
-    def analyze(self,
-                patients: typing.Iterable[Patient],
-                phenotypic_features: typing.Iterable[hpotk.TermId],
-                predicate: PolyPredicate,
-                ) -> GenotypePhenotypeAnalysisResult:
+    def analyze(
+            self,
+            patients: typing.Iterable[Patient],
+            pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+            gt_predicate: PolyPredicate,
+    ) -> GenotypePhenotypeAnalysisResult:
         """
         Test for association between `phenotypic_features` of interest groups or `patients` determined
         by the `predicate`.
@@ -42,75 +35,91 @@ def analyze(self,
         pass
 
     @staticmethod
-    def _count_patients(patients: typing.Iterable[Patient],
-                        phenotypes_of_interest: typing.Iterable[hpotk.TermId],
-                        pheno_predicate_factory: PhenotypePredicateFactory,
-                        geno_predicate: PolyPredicate) -> typing.Tuple[pd.Series, pd.DataFrame]:
-        row_idx = pd.MultiIndex.from_product(
-            iterables=(phenotypes_of_interest, pheno_predicate_factory.get_categories()),
-            names=('phenotypic_feature', 'category')
-        )
-        col_idx = pd.Index(geno_predicate.get_categories(), name=geno_predicate.get_question())
-        counts = pd.DataFrame(data=0, index=row_idx, columns=col_idx)
-        n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes_of_interest))
+    def _count_patients(
+            patients: typing.Iterable[Patient],
+            pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+            gt_predicate: PolyPredicate,
+    ) -> typing.Tuple[typing.Iterable[PatientCategory], pd.Series, typing.Mapping[P, pd.DataFrame]]:
+        phenotypes = set()
+        categories = set()
+        for predicate in pheno_predicates:
+            categories.update(predicate.get_categories())
+            phenotypes.update(predicate.phenotypes)
+
+        n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes))
 
         # Apply genotype and phenotype predicates
-        for pf in phenotypes_of_interest:
-            pheno_predicate = pheno_predicate_factory.get_predicate(pf)
+        counts = {}
+        for ph_predicate in pheno_predicates:
+            phenotypes = ph_predicate.phenotypes
+
+            for phenotype in phenotypes:
+                if phenotype not in counts:
+                    # Make an empty frame for keeping track of the counts.
+                    counts[phenotype] = pd.DataFrame(
+                        data=0,
+                        index=pd.Index(
+                            data=ph_predicate.get_categories(),
+                            name=ph_predicate.get_question(),
+                        ),
+                        columns=pd.Index(
+                            data=gt_predicate.get_categories(),
+                            name=gt_predicate.get_question(),
+                        ),
+                    )
 
             for patient in patients:
-                pheno_cat = pheno_predicate.test(patient)
-                geno_cat = geno_predicate.test(patient)
+                pheno_cat = ph_predicate.test(patient)
+                geno_cat = gt_predicate.test(patient)
 
                 if pheno_cat is not None and geno_cat is not None:
-                    counts.loc[(pf, pheno_cat), geno_cat] += 1
-                    n_usable_patients[pf] += 1
+                    counts[pheno_cat.phenotype].loc[pheno_cat.category, geno_cat.category] += 1
+                    n_usable_patients[pheno_cat.phenotype] += 1
 
-        return n_usable_patients, counts
+        return categories, n_usable_patients, counts
 
 
-class FisherExactAnalyzer(GPAnalyzer):
+class FisherExactAnalyzer(typing.Generic[P], GPAnalyzer[P]):
     """
     `FisherExactAnalyzer` uses Fisher's exact test to calculate p value for phenotypic features of interest.
 
     Following the test, the code applies one of the multiple testing corrections provided by
     :func:`statsmodels.stats.multitest.multipletests` function at given `mtc_alpha`.
 
-    :param p_val_correction: a `str` with name of the multiple testing correction method or `None` if no correction should be applied.
+    :param p_val_correction: a `str` with name of the multiple testing correction method or `None` if no correction
+      should be applied.
     :param mtc_alpha: a `float` in range :math:`(0, 1]` with the multiple testing correction alpha value.
     """
 
-    def __init__(self,
-                 pheno_predicate_factory: PhenotypePredicateFactory,
-                 p_val_correction: typing.Optional[str] = None,
-                 mtc_alpha: float = .05,
-                 ):
-        super().__init__(pheno_predicate_factory)
+    def __init__(
+            self,
+            p_val_correction: typing.Optional[str] = None,
+            mtc_alpha: float = .05,
+    ):
         self._correction = p_val_correction
         self._mtc_alpha = mtc_alpha
 
     def analyze(
             self,
             patients: typing.Iterable[Patient],
-            phenotypic_features: typing.Iterable[hpotk.TermId],
-            predicate: PolyPredicate,
+            pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
+            gt_predicate: PolyPredicate,
     ) -> GenotypePhenotypeAnalysisResult:
         # 1) Count the patients
-        n_usable, all_counts = GPAnalyzer._count_patients(
-            patients, phenotypic_features,
-            self._pheno_predicate_factory, predicate,
+        categories, n_usable, all_counts = GPAnalyzer._count_patients(
+            patients, pheno_predicates, gt_predicate,
         )
 
         # 2) Statistical tests
-        pvals_idx = pd.Index(phenotypic_features, name='p_val')
-        pvals = pd.Series(float('nan'), index=pvals_idx, name='p value')
-        for pf in phenotypic_features:
-            counts = all_counts.loc[pf]
+        pheno_idx = pd.Index(n_usable.index, name='p_val')
+        pvals = pd.Series(float('nan'), index=pheno_idx, name='p value')
+        for phenotype in pheno_idx:
+            counts = all_counts[phenotype]
             # TODO - this is where we must fail unless we have the contingency table of the right size!
             if counts.shape == (2, 2):
-                pvals[pf] = run_fisher_exact(counts)
+                pvals[phenotype] = run_fisher_exact(counts)
             elif counts.shape == (3, 2):
-                pvals[pf] = run_recessive_fisher_exact(counts)
+                pvals[phenotype] = run_recessive_fisher_exact(counts)
             else:
                 raise ValueError(
                     "Invalid number of categories. "
@@ -120,7 +129,7 @@ def analyze(
         # 3) Multiple correction
         if self._correction is not None:
             _, pvals_corrected, _, _ = multitest.multipletests(pvals, alpha=self._mtc_alpha, method=self._correction)
-            corrected_idx = pd.Index(phenotypic_features, name='p_val_corrected')
+            corrected_idx = pd.Index(n_usable.index, name='p_val_corrected')
             corrected_pvals_series = pd.Series(data=pvals_corrected, index=corrected_idx,
                                                name='Corrected p value')
         else:
@@ -132,6 +141,6 @@ def analyze(
             all_counts=all_counts,
             pvals=pvals,
             corrected_pvals=corrected_pvals_series,
-            phenotype_categories=self._pheno_predicate_factory.get_categories(),
-            question=predicate.get_question(),
+            phenotype_categories=categories,
+            geno_predicate=gt_predicate,
         )