Skip to content

Commit

Permalink
Add support for disease queries, remove phenotype factories.
Browse files Browse the repository at this point in the history
  • Loading branch information
ielis committed Mar 26, 2024
1 parent f0046b9 commit 73b785d
Show file tree
Hide file tree
Showing 13 changed files with 541 additions and 475 deletions.
108 changes: 45 additions & 63 deletions src/genophenocorr/analysis/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import pandas as pd

from genophenocorr.model import VariantEffect, Patient, FeatureType
from .predicate import PatientCategory
from .predicate import PolyPredicate, PatientCategory
from .predicate.phenotype import P

PatientsByHPO = namedtuple('PatientsByHPO', field_names=['all_with_hpo', 'all_without_hpo'])

Expand All @@ -16,18 +17,20 @@ class GenotypePhenotypeAnalysisResult:
`GenotypePhenotypeAnalysisResult` summarizes results of genotype-phenotype correlation analysis of a cohort.
"""

def __init__(self, n_usable: pd.Series,
all_counts: pd.DataFrame,
pvals: pd.Series,
corrected_pvals: typing.Optional[pd.Series],
phenotype_categories: typing.Iterable[PatientCategory],
question: str):
def __init__(
self, n_usable: pd.Series,
all_counts: typing.Mapping[P, pd.DataFrame],
pvals: pd.Series,
corrected_pvals: typing.Optional[pd.Series],
phenotype_categories: typing.Iterable[PatientCategory],
geno_predicate: PolyPredicate,
):
self._n_usable = n_usable
self._all_counts = all_counts
self._pvals = pvals
self._corrected_pvals = corrected_pvals
self._phenotype_categories = tuple(phenotype_categories)
self._question = question
self._geno_predicate = geno_predicate

@property
def n_usable(self) -> pd.Series:
Expand All @@ -38,23 +41,22 @@ def n_usable(self) -> pd.Series:
return self._n_usable

@property
def all_counts(self) -> pd.DataFrame:
def all_counts(self) -> typing.Mapping[P, pd.DataFrame]:
"""
Get a :class:`pandas.DataFrame` with counts of patients in genotype and phenotype groups.
Get a mapping from the phenotype item to :class:`pandas.DataFrame` with counts of patients
in genotype and phenotype groups.
An example for a genotype predicate that bins into two categories (`Yes` and `No`) based on presence
of a missense variant in transcript, and phenotype predicate that checks presence/absence of a phenotype term::
Has MISSENSE_VARIANT in NM_123456.7
No Yes
HPO Present
HP:0001166 Yes 1 13
HP:0001166 No 7 5
HP:0003011 Yes 3 11
HP:0003011 No 7 16
HP:0012638 Yes 9 15
HP:0012638 No 3 4
of a missense variant in transcript `NM_123456.7`, and phenotype predicate that checks
presence/absence of `HP:0001166` (a phenotype term)::
Has MISSENSE_VARIANT in NM_123456.7
No Yes
Present
Yes 1 13
No 7 5
The rows correspond to the phenotype categories, and the columns represent the genotype categories.
"""
return self._all_counts

Expand All @@ -79,70 +81,50 @@ def phenotype_categories(self) -> typing.Sequence[PatientCategory]:
"""
return self._phenotype_categories

def summarize(self, hpo: hpotk.MinimalOntology,
phenotype_category: PatientCategory) -> pd.DataFrame:
def summarize(
self, hpo: hpotk.MinimalOntology,
category: PatientCategory,
) -> pd.DataFrame:
"""
Create a data frame with summary of the genotype phenotype analysis.
The *rows* of the frame correspond to the analyzed HPO terms.
The columns of the data frame have `Count` and `Percentage` per used genotype predicate.
For instance, if we used :class:`genophenocorr.analysis.predicate.genotype.VariantEffectPredicate`
which compares phenotype with and without variants with certain :class:`genophenocorr.model.VariantEffect`,
we will have the following columns:
===== ======= ===== =======
Yes No
-------------- --------------
Count Percent Count Percent
===== ======= ===== =======
**Example**
The final data frame may look something like this::
If we use :class:`genophenocorr.analysis.predicate.genotype.VariantEffectPredicate`
which can compare phenotype with and without a missense variant, we will have a data frame
that looks like this::
MISSENSE_VARIANT on `NM_1234.5` No Yes
Count Percent Count Percent p value Corrected p value
Arachnodactyly [HP:0001166] 1/14 7.14% 13/14 92.86% 0.000781 0.020299
Abnormality of the musculature [HP:0003011] 6/17 35.29% 11/17 64.71% 1.000000 1.000000
Abnormal nervous system physiology [HP:0012638] 9/24 37.50% 15/24 62.50% 1.000000 1.000000
Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.000781 0.020299
Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000
Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000
... ... ... ... ... ... ...
"""
if phenotype_category not in self._phenotype_categories:
raise ValueError(f'Unknown phenotype category: {phenotype_category}. Use one of {self._phenotype_categories}')
if category not in self._phenotype_categories:
raise ValueError(f'Unknown phenotype category: {category}. Use one of {self._phenotype_categories}')

# Row index: a list of tested HPO terms
pheno_idx = pd.Index(self._n_usable.index)
# Column index: multiindex of counts and percentages for all genotype predicate groups
geno_idx = pd.MultiIndex.from_product(
iterables=(self._all_counts.columns, ('Count', 'Percent')),
names=(self._question, None))
iterables=(self._geno_predicate.get_categories(), ('Count', 'Percent')),
names=(self._geno_predicate.get_question(), None),
)

# We'll fill this frame with data
df = pd.DataFrame(index=pheno_idx, columns=geno_idx)

# We must choose the counts with the `phenotype_category` of interest.
# This depends on the used phenotype predicate.
# Usually, we are interested patients who HAVE a feature,
# but we can also investigate those who do NOT have the feature.
# Hence, we need `phenotype_category`.
counts = self._all_counts.loc[(slice(None), phenotype_category), :]
counts.set_index(pheno_idx, inplace=True)

# We must find the total count of all samples in each genotype category
# stack turns all indexes into columns. swaplevel switches the pheno_category with geno_category.
# unstack puts pheno_category as the column and geno_category as the multi-index.
# sum(axis=1) adds across the geno_category, giving us the total patients in that category, removing pheno_category.
# unstack puts the geno_category back as the column name. reindex_like(counts) formats it like counts, making the
# following for loop work.
totals = self._all_counts.stack().swaplevel().unstack().sum(axis=1).unstack().reindex_like(counts)

# Fill the frame cells
for col in geno_idx.levels[0]:
cnt = counts[col]
tot = totals[col]
# Format `Count` as `N/M` string, where `N` is the sample count for the genotype category
# within that phenotype category and `M` is the total count of all samples in that genotype
df[col, 'Count'] = cnt.map(str) + '/' + tot.map(str)
df[col, 'Percent'] = (cnt * 100 / tot).round(decimals=2).map(str) + '%'
for pf, count in self._all_counts.items():
total = count.sum() # Sum across the phenotype categories (collapse the rows).
for gt_cat in count.columns:
cnt = count.loc[category, gt_cat]
df.loc[pf, (gt_cat, 'Count')] = f'{cnt}/{total[gt_cat]}'
df.loc[pf, (gt_cat, 'Percent')] = f'{round(cnt * 100 / total[gt_cat])}%'

# Add columns with p values and corrected p values (if present)
df.insert(df.shape[1], ('', self._pvals.name), self._pvals)
Expand Down
8 changes: 1 addition & 7 deletions src/genophenocorr/analysis/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from ._filter import SimplePhenotypeFilter
from ._gp_analysis import FisherExactAnalyzer
from ._gp_impl import GpCohortAnalysis
from .predicate.phenotype import PropagatingPhenotypeBooleanPredicateFactory

P_VAL_OPTIONS = (
'bonferroni', 'b',
Expand Down Expand Up @@ -196,14 +195,8 @@ def configure_cohort_analysis(cohort: Cohort,
config.min_perc_patients_w_hpo,
)

# Configure the nuts and bolts of the G/P analysis.
phenotype_predicate_factory = PropagatingPhenotypeBooleanPredicateFactory(
hpo=hpo,
missing_implies_excluded=config.missing_implies_excluded,
)
# Choosing a simple Fisher's exact test for now.
gp_analyzer = FisherExactAnalyzer(
pheno_predicate_factory=phenotype_predicate_factory,
p_val_correction=config.pval_correction,
mtc_alpha=config.mtc_alpha,
)
Expand All @@ -214,6 +207,7 @@ def configure_cohort_analysis(cohort: Cohort,
protein_service=protein_metadata_service,
phenotype_filter=phenotype_filter,
gp_analyzer=gp_analyzer,
missing_implies_excluded=config.missing_implies_excluded,
include_sv=config.include_sv,
)

Expand Down
121 changes: 65 additions & 56 deletions src/genophenocorr/analysis/_gp_analysis.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,31 @@
import abc
import typing

import hpotk

import pandas as pd

from statsmodels.stats import multitest

from genophenocorr.model import Patient

from .predicate import PolyPredicate
from .predicate.phenotype import PhenotypePredicateFactory
from .predicate import PolyPredicate, PatientCategory
from .predicate.phenotype import PhenotypePolyPredicate, P

from ._api import GenotypePhenotypeAnalysisResult
from ._stats import run_fisher_exact, run_recessive_fisher_exact


class GPAnalyzer(metaclass=abc.ABCMeta):
class GPAnalyzer(typing.Generic[P], metaclass=abc.ABCMeta):
"""
`GPAnalyzer` calculates p values for genotype-phenotype correlation of phenotypic features of interest.
"""

def __init__(self,
pheno_predicate_factory: PhenotypePredicateFactory,
):
self._pheno_predicate_factory = hpotk.util.validate_instance(
pheno_predicate_factory, PhenotypePredicateFactory, 'pheno_predicate_factory')

@abc.abstractmethod
def analyze(self,
patients: typing.Iterable[Patient],
phenotypic_features: typing.Iterable[hpotk.TermId],
predicate: PolyPredicate,
) -> GenotypePhenotypeAnalysisResult:
def analyze(
self,
patients: typing.Iterable[Patient],
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
gt_predicate: PolyPredicate,
) -> GenotypePhenotypeAnalysisResult:
"""
Test for association between `phenotypic_features` of interest groups or `patients` determined
by the `predicate`.
Expand All @@ -42,75 +35,91 @@ def analyze(self,
pass

@staticmethod
def _count_patients(patients: typing.Iterable[Patient],
phenotypes_of_interest: typing.Iterable[hpotk.TermId],
pheno_predicate_factory: PhenotypePredicateFactory,
geno_predicate: PolyPredicate) -> typing.Tuple[pd.Series, pd.DataFrame]:
row_idx = pd.MultiIndex.from_product(
iterables=(phenotypes_of_interest, pheno_predicate_factory.get_categories()),
names=('phenotypic_feature', 'category')
)
col_idx = pd.Index(geno_predicate.get_categories(), name=geno_predicate.get_question())
counts = pd.DataFrame(data=0, index=row_idx, columns=col_idx)
n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes_of_interest))
def _count_patients(
patients: typing.Iterable[Patient],
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
gt_predicate: PolyPredicate,
) -> typing.Tuple[typing.Iterable[PatientCategory], pd.Series, typing.Mapping[P, pd.DataFrame]]:
phenotypes = set()
categories = set()
for predicate in pheno_predicates:
categories.update(predicate.get_categories())
phenotypes.update(predicate.phenotypes)

n_usable_patients = pd.Series(data=0, index=pd.Index(phenotypes))

# Apply genotype and phenotype predicates
for pf in phenotypes_of_interest:
pheno_predicate = pheno_predicate_factory.get_predicate(pf)
counts = {}
for ph_predicate in pheno_predicates:
phenotypes = ph_predicate.phenotypes

for phenotype in phenotypes:
if phenotype not in counts:
# Make an empty frame for keeping track of the counts.
counts[phenotype] = pd.DataFrame(
data=0,
index=pd.Index(
data=ph_predicate.get_categories(),
name=ph_predicate.get_question(),
),
columns=pd.Index(
data=gt_predicate.get_categories(),
name=gt_predicate.get_question(),
),
)

for patient in patients:
pheno_cat = pheno_predicate.test(patient)
geno_cat = geno_predicate.test(patient)
pheno_cat = ph_predicate.test(patient)
geno_cat = gt_predicate.test(patient)

if pheno_cat is not None and geno_cat is not None:
counts.loc[(pf, pheno_cat), geno_cat] += 1
n_usable_patients[pf] += 1
counts[pheno_cat.phenotype].loc[pheno_cat.category, geno_cat.category] += 1
n_usable_patients[pheno_cat.phenotype] += 1

return n_usable_patients, counts
return categories, n_usable_patients, counts


class FisherExactAnalyzer(GPAnalyzer):
class FisherExactAnalyzer(typing.Generic[P], GPAnalyzer[P]):
"""
`FisherExactAnalyzer` uses Fisher's exact test to calculate p value for phenotypic features of interest.
Following the test, the code applies one of the multiple testing corrections provided by
:func:`statsmodels.stats.multitest.multipletests` function at given `mtc_alpha`.
:param p_val_correction: a `str` with name of the multiple testing correction method or `None` if no correction should be applied.
:param p_val_correction: a `str` with name of the multiple testing correction method or `None` if no correction
should be applied.
:param mtc_alpha: a `float` in range :math:`(0, 1]` with the multiple testing correction alpha value.
"""

def __init__(self,
pheno_predicate_factory: PhenotypePredicateFactory,
p_val_correction: typing.Optional[str] = None,
mtc_alpha: float = .05,
):
super().__init__(pheno_predicate_factory)
def __init__(
self,
p_val_correction: typing.Optional[str] = None,
mtc_alpha: float = .05,
):
self._correction = p_val_correction
self._mtc_alpha = mtc_alpha

def analyze(
self,
patients: typing.Iterable[Patient],
phenotypic_features: typing.Iterable[hpotk.TermId],
predicate: PolyPredicate,
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
gt_predicate: PolyPredicate,
) -> GenotypePhenotypeAnalysisResult:
# 1) Count the patients
n_usable, all_counts = GPAnalyzer._count_patients(
patients, phenotypic_features,
self._pheno_predicate_factory, predicate,
categories, n_usable, all_counts = GPAnalyzer._count_patients(
patients, pheno_predicates, gt_predicate,
)

# 2) Statistical tests
pvals_idx = pd.Index(phenotypic_features, name='p_val')
pvals = pd.Series(float('nan'), index=pvals_idx, name='p value')
for pf in phenotypic_features:
counts = all_counts.loc[pf]
pheno_idx = pd.Index(n_usable.index, name='p_val')
pvals = pd.Series(float('nan'), index=pheno_idx, name='p value')
for phenotype in pheno_idx:
counts = all_counts[phenotype]
# TODO - this is where we must fail unless we have the contingency table of the right size!
if counts.shape == (2, 2):
pvals[pf] = run_fisher_exact(counts)
pvals[phenotype] = run_fisher_exact(counts)
elif counts.shape == (3, 2):
pvals[pf] = run_recessive_fisher_exact(counts)
pvals[phenotype] = run_recessive_fisher_exact(counts)
else:
raise ValueError(
"Invalid number of categories. "
Expand All @@ -120,7 +129,7 @@ def analyze(
# 3) Multiple correction
if self._correction is not None:
_, pvals_corrected, _, _ = multitest.multipletests(pvals, alpha=self._mtc_alpha, method=self._correction)
corrected_idx = pd.Index(phenotypic_features, name='p_val_corrected')
corrected_idx = pd.Index(n_usable.index, name='p_val_corrected')
corrected_pvals_series = pd.Series(data=pvals_corrected, index=corrected_idx,
name='Corrected p value')
else:
Expand All @@ -132,6 +141,6 @@ def analyze(
all_counts=all_counts,
pvals=pvals,
corrected_pvals=corrected_pvals_series,
phenotype_categories=self._pheno_predicate_factory.get_categories(),
question=predicate.get_question(),
phenotype_categories=categories,
geno_predicate=gt_predicate,
)
Loading

0 comments on commit 73b785d

Please sign in to comment.