-
Notifications
You must be signed in to change notification settings - Fork 0
/
motif_enrichment_functions.py
91 lines (74 loc) · 2.92 KB
/
motif_enrichment_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import scipy
import scipy.ndimage
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pybedtools as pbt
from aux_functions import fetch_mean_matched_values
def get_motif_enrichments(df, motif_df, mean_match=True, mod=10):
rs = []
for c, col in enumerate(motif_df):
motif_idx = motif_df[col] > 0
x = df.loc[motif_idx, 'log2FoldChange']
if mean_match:
motif_nonidx = fetch_mean_matched_values(df.loc[motif_idx, 'baseMean'],
df.loc[~motif_idx, 'baseMean']).index
else:
motif_nonidx = motif_df[col] == 0
y = df.loc[motif_nonidx, 'log2FoldChange']
_, p = scipy.stats.ranksums(x, y)
delta = (x.mean() - y.mean())
rs.append([col, p, delta])
if c % mod == 0:
print("Done with", c)
return pd.DataFrame(rs, columns = ['factor', 'p', 'delta'])
from concurrent.futures import ProcessPoolExecutor
# Ensure your fetch_mean_matched_values function is defined here
def compute_enrichment(args):
"""
Compute enrichment for a single motif.
This function is designed to be run in parallel.
"""
df, motif_df_path, col = args
motif_df = pd.read_csv(motif_df_path, index_col = 0)
motif_df = motif_df.loc[df.index]
motif_df = motif_df[~motif_df.index.duplicated()]
motif_idx = motif_df[col] > 0
x = df.loc[motif_idx, 'log2FoldChange']
if motif_idx.mean() > .7:
return [col, np.nan, np.nan]
elif motif_idx.sum() > .3:
motif_nonidx = motif_df[col] == 0
else:
motif_nonidx = fetch_mean_matched_values(df.loc[motif_idx, 'baseMean'], df.loc[~motif_idx, 'baseMean']).index
y = df.loc[motif_nonidx, 'log2FoldChange']
_, p = scipy.stats.ranksums(x, y)
delta = (x.mean() - y.mean())
print("Done with", col)
return col, p, delta
def get_motif_enrichments_parallel(df, motif_df_path, n_cores=20):
motif_df = pd.read_csv(motif_df_path, index_col=0)
# Prepare arguments for parallel computation
tasks = [(df, motif_df_path, col) for col in motif_df.columns]
# Process in parallel
with ProcessPoolExecutor(max_workers=n_cores) as executor:
results = list(executor.map(compute_enrichment, tasks))
# Return results as a DataFrame
return pd.DataFrame(results, columns=['factor', 'p', 'delta'])
def get_motif_enrichments_using_idx(motif_df, idx1, idx2):
rows = []
for col in motif_df:
x = motif_df.loc[idx1, col]
y = motif_df.loc[idx2, col]
_, p = scipy.stats.fisher_exact([[(x>0).sum(), (x==0).sum()], [(y>0).sum(), (y==0).sum()]])
xval = (x.sum())
yval = (y.sum())
if (xval==0):
xval = 1
if (yval==0):
yval = 1
xval = xval / len(x)
yval = yval / len(y)
delta = np.log2(xval/yval)
rows.append([col, p, delta])
return pd.DataFrame(rows, columns=['factor', 'p', 'delta'])