-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
142 lines (113 loc) · 6.82 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
row_colors = ['lightgreen', 'green', 'orange']
columns_to_names = {
0 : 'Constitutive',
4 : 'Dynamic',
18 : 'Repressive',
}
row_colors_dict = {
'Constitutive' : 'lightgreen',
'Dynamic' : 'green',
'Repressive' : 'orange',
}
from aux_functions import bedtool_to_index, add_chr_to_bedtool
import pybedtools as pbt
import pandas as pd
def get_h3k27ac_motif_counts(pco=5e-5):
atac_peak_motif_counts = bedtool_to_index(add_chr_to_bedtool(pbt.BedTool('h3k27ac/all_threshold_27ac.csv',
)))
atac_peak_motif_counts.index = atac_peak_motif_counts.values
meme_motif_df = pd.read_csv('h3k27ac_peaks/h3k27ac_motifs.bed', sep='\t')
motif_id_to_name_dict = dict(zip(meme_motif_df['motif_id'], meme_motif_df['dedup_protein_name']))
meme_motif_df = meme_motif_df[meme_motif_df['pval'] < pco]
meme_motif_df = meme_motif_df.value_counts(['peak', 'dedup_protein_name']).unstack()
meme_motif_df.index = meme_motif_df.index.str.lower().str.replace(":", "_").str.replace("-", "_")
meme_motif_df = meme_motif_df.fillna(0)
vs = []
missing_inds = atac_peak_motif_counts.index[~atac_peak_motif_counts.index.isin(meme_motif_df.index)]
if len(missing_inds) > 0:
for _ in missing_inds:
v = pd.Series(index=meme_motif_df.columns).fillna(0)
vs.append(v)
tmpdf = pd.concat(vs, axis=1).T
tmpdf.index = missing_inds
meme_motif_df = pd.concat([meme_motif_df, tmpdf], axis=0)
meme_motif_df = meme_motif_df.loc[atac_peak_motif_counts.index]
return meme_motif_df, motif_id_to_name_dict
import numpy as np
def load_scrna():
susie_path = './coexpression_analysis/'
# treg_precursor_rna_umi = pd.read_parquet(susie_path + 'GSM3978654_CD4plusCD25plus_dense.umis_per_gene.parquet')
# tregs_rna_umi = pd.read_parquet(susie_path + 'GSM3978655_Foxp3plus_dense.umis_per_gene.parquet')
treg_precursor_rna_corr = pd.read_parquet('./coexpression_analysis/GSM3978654_CD4plusCD25plus_dense.coexpression_mat.parquet')
tregs_rna_corr = pd.read_parquet('./coexpression_analysis/GSM3978655_Foxp3plus_dense.coexpression_mat.parquet')
treg_precursor_rna_corr += np.diag(np.diag(treg_precursor_rna_corr)*np.nan)
tregs_rna_corr += np.diag(np.diag(tregs_rna_corr)*np.nan)
same_genes = treg_precursor_rna_corr.index.intersection(tregs_rna_corr.index)
df1 = pd.read_parquet('./coexpression_analysis/GSM3978654_CD4plusCD25plus_dense.umis_per_gene.parquet')
df2 = pd.read_parquet('./coexpression_analysis/GSM3978655_Foxp3plus_dense.umis_per_gene.parquet')
df1 = df1.loc[same_genes]
df2 = df2.loc[same_genes]
same_genes = same_genes[(df1['Raw_UMI_count'] > 50) & (df2['Raw_UMI_count'] > 50)]
treg_precursor_rna_corr = treg_precursor_rna_corr.loc[same_genes, same_genes]
tregs_rna_corr = tregs_rna_corr.loc[same_genes, same_genes]
return treg_precursor_rna_corr, tregs_rna_corr
cp /Genomics/argo/users/gdolsten/pritlab/yanjin_motifreg_hic/meme/fimo_output/treg_tcon_atac.bed atac_peaks/atac_motifs.bed
from load_DESeq2_data import mod_index
def get_motif_counts(atac_peaks_index, pco=5e-5):
# atac_peak_motif_counts = pd.read_csv('/Genomics/argo/users/gdolsten/pritlab/jupys/tregs/rudensky_scrna/prelim-analysis/call_motifs/output/counts/previous_bulk_atac_peaks_Mus_musculus/previous_bulk_atac_peaks_p=1e-05.csv',
# index_col = 0) > 0
meme_motif_df = pd.read_csv('./atac/atac_motifs.bed', sep='\t')
motif_id_to_name_dict = dict(zip(meme_motif_df['motif_id'], meme_motif_df['dedup_protein_name']))
meme_motif_df = meme_motif_df[meme_motif_df['pval'] < pco]
meme_motif_df = meme_motif_df.value_counts(['peak', 'dedup_protein_name']).unstack()
meme_motif_df.index = meme_motif_df.index.str.lower().str.replace(":", "_").str.replace("-", "_")
meme_motif_df = meme_motif_df.fillna(0)
meme_motif_df = mod_index(meme_motif_df)
vs = []
missing_inds = atac_peaks_index
for i in missing_inds:
v = pd.Series(index=meme_motif_df.columns).fillna(0)
vs.append(v)
tmpdf = pd.concat(vs, axis=1).T
tmpdf.index = missing_inds
meme_motif_df = pd.concat([meme_motif_df, tmpdf], axis=0)
meme_motif_df = meme_motif_df.loc[atac_peaks_index.index]
return meme_motif_df, motif_id_to_name_dict
def make_metadomain_hub_freq_df(self, inter_and_intra_connections_tcon, inter_and_intra_connections_treg):
metadomain_mat_dict = {
'Treg' : (inter_and_intra_connections_tcon == 0) & (inter_and_intra_connections_treg > 0),
'Both' : (inter_and_intra_connections_tcon > 0) & (inter_and_intra_connections_treg > 0),
'Tcon' : (inter_and_intra_connections_tcon > 0) & (inter_and_intra_connections_treg == 0),
'Neither' : (inter_and_intra_connections_tcon == 0) & (inter_and_intra_connections_treg == 0),
}
metadomain_hub_freq_df = pd.DataFrame()
for key, metadomain_mat in metadomain_mat_dict.items():
data = []
us = [0, 4, 18]
for u in us:
inds = self.goodinds[self.merged_clustdict['all']==u]
data.append(np.sum(metadomain_mat[inds, :][:, inds]))
series_data = pd.Series(data, index=us)
metadomain_hub_freq_df[key] = series_data
metadomain_hub_freq_df.index = [columns_to_names.get(x, "Other") for x in metadomain_hub_freq_df.index]
metadomain_hub_freq_df = (metadomain_hub_freq_df.T / metadomain_hub_freq_df.sum(axis=1)).T
metadomain_hub_freq_df = metadomain_hub_freq_df.drop("Neither", axis=1)
return metadomain_hub_freq_df
def load_bulkrna(my_tss_df, geneLengths):
active_gene_lfcs = pbt.BedTool('./gene_expression/Treg_actv_vs_Tcon_actv_thresh=0.25.csv.narrowPeak').to_dataframe().set_index('name')
resting_gene_lfcs = pbt.BedTool('./gene_expression/Treg_rest_vs_Tcon_rest_thresh=0.25.csv.narrowPeak').to_dataframe().set_index('name')
active_gene_lfcs = active_gene_lfcs[active_gene_lfcs['itemRgb'] > 4]
resting_gene_lfcs = resting_gene_lfcs[resting_gene_lfcs['itemRgb'] > 4]
active_gene_lfcs = active_gene_lfcs[active_gene_lfcs.index.isin(my_tss_df['gene_name']) &
active_gene_lfcs.index.isin(geneLengths.index)]
resting_gene_lfcs = resting_gene_lfcs[resting_gene_lfcs.index.isin(my_tss_df['gene_name']) &
resting_gene_lfcs.index.isin(geneLengths.index)]
active_gene_lfcs = active_gene_lfcs[~active_gene_lfcs.index.duplicated()]
resting_gene_lfcs = resting_gene_lfcs[~resting_gene_lfcs.index.duplicated()]
active_gene_lfcs['rpkm'] = active_gene_lfcs['itemRgb'] / geneLengths.loc[active_gene_lfcs.index]
resting_gene_lfcs['rpkm'] = resting_gene_lfcs['itemRgb'] / geneLengths.loc[resting_gene_lfcs.index]
gene_dict = {
'Active' : active_gene_lfcs,
'Resting' : resting_gene_lfcs,
}
return gene_dict