-
Notifications
You must be signed in to change notification settings - Fork 15
/
PFTLS_Chapter_13.py
executable file
·110 lines (87 loc) · 3.73 KB
/
PFTLS_Chapter_13.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
__author__ = 'Amber Biology LLC'
# Python For The Life Sciences
# By Alex Lancaster & Gordon Webster
# Chapter 13
# The text of the book is (c) Amber Biology LLC (www.amberbiology.com)
# The Python code from the book is released into the public domain, as follows:
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <http://unlicense.org/>
# get the TF regulation data from yeastract
def get_tf_yeastract(
        filename="RegulationTwoColumnTable_Documented_2013927.tsv"):
    """Load a two-column TF/gene regulation table into two lookup dicts.

    Each line of the file is expected to hold a transcription factor (TF)
    in the first column and a gene it regulates in the second column,
    separated by ';'. Names are lowercased and stripped of surrounding
    whitespace so that minor typographical differences in casing do not
    produce separate entries.

    Lines that do not contain at least two ';'-separated columns (for
    example blank lines at the end of the file) are skipped instead of
    raising an IndexError.

    Args:
        filename: path of the table to read. Defaults to the file name
            that was previously hard-coded in this function.

    Returns:
        A tuple ``(alltfs, allgenes)``:
        ``alltfs`` maps each TF to the *set* of genes it regulates;
        ``allgenes`` maps each gene to the *set* of TFs that regulate it.

    Side effects:
        Prints the number of distinct TFs and genes found.
    """
    # (1) indexed by TF: each value is the set of genes that TF regulates
    alltfs = {}
    # (2) indexed by gene: each value is the set of TFs regulating that gene
    allgenes = {}

    with open(filename) as file:
        # iterate the file directly; no need to hold every line in memory
        for line in file:
            # lowercase first, then split into columns on ';'
            items = line.lower().split(';')
            if len(items) < 2:
                # blank or malformed line: there is no gene column to read
                continue

            # first column = TF, second column = gene that TF regulates
            tf = items[0].strip()
            gene = items[1].strip()

            # setdefault creates the empty set the first time a key is seen
            alltfs.setdefault(tf, set()).add(gene)
            allgenes.setdefault(gene, set()).add(tf)

    print("total TFs:", len(alltfs))
    print("total genes:", len(allgenes))
    return alltfs, allgenes
def get_common_genes(alltfs, tf1, tf2):
    """Return the genes regulated by both *tf1* and *tf2*.

    ``alltfs`` maps a TF name to the set of genes it regulates; the
    result is the intersection of the two sets looked up for the given
    TF names.
    """
    first_targets = alltfs[tf1]
    second_targets = alltfs[tf2]
    shared = first_targets & second_targets
    return shared
def get_all_genes(alltfs, tf1, tf2):
    """Return the genes regulated by *tf1*, by *tf2*, or by both.

    ``alltfs`` maps a TF name to the set of genes it regulates; the
    result is the union of the two sets looked up for the given TF names.
    """
    first_targets = alltfs[tf1]
    second_targets = alltfs[tf2]
    combined = first_targets | second_targets
    return combined
if __name__ == "__main__":
    # load both lookup tables; only the TF-indexed one is queried below
    alltfs, allgenes = get_tf_yeastract()

    # the TF names are written in lowercase because the loader lowercases
    # every name it reads from the table
    tf_pair = ('abf1', 'cyc8')
    common_genes = get_common_genes(alltfs, *tf_pair)
    all_genes = get_all_genes(alltfs, *tf_pair)

    # report the intersection first, then the union, in sorted order
    print("genes regulated by both abf1 and cyc8", sorted(common_genes))
    print("genes regulated by abf1 or cyc8", sorted(all_genes))