Skip to content

Commit ae85a55

Browse files
committed
initial commit of trees_report
1 parent 77b5419 commit ae85a55

File tree

7 files changed

+638
-0
lines changed

7 files changed

+638
-0
lines changed

trees_report/Makefile

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# $^ = all prerequisites
2+
# $< = first prerequisite
3+
# $@ = file name of target
4+
5+
# Parent directory for github repo clones
6+
REPOS=../..
7+
8+
# How to invoke jython (with smasher built in)
9+
JYTHON=$(REPOS)/reference-taxonomy/bin/jython
10+
11+
# Location of synthesis products (output/ directory)
12+
SYNTH=$(REPOS)/files.opentreeoflife.org/synthesis/opentree5.0/opentree5.0/output
13+
14+
15+
all: trees_report.csv
16+
17+
trees_report.csv: shards/phylesystem-1 trees_report.py \
18+
work/conflict.csv synthesis_tree_list.csv taxa_in_synthesis.txt
19+
PEYOTL_CONFIG_FILE=dot_peyotl python trees_report.py $@
20+
21+
shards/phylesystem-1:
22+
mkdir -p shards
23+
ln -s ../$(REPOS)/phylesystem-1 shards/
24+
25+
conflict: work/conflict.csv
26+
27+
work/conflict.csv: conflict.py
28+
$(JYTHON) conflict.py \
29+
--out $@ \
30+
--shard $(REPOS)/phylesystem-1 \
31+
--ref $(SYNTH)/labelled_supertree/labelled_supertree.tre
32+
33+
# smaller, for testing
34+
work/conflict_small.csv: conflict.py
35+
$(JYTHON) conflict.py \
36+
--out $@ \
37+
--shard $(REPOS)/asterales-phylesystem \
38+
--ref $(REPOS)/reference-taxonomy/registry/aster-synth4/
39+
40+
work/synthesis_tree_list.csv: make_synthesis_tree_list.py
41+
python make_synthesis_tree_list.py $(REPOS)/collections-1 work/synthesis_tree_list.csv
42+
43+
work/taxa_in_synthesis.txt:
44+
$(JYTHON) taxa_in_synthesis.py \
45+
$(SYNTH)/grafted_solution/grafted_solution.tre \
46+
$@

trees_report/README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Report various information on all trees in phylesystem
2+
3+
'make' creates trees_report.csv, which has the following columns:
4+
5+
* tree - identifier for the tree in studyid@treeid form
6+
* intended - 0 if study not intended for synthesis, 2 if intended, 1 if not specified
7+
* preferred - 2 if preferred tree or only tree, 1 if no tree is preferred, 0 if not preferred
8+
* has ingroup - 1 if has designated ingroup, 0 otherwise
9+
* has method - 1 if there's something in the 'curated type' field, 0 otherwise
10+
* #mapped - number of taxa (in OTT) to which tips are mapped
11+
* #tips - total number of tips in tree
12+
* #conflicts - number of nodes in synthetic tree that conflict with tree nodes
13+
* #resolved - number of nodes in tree that resolve synthetic tree nodes
14+
* #new - number of taxa from tree that aren't already in synthetic tree
15+
16+
The table is generated with a particular heuristic sort order, but may
17+
be re-sorted at will in your favorite spreadsheet program.
18+
19+
Smasher is used for conflict analysis and for parsing the Newick file
20+
for the synthetic tree without taxonomy. The results are placed in
21+
work/conflict.csv and work/synthesis_tree_list.csv.
22+
23+
## Configuration
24+
25+
* Assuming this directory is `trees_report` in some repo, say G, the parent directory of
26+
G should contain the following either as directories or as symbolic links:
27+
* reference_taxonomy
28+
* peyotl
29+
* collections-1
30+
* phylesystem-1
31+
* Install peyotl
32+
* Go to the reference-taxonomy repo and say `make compile bin/jython`
33+
* Adjust SYNTH in the Makefile or put synthesis files in that location
34+
* dot_peyotl is the peyotl configuration file, but it should require
35+
36+
## Known issues
37+
38+
* Does not processes the forwards.tsv file from OTT, meaning many
39+
taxa from trees will appear new that aren't
40+
* Does not know anything about incertae sedis (unplaced etc.), again making taxa
41+
appear new that cannot go into synthesis

trees_report/conflict.py

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
# Some class and method names borrowed from peyotl/nexson_proxy.py
2+
3+
import sys, os, json, argparse, csv
4+
from org.opentreeoflife.taxa import Taxonomy, Nexson, Flag
5+
from org.opentreeoflife.conflict import ConflictAnalysis, Disposition
6+
7+
repo_dir = '../repo' # directory containing repo clones
8+
9+
default_shard = os.path.join(repo_dir, 'phylesystem-1')
10+
11+
# Report generation
12+
13+
# Report on every tree in every study
14+
15+
def report_on_trees(study_ids, shard, refs, outfile):
16+
out = sys.stdout
17+
if outfile != '-': out = open(outfile, 'w')
18+
writer = start_report(refs, out)
19+
count = 0
20+
for study_id in study_ids:
21+
study = get_study(study_id, shard)
22+
for tree in tree_iter_nexson_proxy(study):
23+
row = report_on_tree(tree, study, refs)
24+
writer.writerow(row)
25+
count += 1
26+
if count % 100 == 0: print count, row
27+
if outfile != '-': out.close()
28+
29+
def start_report(refs, out):
30+
writer = csv.writer(out)
31+
writer.writerow(tree_report_header(refs))
32+
return writer
33+
34+
# 'tree' report
35+
# Write one row using the given csv writer summarizing how the given
36+
# tree conflicts with taxonomy and/or synthesis.
37+
38+
tree_report_header_1 = ['tree']
39+
tree_report_header_2 = ['conflicted', 'resolves']
40+
41+
def tree_report_header(refs):
42+
row = tree_report_header_1
43+
for ref in refs:
44+
row = row + tree_report_header_2
45+
return row
46+
47+
def report_on_tree(tree, study, refs):
48+
input = import_tree(tree)
49+
50+
# One row per tree
51+
row = ['%s@%s' % (study.id, tree.tree_id)]
52+
for ref in refs:
53+
row = row + [count_conflicts(input, ref, tree.ingroup()),
54+
count_resolutions(input, ref, tree.ingroup())]
55+
return row
56+
57+
def count_conflicts(input, ref, ingroup):
58+
analysis = ConflictAnalysis(input, ref, ingroup)
59+
arts = analysis.articulations(True)
60+
if arts == None: arts = []
61+
count = 0
62+
for art in arts:
63+
if art.disposition == Disposition.CONFLICTS_WITH:
64+
count += 1
65+
return count
66+
67+
def count_resolutions(input, ref, ingroup):
68+
analysis = ConflictAnalysis(input, ref, ingroup)
69+
arts = analysis.articulations(False)
70+
if arts == None: arts = []
71+
count = 0
72+
for art in arts:
73+
if art.disposition == Disposition.RESOLVES:
74+
count += 1
75+
return count
76+
77+
# Proxy object for study file in nexson format
78+
79+
class NexsonProxy(object):
80+
def __init__(self, filepath):
81+
self.filepath = filepath # peyotl name
82+
self.nexson = None
83+
self.reftax_otus = {}
84+
self.nexson_trees = {} # tree_id -> blob
85+
self.preferred_trees = []
86+
self._tree_proxy_cache = {}
87+
88+
self.nexson = Nexson.load(self.filepath)
89+
self._nexml_element = self.nexson[u'nexml'] # peyotl name
90+
self.reftax_otus = Nexson.getOtus(self.nexson) # sort of wrong
91+
z = self.get(u'^ot:candidateTreeForSynthesis')
92+
if z != None: self.preferred_trees = z
93+
self.id = self.get(u'^ot:studyId')
94+
95+
def get(self, attribute):
96+
if attribute in self.nexson[u'nexml']:
97+
return self._nexml_element[attribute]
98+
else:
99+
return None
100+
101+
# cf. peyotl _create_tree_proxy (does not always create)
102+
def _get_tree_proxy(self, tree_id, tree, otus_id):
103+
tp = self._tree_proxy_cache.get(tree_id)
104+
if tp is None:
105+
np = NexsonTreeProxy(tree, tree_id, otus_id, self)
106+
self._tree_proxy_cache[tree_id] = np
107+
return np
108+
109+
def get_tree(self, tree_id):
110+
np = self._tree_proxy_cache.get(tree_id)
111+
if np is not None:
112+
return np
113+
tgd = self._nexml_element[u'treesById']
114+
for tg in tgd.values():
115+
tbid = tg[u'treeById']
116+
if tree_id in tbid:
117+
otus_id = tg[u'@otus']
118+
nexson_tree = tbid[tree_id]
119+
return self._get_tree_proxy(tree_id=tree_id, tree=nexson_tree, otus_id=otus_id)
120+
return None
121+
122+
def tree_iter_nexson_proxy(nexson_proxy): # peyotl
123+
'''Iterates over NexsonTreeProxy objects in order determined by the nexson blob'''
124+
nexml_el = nexson_proxy._nexml_element
125+
tg_order = nexml_el['^ot:treesElementOrder']
126+
tgd = nexml_el['treesById']
127+
for tg_id in tg_order:
128+
tg = tgd[tg_id]
129+
tree_order = tg['^ot:treeElementOrder']
130+
tbid = tg['treeById']
131+
otus_id = tg['@otus']
132+
for k in tree_order:
133+
v = tbid[k]
134+
yield nexson_proxy._get_tree_proxy(tree_id=k, tree=v, otus_id=otus_id)
135+
136+
class NexsonTreeProxy(object):
137+
def __init__(self, nexson_tree, tree_id, otus_id, nexson_proxy):
138+
self.nexson_tree = nexson_tree
139+
self.nexson_otus_id = otus_id
140+
self.tree_id = tree_id
141+
self._nexson_proxy = nexson_proxy
142+
by_id = nexson_proxy._nexml_element[u'otusById']
143+
if otus_id not in by_id:
144+
print '** otus id not found', nexson_proxy.id, tree_id, otus_id, by_id.keys()
145+
self._otus = by_id[otus_id][u'otuById']
146+
def ingroup(self):
147+
ingroup = self.nexson_tree.get(u'^ot:inGroupClade')
148+
if ingroup == '':
149+
return None
150+
else:
151+
return ingroup
152+
153+
# Marshalling arguments
154+
155+
# shard is the path to the root of the repository (or shard) clone
156+
157+
def study_id_to_path(study_id, shard=default_shard):
158+
(prefix, number) = study_id.split('_', 1)
159+
if len(number) == 1:
160+
residue = '_' + number
161+
else:
162+
residue = number[-2:]
163+
return os.path.join(shard, 'study', prefix + '_' + residue, study_id, study_id + '.json')
164+
165+
# Load a study
166+
167+
single_study_cache = {'id': None, 'study': None}
168+
169+
def get_study(study_id, shard):
170+
if study_id == single_study_cache['id']:
171+
study = single_study_cache['study']
172+
else:
173+
single_study_cache['id'] = None
174+
single_study_cache['study'] = None
175+
study = gobble_study(study_id, shard)
176+
if study != None:
177+
single_study_cache['study'] = study
178+
single_study_cache['id'] = study_id
179+
return study
180+
181+
def gobble_study(study_id, phylesystem):
182+
filepath = study_id_to_path(study_id, phylesystem)
183+
# should do try/catch for file-not-found
184+
if not os.path.exists(filepath):
185+
# foo, should be using try/catch
186+
print '** Not found:', self.filepath
187+
return None
188+
return NexsonProxy(filepath)
189+
190+
def import_tree(tree):
191+
return Nexson.importTree(tree.nexson_tree, tree._nexson_proxy.reftax_otus, tree.tree_id)
192+
193+
# Study/tree ids in a collection
194+
195+
def collection_treespecs(path):
196+
with open(path, 'r') as infile:
197+
collection_json = json.load(infile)
198+
return map(lambda coll: (coll[u'studyID'], coll[u'treeID']),
199+
collection_json[u'decisions'])
200+
201+
synthesis_treespecs = [] # rank order
202+
trees_in_synthesis = {}
203+
204+
def read_synthesis_collections():
205+
if len(synthesis_treespecs) > 0: return
206+
for collection_name in ['fungi',
207+
'safe-microbes',
208+
'metazoa',
209+
'plants']:
210+
path = os.path.join(repo_dir, 'collections-1/collections-by-owner/opentreeoflife', collection_name + '.json')
211+
print 'reading', path
212+
for treespec in collection_treespecs(path):
213+
synthesis_treespecs.append(treespec)
214+
trees_in_synthesis[treespec] = True
215+
216+
def in_synthesis(study_id, tree_id):
217+
if len(trees_in_synthesis) == 0:
218+
read_synthesis_collections()
219+
treespec = (study_id, tree_id)
220+
if treespec in trees_in_synthesis:
221+
return trees_in_synthesis[treespec]
222+
else:
223+
return False
224+
225+
# Utilities associated with obtaining study and tree lists for reporting
226+
227+
# All study ids within a given phylesystem (shard)
228+
229+
def all_study_ids(shard):
230+
ids = []
231+
top = os.path.join(shard, 'study')
232+
hundreds = os.listdir(top)
233+
for dir in hundreds:
234+
dir2 = os.path.join(top, dir)
235+
if os.path.isdir(dir2):
236+
dirs = os.listdir(dir2)
237+
for study_dir in dirs:
238+
# if os.path.isdir(study_dir): ... ? ...
239+
ids.append(study_dir)
240+
print len(ids), 'studies'
241+
return ids
242+
243+
# These are some asterales studies. List is from gcmdr repo.
244+
asterales_treespecs=[("pg_2539", "tree6294"), # Soltis et al. 2011
245+
("pg_715", "tree1289"), # Barnadesioideae
246+
("pg_329", "tree324"), # Hieracium
247+
# ("pg_9", "tree1"), # Campanulidae
248+
("pg_1944", "tree3959"), # Campanulidae; replacement of 9 above
249+
("pg_709", "tree1276"), # Lobelioideae, very non monophyletic
250+
("pg_41", "tree1396"), # Feddea
251+
("pg_82", "tree5792"), # Campanula, very non monophyletic campanula
252+
("pg_932", "tree1831") # Goodeniaceae, tons of non monophyly
253+
]
254+
255+
def get_refs(paths):
256+
return map(load_tree, paths)
257+
258+
def load_tree(path):
259+
tree = Taxonomy.getTaxonomy(path, 'ott')
260+
count = 0
261+
for id in tree.idIndex.keySet():
262+
count += 1
263+
print count, 'ids'
264+
return tree
265+
266+
registry_dir = os.path.join(repo_dir, 'reference-taxonomy', 'registry')
267+
268+
# consider comparing the "^ot:focalClade" to the induced root
269+
270+
if __name__ == '__main__':
271+
272+
argparser = argparse.ArgumentParser(description='Play around with conflict.')
273+
274+
argparser.add_argument('--out', dest='outfile', default='-')
275+
276+
argparser.add_argument('--shard', dest='shard',
277+
default=default_shard,
278+
help='root directory of repository (shard) containing nexsons')
279+
argparser.add_argument('--ref', dest='refs', nargs='+',
280+
default=os.path.join(registry_dir, 'draftversion4.tre'), # synthetic tree is a newick...
281+
help='reference tree (taxonomy or synth)')
282+
args = argparser.parse_args()
283+
284+
# refs = get_refs(args.refs, os.path.join(registry_dir, 'plants-ott29/'))
285+
# refs = get_refs(args.refs, os.path.join(registry_dir, 'ott2.9/'))
286+
287+
report_on_trees(all_study_ids(args.shard),
288+
args.shard,
289+
get_refs(args.refs),
290+
args.outfile)
291+
292+
# /Users/jar/a/ot/repo/phylesystem-1/study/ot_31/ot_31/ot_31.json
293+
# ls -l ../repo/phylesystem-1/study/ot_31/ot_31/ot_31.json
294+
295+
296+
# look at u'^ot:candidateTreeForSynthesis' = list of tree ids
297+
# look at list of trees in synthesis (from collections)

0 commit comments

Comments
 (0)