|
| 1 | +# Some class and method names borrowed from peyotl/nexson_proxy.py |
| 2 | + |
| 3 | +import sys, os, json, argparse, csv |
| 4 | +from org.opentreeoflife.taxa import Taxonomy, Nexson, Flag |
| 5 | +from org.opentreeoflife.conflict import ConflictAnalysis, Disposition |
| 6 | + |
| 7 | +repo_dir = '../repo' # directory containing repo clones |
| 8 | + |
| 9 | +default_shard = os.path.join(repo_dir, 'phylesystem-1') |
| 10 | + |
| 11 | +# Report generation |
| 12 | + |
| 13 | +# Report on every tree in every study |
| 14 | + |
| 15 | +def report_on_trees(study_ids, shard, refs, outfile): |
| 16 | + out = sys.stdout |
| 17 | + if outfile != '-': out = open(outfile, 'w') |
| 18 | + writer = start_report(refs, out) |
| 19 | + count = 0 |
| 20 | + for study_id in study_ids: |
| 21 | + study = get_study(study_id, shard) |
| 22 | + for tree in tree_iter_nexson_proxy(study): |
| 23 | + row = report_on_tree(tree, study, refs) |
| 24 | + writer.writerow(row) |
| 25 | + count += 1 |
| 26 | + if count % 100 == 0: print count, row |
| 27 | + if outfile != '-': out.close() |
| 28 | + |
| 29 | +def start_report(refs, out): |
| 30 | + writer = csv.writer(out) |
| 31 | + writer.writerow(tree_report_header(refs)) |
| 32 | + return writer |
| 33 | + |
| 34 | +# 'tree' report |
| 35 | +# Write one row using the given csv writer summarizing how the given |
| 36 | +# tree conflicts with taxonomy and/or synthesis. |
| 37 | + |
| 38 | +tree_report_header_1 = ['tree'] |
| 39 | +tree_report_header_2 = ['conflicted', 'resolves'] |
| 40 | + |
| 41 | +def tree_report_header(refs): |
| 42 | + row = tree_report_header_1 |
| 43 | + for ref in refs: |
| 44 | + row = row + tree_report_header_2 |
| 45 | + return row |
| 46 | + |
| 47 | +def report_on_tree(tree, study, refs): |
| 48 | + input = import_tree(tree) |
| 49 | + |
| 50 | + # One row per tree |
| 51 | + row = ['%s@%s' % (study.id, tree.tree_id)] |
| 52 | + for ref in refs: |
| 53 | + row = row + [count_conflicts(input, ref, tree.ingroup()), |
| 54 | + count_resolutions(input, ref, tree.ingroup())] |
| 55 | + return row |
| 56 | + |
| 57 | +def count_conflicts(input, ref, ingroup): |
| 58 | + analysis = ConflictAnalysis(input, ref, ingroup) |
| 59 | + arts = analysis.articulations(True) |
| 60 | + if arts == None: arts = [] |
| 61 | + count = 0 |
| 62 | + for art in arts: |
| 63 | + if art.disposition == Disposition.CONFLICTS_WITH: |
| 64 | + count += 1 |
| 65 | + return count |
| 66 | + |
| 67 | +def count_resolutions(input, ref, ingroup): |
| 68 | + analysis = ConflictAnalysis(input, ref, ingroup) |
| 69 | + arts = analysis.articulations(False) |
| 70 | + if arts == None: arts = [] |
| 71 | + count = 0 |
| 72 | + for art in arts: |
| 73 | + if art.disposition == Disposition.RESOLVES: |
| 74 | + count += 1 |
| 75 | + return count |
| 76 | + |
| 77 | +# Proxy object for study file in nexson format |
| 78 | + |
| 79 | +class NexsonProxy(object): |
| 80 | + def __init__(self, filepath): |
| 81 | + self.filepath = filepath # peyotl name |
| 82 | + self.nexson = None |
| 83 | + self.reftax_otus = {} |
| 84 | + self.nexson_trees = {} # tree_id -> blob |
| 85 | + self.preferred_trees = [] |
| 86 | + self._tree_proxy_cache = {} |
| 87 | + |
| 88 | + self.nexson = Nexson.load(self.filepath) |
| 89 | + self._nexml_element = self.nexson[u'nexml'] # peyotl name |
| 90 | + self.reftax_otus = Nexson.getOtus(self.nexson) # sort of wrong |
| 91 | + z = self.get(u'^ot:candidateTreeForSynthesis') |
| 92 | + if z != None: self.preferred_trees = z |
| 93 | + self.id = self.get(u'^ot:studyId') |
| 94 | + |
| 95 | + def get(self, attribute): |
| 96 | + if attribute in self.nexson[u'nexml']: |
| 97 | + return self._nexml_element[attribute] |
| 98 | + else: |
| 99 | + return None |
| 100 | + |
| 101 | + # cf. peyotl _create_tree_proxy (does not always create) |
| 102 | + def _get_tree_proxy(self, tree_id, tree, otus_id): |
| 103 | + tp = self._tree_proxy_cache.get(tree_id) |
| 104 | + if tp is None: |
| 105 | + np = NexsonTreeProxy(tree, tree_id, otus_id, self) |
| 106 | + self._tree_proxy_cache[tree_id] = np |
| 107 | + return np |
| 108 | + |
| 109 | + def get_tree(self, tree_id): |
| 110 | + np = self._tree_proxy_cache.get(tree_id) |
| 111 | + if np is not None: |
| 112 | + return np |
| 113 | + tgd = self._nexml_element[u'treesById'] |
| 114 | + for tg in tgd.values(): |
| 115 | + tbid = tg[u'treeById'] |
| 116 | + if tree_id in tbid: |
| 117 | + otus_id = tg[u'@otus'] |
| 118 | + nexson_tree = tbid[tree_id] |
| 119 | + return self._get_tree_proxy(tree_id=tree_id, tree=nexson_tree, otus_id=otus_id) |
| 120 | + return None |
| 121 | + |
| 122 | +def tree_iter_nexson_proxy(nexson_proxy): # peyotl |
| 123 | + '''Iterates over NexsonTreeProxy objects in order determined by the nexson blob''' |
| 124 | + nexml_el = nexson_proxy._nexml_element |
| 125 | + tg_order = nexml_el['^ot:treesElementOrder'] |
| 126 | + tgd = nexml_el['treesById'] |
| 127 | + for tg_id in tg_order: |
| 128 | + tg = tgd[tg_id] |
| 129 | + tree_order = tg['^ot:treeElementOrder'] |
| 130 | + tbid = tg['treeById'] |
| 131 | + otus_id = tg['@otus'] |
| 132 | + for k in tree_order: |
| 133 | + v = tbid[k] |
| 134 | + yield nexson_proxy._get_tree_proxy(tree_id=k, tree=v, otus_id=otus_id) |
| 135 | + |
| 136 | +class NexsonTreeProxy(object): |
| 137 | + def __init__(self, nexson_tree, tree_id, otus_id, nexson_proxy): |
| 138 | + self.nexson_tree = nexson_tree |
| 139 | + self.nexson_otus_id = otus_id |
| 140 | + self.tree_id = tree_id |
| 141 | + self._nexson_proxy = nexson_proxy |
| 142 | + by_id = nexson_proxy._nexml_element[u'otusById'] |
| 143 | + if otus_id not in by_id: |
| 144 | + print '** otus id not found', nexson_proxy.id, tree_id, otus_id, by_id.keys() |
| 145 | + self._otus = by_id[otus_id][u'otuById'] |
| 146 | + def ingroup(self): |
| 147 | + ingroup = self.nexson_tree.get(u'^ot:inGroupClade') |
| 148 | + if ingroup == '': |
| 149 | + return None |
| 150 | + else: |
| 151 | + return ingroup |
| 152 | + |
| 153 | +# Marshalling arguments |
| 154 | + |
| 155 | +# shard is the path to the root of the repository (or shard) clone |
| 156 | + |
| 157 | +def study_id_to_path(study_id, shard=default_shard): |
| 158 | + (prefix, number) = study_id.split('_', 1) |
| 159 | + if len(number) == 1: |
| 160 | + residue = '_' + number |
| 161 | + else: |
| 162 | + residue = number[-2:] |
| 163 | + return os.path.join(shard, 'study', prefix + '_' + residue, study_id, study_id + '.json') |
| 164 | + |
| 165 | +# Load a study |
| 166 | + |
| 167 | +single_study_cache = {'id': None, 'study': None} |
| 168 | + |
| 169 | +def get_study(study_id, shard): |
| 170 | + if study_id == single_study_cache['id']: |
| 171 | + study = single_study_cache['study'] |
| 172 | + else: |
| 173 | + single_study_cache['id'] = None |
| 174 | + single_study_cache['study'] = None |
| 175 | + study = gobble_study(study_id, shard) |
| 176 | + if study != None: |
| 177 | + single_study_cache['study'] = study |
| 178 | + single_study_cache['id'] = study_id |
| 179 | + return study |
| 180 | + |
| 181 | +def gobble_study(study_id, phylesystem): |
| 182 | + filepath = study_id_to_path(study_id, phylesystem) |
| 183 | + # should do try/catch for file-not-found |
| 184 | + if not os.path.exists(filepath): |
| 185 | + # foo, should be using try/catch |
| 186 | + print '** Not found:', self.filepath |
| 187 | + return None |
| 188 | + return NexsonProxy(filepath) |
| 189 | + |
| 190 | +def import_tree(tree): |
| 191 | + return Nexson.importTree(tree.nexson_tree, tree._nexson_proxy.reftax_otus, tree.tree_id) |
| 192 | + |
| 193 | +# Study/tree ids in a collection |
| 194 | + |
| 195 | +def collection_treespecs(path): |
| 196 | + with open(path, 'r') as infile: |
| 197 | + collection_json = json.load(infile) |
| 198 | + return map(lambda coll: (coll[u'studyID'], coll[u'treeID']), |
| 199 | + collection_json[u'decisions']) |
| 200 | + |
| 201 | +synthesis_treespecs = [] # rank order |
| 202 | +trees_in_synthesis = {} |
| 203 | + |
| 204 | +def read_synthesis_collections(): |
| 205 | + if len(synthesis_treespecs) > 0: return |
| 206 | + for collection_name in ['fungi', |
| 207 | + 'safe-microbes', |
| 208 | + 'metazoa', |
| 209 | + 'plants']: |
| 210 | + path = os.path.join(repo_dir, 'collections-1/collections-by-owner/opentreeoflife', collection_name + '.json') |
| 211 | + print 'reading', path |
| 212 | + for treespec in collection_treespecs(path): |
| 213 | + synthesis_treespecs.append(treespec) |
| 214 | + trees_in_synthesis[treespec] = True |
| 215 | + |
| 216 | +def in_synthesis(study_id, tree_id): |
| 217 | + if len(trees_in_synthesis) == 0: |
| 218 | + read_synthesis_collections() |
| 219 | + treespec = (study_id, tree_id) |
| 220 | + if treespec in trees_in_synthesis: |
| 221 | + return trees_in_synthesis[treespec] |
| 222 | + else: |
| 223 | + return False |
| 224 | + |
| 225 | +# Utilities associated with obtaining study and tree lists for reporting |
| 226 | + |
| 227 | +# All study ids within a given phylesystem (shard) |
| 228 | + |
| 229 | +def all_study_ids(shard): |
| 230 | + ids = [] |
| 231 | + top = os.path.join(shard, 'study') |
| 232 | + hundreds = os.listdir(top) |
| 233 | + for dir in hundreds: |
| 234 | + dir2 = os.path.join(top, dir) |
| 235 | + if os.path.isdir(dir2): |
| 236 | + dirs = os.listdir(dir2) |
| 237 | + for study_dir in dirs: |
| 238 | + # if os.path.isdir(study_dir): ... ? ... |
| 239 | + ids.append(study_dir) |
| 240 | + print len(ids), 'studies' |
| 241 | + return ids |
| 242 | + |
| 243 | +# These are some asterales studies. List is from gcmdr repo. |
| 244 | +asterales_treespecs=[("pg_2539", "tree6294"), # Soltis et al. 2011 |
| 245 | + ("pg_715", "tree1289"), # Barnadesioideae |
| 246 | + ("pg_329", "tree324"), # Hieracium |
| 247 | + # ("pg_9", "tree1"), # Campanulidae |
| 248 | + ("pg_1944", "tree3959"), # Campanulidae; replacement of 9 above |
| 249 | + ("pg_709", "tree1276"), # Lobelioideae, very non monophyletic |
| 250 | + ("pg_41", "tree1396"), # Feddea |
| 251 | + ("pg_82", "tree5792"), # Campanula, very non monophyletic campanula |
| 252 | + ("pg_932", "tree1831") # Goodeniaceae, tons of non monophyly |
| 253 | + ] |
| 254 | + |
| 255 | +def get_refs(paths): |
| 256 | + return map(load_tree, paths) |
| 257 | + |
| 258 | +def load_tree(path): |
| 259 | + tree = Taxonomy.getTaxonomy(path, 'ott') |
| 260 | + count = 0 |
| 261 | + for id in tree.idIndex.keySet(): |
| 262 | + count += 1 |
| 263 | + print count, 'ids' |
| 264 | + return tree |
| 265 | + |
| 266 | +registry_dir = os.path.join(repo_dir, 'reference-taxonomy', 'registry') |
| 267 | + |
| 268 | +# consider comparing the "^ot:focalClade" to the induced root |
| 269 | + |
| 270 | +if __name__ == '__main__': |
| 271 | + |
| 272 | + argparser = argparse.ArgumentParser(description='Play around with conflict.') |
| 273 | + |
| 274 | + argparser.add_argument('--out', dest='outfile', default='-') |
| 275 | + |
| 276 | + argparser.add_argument('--shard', dest='shard', |
| 277 | + default=default_shard, |
| 278 | + help='root directory of repository (shard) containing nexsons') |
| 279 | + argparser.add_argument('--ref', dest='refs', nargs='+', |
| 280 | + default=os.path.join(registry_dir, 'draftversion4.tre'), # synthetic tree is a newick... |
| 281 | + help='reference tree (taxonomy or synth)') |
| 282 | + args = argparser.parse_args() |
| 283 | + |
| 284 | + # refs = get_refs(args.refs, os.path.join(registry_dir, 'plants-ott29/')) |
| 285 | + # refs = get_refs(args.refs, os.path.join(registry_dir, 'ott2.9/')) |
| 286 | + |
| 287 | + report_on_trees(all_study_ids(args.shard), |
| 288 | + args.shard, |
| 289 | + get_refs(args.refs), |
| 290 | + args.outfile) |
| 291 | + |
| 292 | +# /Users/jar/a/ot/repo/phylesystem-1/study/ot_31/ot_31/ot_31.json |
| 293 | +# ls -l ../repo/phylesystem-1/study/ot_31/ot_31/ot_31.json |
| 294 | + |
| 295 | + |
| 296 | +# look at u'^ot:candidateTreeForSynthesis' = list of tree ids |
| 297 | +# look at list of trees in synthesis (from collections) |
0 commit comments