-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGO_Resnik.py
executable file
·98 lines (93 loc) · 3.28 KB
/
GO_Resnik.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import networkx as nx
import numpy as np
import pickle as pk
import gseapy as gp
import re
import itertools as it
import math
from collections import (defaultdict,Counter)
import obonet
def precalc_lower_bounds(G):
"""Pre-calculate the number of lower bounds of the graph nodes
"""
G.lower_bounds = Counter()
for n in G:
G.lower_bounds[n] += 1
for ans in nx.ancestors(G, n):
G.lower_bounds[ans] += 1
# G.descriptors.add("Pre-calculated lower bounds")
#This function is the second step for the resnik score calculation
def information_content(G, term):
"""Information content
Args:
G(Ontology Graph): Ontology graph object
term(str): ontolgy term
Returns:
str - Information content
Raises:
PGSSLookupError: The term was not found in Ontology Graph
PGSSInvalidOperation: see `pygosemsim.similarity.precalc_lower_bounds`
"""
if term not in G.lower_bounds:
raise exception.PGSSLookupError(f"Missing term: {term}")
freq = G.lower_bounds[term] / len(G)
return round(-1 * math.log2(freq), 3)
#This function is the third step for the resnik score calculation
def lowest_common_ancestor(G, term1, term2):
"""Naive implementation of lowest common ancestor (LCA)
Args:
G(GoGraph): Graph object
term1(str): Ontology term 1
term2(str): Ontology term 2
Returns:
str - Lowest common ancestor term
or None if the terms have no common ancestors
Raises:
PGSSLookupError: The term was not found in Ontology Graph
PGSSInvalidOperation: see `pygosemsim.similarity.precalc_lower_bounds`
"""
if term1 not in G:
raise exception.PGSSLookupError(f"Missing term: {term1}")
if term2 not in G:
raise exception.PGSSLookupError(f"Missing term: {term2}")
lb1 = nx.ancestors(G, term1) | {term1}
lb2 = nx.ancestors(G, term2) | {term2}
common_ans = lb1 & lb2
if not common_ans:
return
return min(common_ans, key=lambda x: G.lower_bounds[x])
#This function would return the resnik score between two terms
def resnik(G, term1, term2):
"""Semantic similarity based on Resnik method
Args:
G(GoGraph): Ontology graph object
term1(str): term 1
term2(str): term 2
Returns:
float - Resnik similarity value (Information content of LCA)
or None if the terms have no common ancestors
Raises:
PGSSLookupError: The term was not found in GoGraph
PGSSInvalidOperation: see `pygosemsim.similarity.precalc_lower_bounds`
"""
mica = lowest_common_ancestor(G, term1, term2)
if mica is not None:
return information_content(G, mica)
url = 'http://purl.obolibrary.org/obo/go.obo'
graph = obonet.read_obo(url)
graph_up = nx.DiGraph.reverse(graph)
precalc_lower_bounds(graph_up)
term_list=list(graph_up.nodes())
pairwise_list = list(it.combinations(term_list, 2))
GO_resnik={}
for pair in pairwise_list:
try:
try:
GO_resnik[pair[0],pair[1]]=round(resnik(graph_up, pair[0], pair[1]),3)
except:
GO_resnik[pair[0],pair[1]]=round(resnik(graph_up, pair[1], pair[0]),3)
except:
pass
with open('output/GO_resnik.pickle', 'wb') as handle:
pk.dump(GO_resnik, handle, protocol=pk.HIGHEST_PROTOCOL)