-
Notifications
You must be signed in to change notification settings - Fork 1
/
fst2ontolex.py
155 lines (138 loc) · 6.46 KB
/
fst2ontolex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import re,sys,os,traceback,argparse
from pprint import pprint
import urllib.parse
args=argparse.ArgumentParser(description="given an (S)FST transducer for inflections, create an OntoLex-Morph representation")
args.add_argument("fst",type=str, help="fst inflection file")
args.add_argument("-base", "--base_uri", nargs="?", type=str, help="base uri, defaults to #", default="#")
args=args.parse_args()
# - a new rule begins in a new line
# - FST rules have the following basic structure:
# RULE := "$"PRE_STATE"$" "=" SOURCE ":" TARGET ( TAB "$"POST_STATE"$" )? ; alternatives are separated by "|", and "\" at line end continues the rule on the next line
# TARGET := "{" ("<"TAG*">"|STRING)+ "}" | "<"TAG">"
# SOURCE := "{" ("<"TAG*">"|STRING)+ "}" | "<"TAG">"
# - FST rules terminate if no POST_STATE is defined
# - Tags are enclosed by <>, strings are not enclosed, the surface string can be generated by dropping <[^>]*>
# the escaping rules aren't fully implemented
# for the conversion we point from the form to the inflection type, this is the
# nonterminal state (symbol) from which the sequence of inflection types is called
def encode(string):
    """Serialize *string* as a double-quoted Turtle literal.

    Characters that would break the generated Turtle or downstream
    tooling (``^ / " $ &``) are rewritten as literal ``\\uXXXX`` escape
    sequences. The result is wrapped in double quotes and padded with
    one space on each side so it can be concatenated directly into a
    triple.

    NOTE(review): the backslash itself is not escaped, so input that
    already contains ``\\`` may produce ambiguous escapes — confirm
    whether FST sources can contain raw backslashes.
    """
    replacements = {
        "^": "\\u005e",
        "/": "\\u002f",
        '"': "\\u0022",
        "$": "\\u0024",
        "&": "\\u0026",
    }
    # str.replace performs in one linear pass what the former
    # index/slice loop did quadratically; no replacement text contains
    # another key character, so sequential substitution is safe.
    for src, esc in replacements.items():
        string = string.replace(src, esc)
    return " \"" + string + "\" "
def my(identifier, prefix=""):
    """URI escaping and prefix assignment.

    Flattens "/", "." and "~" in *identifier* to "_" (they are legal in
    URIs but undesirable in the generated local names), percent-encodes
    the remainder, and returns the CURIE ``<prefix>:<escaped>``.
    """
    # single-pass character substitution instead of the former
    # split/join loop over each symbol
    identifier = identifier.translate(str.maketrans({"/": "_", ".": "_", "~": "_"}))
    return prefix + ":" + urllib.parse.quote(identifier)
# Parse the (S)FST file into a nested map:
#   LHS state -> source string -> target string -> [post states]
state2in2out2next={}
with open(args.fst,"r") as input:
    lhs=None  # left-hand side (state) of the rule currently being read
    for line in input:
        # strip "%"-comments and surrounding whitespace
        if "%" in line:
            line=line[0:line.index("%")]
        line=line.strip()
        if line!="":
            # print("LINE",line)
            # a line containing "=" opens a new rule: LHS = RHS
            if lhs==None:
                if "=" in line:
                    lhs=line.split("=")[0].strip()
                    line="=".join(line.split("=")[1:]).strip()
            if lhs!=None:
                rhs=line.strip()
                # a trailing "\" continues the RHS on the next physical line
                while(rhs.endswith("\\")):
                    line=input.readline()
                    if "%" in line:
                        line=line[0:line.index("%")]
                    line=line.strip()
                    rhs=rhs[0:-1]+" "+line
                # print("RULE", lhs,"=>",rhs)
                # "|" separates alternative expansions of the same LHS
                for rule in rhs.split("|"):
                    rule=re.sub(r"\s+"," ",rule).strip()
                    if rule!="":
                        src=None
                        tgt=None
                        post=None
                        # a trailing "$...$" token names the follow-up state
                        if "$" in rule.split(" ")[-1]:
                            post=rule.split(" ")[-1]
                            rule=" ".join(rule.split(" ")[0:-1]).strip()
                            # NOTE(review): src/tgt default to "" only when a
                            # post state is present — presumably so pure state
                            # transitions get recorded; confirm this is intended
                            src=""
                            tgt=""
                        if ":" in rule:
                            src=rule[0:rule.index(":")].strip()
                            tgt=rule[rule.index(":")+1:].strip()
                        if tgt!=None and ":" in tgt:
                            # a second ":" indicates an (unsupported) filter
                            sys.stderr.write("warning: did not implement filters, yet, skipping rhs \""+rhs+"\"\n")
                            sys.stderr.flush()
                        elif None in [src,tgt]:
                            sys.stderr.write("warning: check rhs \""+rhs+"\" with sub-expression \""+rule+"\"\n")
                            sys.stderr.flush()
                        else:
                            try:
                                # drop the {...} grouping braces on both sides
                                src=re.sub(r"[{}]","",src).strip()
                                tgt=re.sub(r"[{}]","",tgt).strip()
                                if not lhs in state2in2out2next:
                                    state2in2out2next[lhs] = { src : { tgt : [] }}
                                elif not src in state2in2out2next[lhs]:
                                    state2in2out2next[lhs][src] = { tgt : [] }
                                else:
                                    # NOTE(review): this unconditionally resets the
                                    # post list even when (lhs,src,tgt) was already
                                    # recorded — earlier post states for a duplicate
                                    # rule are lost; verify this is intended
                                    state2in2out2next[lhs][src][tgt] = []
                                if post!=None and not post in state2in2out2next[lhs][src][tgt]:
                                    state2in2out2next[lhs][src][tgt].append(post)
                            except:
                                traceback.print_exc()
                lhs=None  # rule complete: next non-empty line starts a new one
# pprint(state2in2out2next)
# Emit the Turtle prolog: the configurable base namespace first,
# followed by the fixed table of standard vocabulary prefixes.
print("@prefix : <{}> .".format(args.base_uri))
print("""
@prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#> .
@prefix synsem: <http://www.w3.org/ns/lemon/synsem#> .
@prefix decomp: <http://www.w3.org/ns/lemon/decomp#> .
@prefix vartrans: <http://www.w3.org/ns/lemon/vartrans#> .
@prefix lime: <http://www.w3.org/ns/lemon/lime#> .
@prefix morph: <http://www.w3.org/ns/lemon/morph#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix owl: <http://www.w3.org/2002/07/owl#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix skos: <http://www.w3.org/2004/02/skos#>.
@prefix dbr: <http://dbpedia.org/resource/>.
@prefix dbo: <http://dbpedia.org/ontology/>.
@prefix void: <http://rdfs.org/ns/void#>.
@prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>.
@prefix semiotics: <http://www.ontologydesignpatterns.org/cp/owl/semiotics.owl#>.
@prefix oils: <http://lemon-model.net/oils#>.
@prefix dct: <http://purl.org/dc/terms/>.
@prefix provo: <http://www.w3.org/ns/prov#>.
""")
# Serialize the collected automaton as OntoLex-Morph triples: one
# InflectionType per LHS state, one InflectionRule per (source, target)
# pair, plus morph:next links for every recorded follow-up state.
for state, src2tgt in state2in2out2next.items():
    itype = my("type#" + state)
    print(itype + " a morph:InflectionType ; rdfs:label" + encode(state) + ".")
    for source, tgt2posts in src2tgt.items():
        for target, posts in tgt2posts.items():
            # successor states reachable after applying this rule
            for post in posts:
                print(itype + " morph:next " + my("type#" + post) + ".")
            rule = my("rule#" + state + "_" + source + ">" + target)
            print(itype + " morph:inflectionRule " + rule + ".")
            print(rule + " a morph:InflectionRule; morph:example " + encode(source + " > ..." + target), end="; ")
            # "<>" denotes the empty string on either side of the mapping
            s = "" if source == "<>" else source
            t = "" if target == "<>" else target
            # escape metacharacters before embedding into the s/…/…/ pattern
            for symbol in ["/", "$", "^", "&"]:
                s = s.replace(symbol, "\\" + symbol)
                t = t.replace(symbol, "\\" + symbol)
            # with s+"$", we assume left-to-right replacement: TODO: TBC
            replacement = "s/" + s + "$/" + t + "/"
            print("morph:replacement" + encode(replacement) + ".")