Add partial support for Pronunciation

This commit reads pronunciations from LMF files, writes them back out when dumping LMF, and inserts them into the database. It does not yet make them available to the user, nor does it export them. Part of #7, #89
goodmami · Feb 19, 2021 · d4b1853 · d4b1853
1 parent efbe271
commit d4b1853
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 20 deletions.
diff --git a/tests/data/mini-lmf-1.1.xml b/tests/data/mini-lmf-1.1.xml
@@ -23,7 +23,9 @@
     </LexicalEntry>
 
     <LexicalEntry id="test-ja-例え-n">
-      <Lemma partOfSpeech="n" writtenForm="例え" />
+      <Lemma partOfSpeech="n" writtenForm="例え">
+        <Pronunciation variety="standard" notation="ipa" audio="tatoe.wav">tatoe</Pronunciation>
+      </Lemma>
       <Form writtenForm="たとえ" script="Hira" />
       <Form writtenForm="タトエ" script="Kana" />
       <Form writtenForm="tatoe" script="Latn-kunrei" />

diff --git a/wn/_add.py b/wn/_add.py
@@ -29,6 +29,16 @@
      WHERE e.id = ?
        AND e.lexicon_rowid = ?
 '''
+# Forms don't have reliable ids, so a form's rowid is looked up through
+# its entry instead. Parameters, in order: lexicon rowid, entry id,
+# form, script (note that the lexicon rowid comes before the entry id).
+# The script is compared with IS rather than = so that a NULL parameter
+# matches rows whose script is NULL.
+FORM_QUERY = '''
+    SELECT f.rowid
+      FROM forms AS f
+      JOIN entries AS e ON f.entry_rowid = e.rowid
+     WHERE e.lexicon_rowid = ?
+       AND e.id = ?
+       AND f.form = ?
+       AND f.script IS ?
+'''
 SENSE_QUERY = '''
     SELECT s.rowid
       FROM senses AS s
@@ -124,6 +134,7 @@ def _add_lmf(
             _insert_synsets(synsets, lexid, cur, progress)
             _insert_entries(entries, lexid, cur, progress)
             _insert_forms(entries, lexid, lexidmap, cur, progress)
+            _insert_pronunciations(entries, lexid, lexidmap, cur, progress)
             _insert_tags(entries, lexid, lexidmap, cur, progress)
             _insert_senses(entries, synsets, lexid, lexidmap, cur, progress)
             _insert_adjpositions(entries, lexid, lexidmap, cur, progress)
@@ -159,7 +170,7 @@ def _sum_counts(info) -> int:
     counts = info['counts']
     return sum(counts.get(name, 0) for name in
                ('LexicalEntry', 'ExternalLexicalEntry',
-                'Lemma', 'Form', 'Tag',
+                'Lemma', 'Form', 'Pronunciation', 'Tag',
                 'Sense', 'ExternalSense',
                 'SenseRelation', 'Example', 'Count',
                 'SyntacticBehaviour',
@@ -378,19 +389,34 @@ def _insert_forms(entries, lexid, lexidmap, cur, progress):
         progress.update(len(forms))
 
 
+def _insert_pronunciations(entries, lexid, lexidmap, cur, progress):
+    """Insert the pronunciations of lemmas and other forms.
+
+    Each inserted row starts with the four parameters consumed by
+    FORM_QUERY (lexicon rowid, entry id, form, script), which select
+    the rowid of the owning form, followed by the pronunciation's
+    value, variety, notation, phonemic flag, and audio.
+
+    The lemma's pronunciations are only inserted for non-external
+    entries; pronunciations on other forms are always inserted.
+    """
+    progress.set(status='Pronunciations')
+    query = f'INSERT INTO pronunciations VALUES (({FORM_QUERY}),?,?,?,?,?)'
+    for batch in _split(entries):
+        prons = []
+        for entry in batch:
+            eid = entry.id
+            lid = lexidmap.get(eid, lexid)
+            # Lemmas and forms share the attributes used below, so they
+            # are processed by one loop (lemma first); this keeps the
+            # row layout in a single place.
+            forms = [] if entry.external else [entry.lemma]
+            forms.extend(entry.forms)
+            for form in forms:
+                for p in form.pronunciations:
+                    # Pronunciation stores its text as `value`; it has
+                    # no `text` attribute
+                    prons.append(
+                        (lid, eid, form.form, form.script,
+                         p.value, p.variety, p.notation, p.phonemic, p.audio)
+                    )
+        cur.executemany(query, prons)
+        progress.update(len(prons))
+
+
 def _insert_tags(entries, lexid, lexidmap, cur, progress):
     progress.set(status='Word Form Tags')
-    query = '''
-        INSERT INTO tags VALUES (
-            (SELECT f.rowid
-               FROM forms AS f
-               JOIN entries AS e ON f.entry_rowid = e.rowid
-              WHERE e.lexicon_rowid = ?
-                AND e.id = ?
-                AND f.form = ?
-                AND f.script IS ?),
-            ?,?)
-    '''
+    query = f'INSERT INTO tags VALUES (({FORM_QUERY}),?,?)'
     for batch in _split(entries):
         tags = []
         for entry in batch:

diff --git a/wn/_db.py b/wn/_db.py
@@ -37,7 +37,7 @@
 # >>> wn._db.schema_hash(conn)
 #
 COMPATIBLE_SCHEMA_HASHES = {
-    'be954c77c4c6fc1127d2b7145715615cc9a1ea4d',
+    '30b13fc2d9282cde065fb770e21c04edf3217500',
 }
 
 

diff --git a/wn/lmf.py b/wn/lmf.py
@@ -306,6 +306,24 @@ def as_external(
         return obj
 
 
+class Pronunciation:
+    """A pronunciation of a lemma or other word form.
+
+    Holds the pronunciation text (*value*) together with its optional
+    *variety*, *notation*, and *audio* attributes and the *phonemic*
+    flag, which is true unless specified otherwise.
+    """
+
+    __slots__ = ('value', 'variety', 'notation', 'phonemic', 'audio')
+
+    def __init__(
+        self,
+        value: str,
+        variety: str = None,
+        notation: str = None,
+        phonemic: bool = True,
+        audio: str = None,
+    ):
+        self.value, self.variety, self.notation = value, variety, notation
+        self.phonemic, self.audio = phonemic, audio
+
+
 class Tag:
     __slots__ = 'text', 'category'
 
@@ -315,21 +333,36 @@ def __init__(self, text: str, category: str):
 
 
 class Form:
-    __slots__ = 'form', 'script', 'tags'
+    __slots__ = 'form', 'script', 'pronunciations', 'tags'
 
-    def __init__(self, form: str, script: str, tags: List[Tag] = None):
+    def __init__(
+        self,
+        form: str,
+        script: str,
+        pronunciations: List[Pronunciation] = None,
+        tags: List[Tag] = None,
+    ):
         self.form = form
         self.script = script
+        self.pronunciations = pronunciations or []
         self.tags = tags or []
 
 
 class Lemma:
-    __slots__ = 'form', 'pos', 'script', 'tags'
+    __slots__ = 'form', 'pos', 'script', 'pronunciations', 'tags'
 
-    def __init__(self, form: str, pos: str, script: str = '', tags: List[Tag] = None):
+    def __init__(
+        self,
+        form: str,
+        pos: str,
+        script: str = '',
+        pronunciations: List[Pronunciation] = None,
+        tags: List[Tag] = None,
+    ):
         self.form = form
         self.pos = pos
         self.script = script
+        self.pronunciations = pronunciations or []
         self.tags = tags or []
 
 
@@ -595,6 +628,7 @@ def _load_lemma(events) -> Lemma:
         attrs['writtenForm'],
         _get_literal(attrs['partOfSpeech'], PARTS_OF_SPEECH),
         script=attrs.get('script'),
+        pronunciations=_load_pronunciations(events),
         tags=_load_tags(events)
     )
     events.end('Lemma')
@@ -607,11 +641,30 @@ def _load_forms(events) -> List[Form]:
         attrs = next(events)[1].attrib
         forms.append(Form(attrs['writtenForm'],
                           script=attrs.get('script'),
+                          pronunciations=_load_pronunciations(events),
                           tags=_load_tags(events)))
         events.end('Form')
     return forms
 
 
+def _load_pronunciations(events) -> List[Pronunciation]:
+    """Read consecutive Pronunciation elements from *events*.
+
+    Returns a list with one Pronunciation per element, in document
+    order; the list is empty if *events* is not positioned at a
+    Pronunciation element. The phonemic attribute is treated as
+    'true' when it is absent.
+    """
+    result: List[Pronunciation] = []
+    while events.starts('Pronunciation'):
+        next(events)  # advance past the current event
+        # the element returned here carries the text and attributes
+        node = events.end('Pronunciation')
+        get = node.attrib.get
+        pron = Pronunciation(
+            node.text,
+            variety=get('variety'),
+            notation=get('notation'),
+            phonemic=_get_bool(get('phonemic', 'true')),
+            audio=get('audio'),
+        )
+        result.append(pron)
+    return result
+
+
 def _load_tags(events) -> List[Tag]:
     tags: List[Tag] = []
     while events.starts('Tag'):
@@ -957,7 +1010,8 @@ def _build_lemma(lemma: Lemma) -> ET.Element:
         attrib['script'] = lemma.script
     attrib['partOfSpeech'] = lemma.pos
     elem = ET.Element('Lemma', attrib=attrib)
-    # TODO: Pronunciation
+    for pron in lemma.pronunciations:
+        elem.append(_build_pronunciation(pron))
     for tag in lemma.tags:
         elem.append(_build_tag(tag))
     return elem
@@ -968,12 +1022,28 @@ def _build_form(form: Form) -> ET.Element:
     if form.script:
         attrib['script'] = form.script
     elem = ET.Element('Form', attrib=attrib)
-    # TODO: Pronunciation
+    for pron in form.pronunciations:
+        elem.append(_build_pronunciation(pron))
     for tag in form.tags:
         elem.append(_build_tag(tag))
     return elem
 
 
+def _build_pronunciation(pron) -> ET.Element:
+    """Build a Pronunciation element from *pron*.
+
+    The variety, notation, and audio attributes are written only when
+    they are non-empty, and phonemic is written (as 'false') only when
+    the flag is false. The element's text is the pronunciation value.
+    """
+    optional = (
+        ('variety', pron.variety),
+        ('notation', pron.notation),
+        # an empty string keeps a true flag out of the output
+        ('phonemic', '' if pron.phonemic else 'false'),
+        ('audio', pron.audio),
+    )
+    attrib = {name: val for name, val in optional if val}
+    node = ET.Element('Pronunciation', attrib=attrib)
+    node.text = pron.value
+    return node
+
+
 def _build_tag(tag: Tag) -> ET.Element:
     elem = ET.Element('Tag', category=tag.category)
     elem.text = tag.text

diff --git a/wn/schema.sql b/wn/schema.sql
@@ -88,6 +88,15 @@ CREATE TABLE forms (
 CREATE INDEX form_entry_index ON forms (entry_rowid);
 CREATE INDEX form_index ON forms (form);
 
+CREATE TABLE pronunciations (
+    form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE,
+    value TEXT,
+    variety TEXT,
+    notation TEXT,
+    phonemic BOOLEAN CHECK( phonemic IN (0, 1) ) DEFAULT 1 NOT NULL,
+    audio TEXT
+);
+CREATE INDEX pronunciation_form_index ON pronunciations (form_rowid);
 
 CREATE TABLE tags (
     form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE,