Skip to content

Commit

Permalink
Add partial support for Pronunciation
Browse files Browse the repository at this point in the history
This commit reads Pronunciation elements from LMF files, dumps them
back to LMF, and inserts them into the database. It does not yet make
them available to the user, nor does it export them from the database.

Part of #7, #89
  • Loading branch information
goodmami committed Feb 19, 2021
1 parent efbe271 commit d4b1853
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 20 deletions.
4 changes: 3 additions & 1 deletion tests/data/mini-lmf-1.1.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
</LexicalEntry>

<LexicalEntry id="test-ja-例え-n">
<Lemma partOfSpeech="n" writtenForm="例え" />
<Lemma partOfSpeech="n" writtenForm="例え">
<Pronunciation variety="standard" notation="ipa" audio="tatoe.wav">tatoe</Pronunciation>
</Lemma>
<Form writtenForm="たとえ" script="Hira" />
<Form writtenForm="タトエ" script="Kana" />
<Form writtenForm="tatoe" script="Latn-kunrei" />
Expand Down
50 changes: 38 additions & 12 deletions wn/_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@
WHERE e.id = ?
AND e.lexicon_rowid = ?
'''
# forms don't have reliable ids
#
# Subquery that looks up a form's rowid from the data that identifies it.
# Its four `?` placeholders are bound, in order, to:
#   1. the rowid of the lexicon containing the entry
#   2. the entry's id
#   3. the written form
#   4. the script
# It is meant to be embedded in INSERT statements for tables whose rows
# reference a form (e.g., pronunciations and tags), so parameter tuples
# for those statements must start with these four values.
# NOTE(review): `IS` (rather than `=`) is presumably used for the script
# so that a NULL parameter matches forms whose script is NULL.
FORM_QUERY = '''
SELECT f.rowid
FROM forms AS f
JOIN entries AS e ON f.entry_rowid = e.rowid
WHERE e.lexicon_rowid = ?
AND e.id = ?
AND f.form = ?
AND f.script IS ?
'''
SENSE_QUERY = '''
SELECT s.rowid
FROM senses AS s
Expand Down Expand Up @@ -124,6 +134,7 @@ def _add_lmf(
_insert_synsets(synsets, lexid, cur, progress)
_insert_entries(entries, lexid, cur, progress)
_insert_forms(entries, lexid, lexidmap, cur, progress)
_insert_pronunciations(entries, lexid, lexidmap, cur, progress)
_insert_tags(entries, lexid, lexidmap, cur, progress)
_insert_senses(entries, synsets, lexid, lexidmap, cur, progress)
_insert_adjpositions(entries, lexid, lexidmap, cur, progress)
Expand Down Expand Up @@ -159,7 +170,7 @@ def _sum_counts(info) -> int:
counts = info['counts']
return sum(counts.get(name, 0) for name in
('LexicalEntry', 'ExternalLexicalEntry',
'Lemma', 'Form', 'Tag',
'Lemma', 'Form', 'Pronunciation', 'Tag',
'Sense', 'ExternalSense',
'SenseRelation', 'Example', 'Count',
'SyntacticBehaviour',
Expand Down Expand Up @@ -378,19 +389,34 @@ def _insert_forms(entries, lexid, lexidmap, cur, progress):
progress.update(len(forms))


def _insert_pronunciations(entries, lexid, lexidmap, cur, progress):
    """Insert the pronunciations of lemmas and forms into the database.

    Args:
        entries: lexical entries whose lemma and forms may carry
            pronunciations
        lexid: rowid of the lexicon being added; used for entries not
            listed in *lexidmap*
        lexidmap: mapping of entry ids to the rowid of the lexicon that
            owns them
        cur: database cursor used to execute the inserts
        progress: progress handler; its status is set once and it is
            advanced by the number of rows inserted per batch
    """
    progress.set(status='Pronunciations')
    # The first column (form_rowid) is resolved by FORM_QUERY, which takes
    # the first four parameters of each row; the remaining five fill the
    # other columns positionally.
    query = f'INSERT INTO pronunciations VALUES (({FORM_QUERY}),?,?,?,?,?)'
    for batch in _split(entries):
        prons = []
        for entry in batch:
            eid = entry.id
            lid = lexidmap.get(eid, lexid)
            # The lemma of an external entry is skipped; its other forms
            # are still processed.
            wordforms = [] if entry.external else [entry.lemma]
            wordforms.extend(entry.forms)
            # Lemmas and forms go through the same code path so that both
            # read the pronunciation text from `p.value` (Pronunciation
            # objects have no `text` attribute).
            for wordform in wordforms:
                for p in wordform.pronunciations:
                    prons.append(
                        (lid, eid, wordform.form, wordform.script,
                         p.value, p.variety, p.notation, p.phonemic, p.audio)
                    )
        cur.executemany(query, prons)
        progress.update(len(prons))


def _insert_tags(entries, lexid, lexidmap, cur, progress):
progress.set(status='Word Form Tags')
query = '''
INSERT INTO tags VALUES (
(SELECT f.rowid
FROM forms AS f
JOIN entries AS e ON f.entry_rowid = e.rowid
WHERE e.lexicon_rowid = ?
AND e.id = ?
AND f.form = ?
AND f.script IS ?),
?,?)
'''
query = f'INSERT INTO tags VALUES (({FORM_QUERY}),?,?)'
for batch in _split(entries):
tags = []
for entry in batch:
Expand Down
2 changes: 1 addition & 1 deletion wn/_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# >>> wn._db.schema_hash(conn)
#
COMPATIBLE_SCHEMA_HASHES = {
'be954c77c4c6fc1127d2b7145715615cc9a1ea4d',
'30b13fc2d9282cde065fb770e21c04edf3217500',
}


Expand Down
82 changes: 76 additions & 6 deletions wn/lmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,24 @@ def as_external(
return obj


class Pronunciation:
    """Container for one pronunciation and its descriptive attributes.

    The constructor stores its arguments unchanged on the attributes of
    the same names: ``value``, ``variety``, ``notation``, ``phonemic``
    (defaults to ``True``), and ``audio``.
    """

    __slots__ = ('value', 'variety', 'notation', 'phonemic', 'audio')

    def __init__(
        self,
        value: str,
        variety: str = None,
        notation: str = None,
        phonemic: bool = True,
        audio: str = None,
    ):
        self.value, self.variety, self.notation = value, variety, notation
        self.phonemic, self.audio = phonemic, audio


class Tag:
__slots__ = 'text', 'category'

Expand All @@ -315,21 +333,36 @@ def __init__(self, text: str, category: str):


class Form:
__slots__ = 'form', 'script', 'tags'
__slots__ = 'form', 'script', 'pronunciations', 'tags'

def __init__(self, form: str, script: str, tags: List[Tag] = None):
def __init__(
self,
form: str,
script: str,
pronunciations: List[Pronunciation] = None,
tags: List[Tag] = None,
):
self.form = form
self.script = script
self.pronunciations = pronunciations or []
self.tags = tags or []


class Lemma:
__slots__ = 'form', 'pos', 'script', 'tags'
__slots__ = 'form', 'pos', 'script', 'pronunciations', 'tags'

def __init__(self, form: str, pos: str, script: str = '', tags: List[Tag] = None):
def __init__(
self,
form: str,
pos: str,
script: str = '',
pronunciations: List[Pronunciation] = None,
tags: List[Tag] = None,
):
self.form = form
self.pos = pos
self.script = script
self.pronunciations = pronunciations or []
self.tags = tags or []


Expand Down Expand Up @@ -595,6 +628,7 @@ def _load_lemma(events) -> Lemma:
attrs['writtenForm'],
_get_literal(attrs['partOfSpeech'], PARTS_OF_SPEECH),
script=attrs.get('script'),
pronunciations=_load_pronunciations(events),
tags=_load_tags(events)
)
events.end('Lemma')
Expand All @@ -607,11 +641,30 @@ def _load_forms(events) -> List[Form]:
attrs = next(events)[1].attrib
forms.append(Form(attrs['writtenForm'],
script=attrs.get('script'),
pronunciations=_load_pronunciations(events),
tags=_load_tags(events)))
events.end('Form')
return forms


def _load_pronunciations(events) -> List[Pronunciation]:
    """Read consecutive Pronunciation elements from *events*.

    Returns a list with one Pronunciation per element, in document
    order; the list is empty if the next event does not start a
    Pronunciation element. A missing ``phonemic`` attribute is treated
    as ``'true'``.
    """
    loaded: List[Pronunciation] = []
    while events.starts('Pronunciation'):
        next(events)  # presumably consumes the element's start event
        node = events.end('Pronunciation')
        get_attr = node.attrib.get
        pron = Pronunciation(
            node.text,
            variety=get_attr('variety'),
            notation=get_attr('notation'),
            phonemic=_get_bool(get_attr('phonemic', 'true')),
            audio=get_attr('audio'),
        )
        loaded.append(pron)
    return loaded


def _load_tags(events) -> List[Tag]:
tags: List[Tag] = []
while events.starts('Tag'):
Expand Down Expand Up @@ -957,7 +1010,8 @@ def _build_lemma(lemma: Lemma) -> ET.Element:
attrib['script'] = lemma.script
attrib['partOfSpeech'] = lemma.pos
elem = ET.Element('Lemma', attrib=attrib)
# TODO: Pronunciation
for pron in lemma.pronunciations:
elem.append(_build_pronunciation(pron))
for tag in lemma.tags:
elem.append(_build_tag(tag))
return elem
Expand All @@ -968,12 +1022,28 @@ def _build_form(form: Form) -> ET.Element:
if form.script:
attrib['script'] = form.script
elem = ET.Element('Form', attrib=attrib)
# TODO: Pronunciation
for pron in form.pronunciations:
elem.append(_build_pronunciation(pron))
for tag in form.tags:
elem.append(_build_tag(tag))
return elem


def _build_pronunciation(pron) -> ET.Element:
attrib = {}
if pron.variety:
attrib['variety'] = pron.variety
if pron.notation:
attrib['notation'] = pron.notation
if not pron.phonemic:
attrib['phonemic'] = 'false'
if pron.audio:
attrib['audio'] = pron.audio
elem = ET.Element('Pronunciation', attrib=attrib)
elem.text = pron.value
return elem


def _build_tag(tag: Tag) -> ET.Element:
elem = ET.Element('Tag', category=tag.category)
elem.text = tag.text
Expand Down
9 changes: 9 additions & 0 deletions wn/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ CREATE TABLE forms (
CREATE INDEX form_entry_index ON forms (entry_rowid);
CREATE INDEX form_index ON forms (form);

-- Pronunciations of word forms. Each row references the form it belongs
-- to and is deleted along with that form (ON DELETE CASCADE).
--   form_rowid  the form this pronunciation is attached to
--   value       the pronunciation text
--   variety, notation, audio   optional descriptive attributes
--   phonemic    boolean stored as 0 or 1; defaults to 1
-- NOTE: rows are inserted positionally (INSERT ... VALUES without a
-- column list), so the column order here must match the insert code.
-- NOTE(review): these comments are deliberately kept outside of the
-- CREATE TABLE statement in case the schema hash covers its text.
CREATE TABLE pronunciations (
form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE,
value TEXT,
variety TEXT,
notation TEXT,
phonemic BOOLEAN CHECK( phonemic IN (0, 1) ) DEFAULT 1 NOT NULL,
audio TEXT
);
CREATE INDEX pronunciation_form_index ON pronunciations (form_rowid);

CREATE TABLE tags (
form_rowid INTEGER NOT NULL REFERENCES forms (rowid) ON DELETE CASCADE,
Expand Down

0 comments on commit d4b1853

Please sign in to comment.