Skip to content

Commit

Permalink
Convert parse_sentence_def to PEG
Browse files Browse the repository at this point in the history
As was done previously for alineas, I created two rules for sentence-def, one
singular and one plural; time will tell whether this is a good choice. If not,
they can simply be merged into a single rule.

Adapted the associated test.
  • Loading branch information
Seb35 committed Dec 31, 2018
1 parent a4cc2f3 commit 39f2c58
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 43 deletions.
56 changes: 16 additions & 40 deletions duralex/alinea_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ def month_to_number(month):
'article_def': {
'type': TYPE_ARTICLE_DEFINITION,
},
'sentence_def': {
'type': TYPE_SENTENCE_DEFINITION,
},
'sentences_def': {
'type': TYPE_SENTENCE_DEFINITION,
},
'alinea_def': {
'type': TYPE_ALINEA_DEFINITION,
},
Expand Down Expand Up @@ -417,7 +423,7 @@ def parse_definition(tokens, i, parent):
parse_header1_definition,
parse_header2_definition,
parse_header3_definition,
parse_sentence_definition,
#parse_sentence_definition,
parse_word_definition,
parse_title_definition,
parse_subparagraph_definition
Expand All @@ -429,10 +435,16 @@ def parse_definition(tokens, i, parent):

if i == j:
grammar = parsimonious.Grammar("""
rule = whitespaces ( article_def / alinea_def / alineas_def ) whitespaces
rule = whitespaces ( article_def / sentence_def / sentences_def / alinea_def / alineas_def ) whitespaces
# [DuraLex] create node of type "article-definition"
article_def = ( ~"un +"i / ~"l['’] *"i ) ~"article"i ( ( _ article_id ) / ( ~" +additionnel"i ) )? (_ so_that_written)? ( ( before_quote quoted ) / ( before_free_quote free_quoted ) )
article_def = ( ~"un +|l['’] *"i ) ~"article"i ( ( _ article_id ) / ( ~" +additionnel"i ) )? (_ so_that_written)? ( ( before_quote quoted ) / ( before_free_quote free_quoted ) )
# [DuraLex] create node of type "sentence-definition"
sentence_def = ~"(une +|la +)?" ~"phrase"i ( _ so_that_written )? ( ( before_quote quoted ) / ( before_free_quote free_quoted ) )
# [DuraLex] create node of type "sentence-definition"
sentences_def = ~"(les +)?" ( cardinal_adjective_number _ )? ~"phrases"i ( _ so_that_written )? ( ( before_quote quoted ) / ( before_free_quote free_quoted ) )+
# [DuraLex] create node of type "alinea-definition"
alinea_def = ~"(l['’] *|un +)"i ~"alin[ée]a"i (_ so_that_written)? ( ( before_quote quoted ) / ( before_free_quote free_quoted ) )
Expand Down Expand Up @@ -467,7 +479,7 @@ def parse_definition(tokens, i, parent):
free_quoted = ~"[^\\n]+"
# [DuraLex] define property "count"
cardinal_adjective_number = ~"(vingt|trente|quarante|cinquante|soixante|septante|quatre-vingt|huitante|octante|nonante)(-et-un|-deux|-trois|-quatre|-cinq|-six|-sept|-huit|-neuf)?|(soixante|quatre-vingt)(-et-onze|-douze|-treize|-quatorze|-quinze|-seize|-dix-sept|-dix-huit|-dix-neuf)?|zéro|un|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|dix-sept|dix-huit|dix-neuf|quatre-vingt-un|quatre-vingt-onze"i
cardinal_adjective_number = ~"(vingt|trente|quarante|cinquante|soixante|septante|quatre-vingt|huitante|octante|nonante)(-et-un|-deux|-trois|-quatre|-cinq|-six|-sept|-huit|-neuf)?|(soixante|quatre-vingt)(-et-onze|-douze|-treize|-quatorze|-quinze|-seize|-dix-sept|-dix-huit|-dix-neuf)?|zéro|une?|deux|trois|quatre|cinq|six|sept|huit|neuf|dix|onze|douze|treize|quatorze|quinze|seize|dix-sept|dix-huit|dix-neuf|quatre-vingt-un|quatre-vingt-onze"i
multiplicative_adverb = ( multiplicative_adverb_units_before_decades? multiplicative_adverb_decades ) / multiplicative_adverb_units
multiplicative_adverb_units = ~"semel|bis|ter|quater|(quinqu|sex|sept|oct|no[nv])ies"i
Expand All @@ -487,42 +499,6 @@ def parse_definition(tokens, i, parent):

return i

def parse_sentence_definition(tokens, i, parent):
    """Parse a "{count} phrases ..." sentence definition starting at tokens[i].

    The function matches a number word at ``tokens[i]`` with a token starting
    with "phrase" at ``tokens[i + 2]``. If a token starting with "rédigé" is
    found shortly after (as in "ainsi rédigé", "est rédigé" or
    "est ainsi rédigé"), the following tokens are handed to
    ``parse_for_each`` with ``parse_quote`` and a factory that creates
    sentence-definition nodes under ``parent``. Otherwise a single
    sentence-definition node carrying the parsed count is created.

    Args:
        tokens: sequence of string tokens. The fixed offsets (+2, +4)
            presumably skip separator tokens between words -- TODO confirm
            against the tokenizer.
        i: index in ``tokens`` at which parsing starts.
        parent: node under which new nodes are created.

    Returns:
        The index reached after parsing, or the original ``i`` unchanged when
        ``i`` is out of range or the pattern does not match.
    """
    if i >= len(tokens):
        return i

    LOGGER.debug('parse_sentence_definition %s', tokens[i:i+10])
    start = i

    # Pattern: "{count} phrases". The explicit bounds check prevents an
    # IndexError when the number word is one of the last two tokens.
    if not (is_number_word(tokens[i])
            and i + 2 < len(tokens)
            and tokens[i + 2].startswith(u'phrase')):
        LOGGER.debug('parse_sentence_definition none %s', tokens[i:i+10])
        return start

    count = word_to_number(tokens[i])
    i += 4

    # Accepted wordings: "ainsi rédigé", "est rédigé" (keyword two tokens
    # ahead) and "est ainsi rédigé" (keyword four tokens ahead).
    written_near = i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
    written_far = i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé')

    if written_near or written_far:
        # We expect {count} definitions => {count} quotes, but they do not
        # always match, so for now all of the available contents are parsed.
        # FIXME: issue a warning because the expected count doesn't match?
        i += 3 if written_near else 5
        i = parse_for_each(
            parse_quote,
            tokens,
            i,
            lambda: create_node(parent, {'type': TYPE_SENTENCE_DEFINITION, 'children': []})
        )
    else:
        create_node(parent, {'type': TYPE_SENTENCE_DEFINITION, 'count': count})

    LOGGER.debug('parse_sentence_definition end %s', tokens[i:i+10])

    return i

def parse_word_definition(tokens, i, parent):
if i >= len(tokens):
return i
Expand Down
36 changes: 36 additions & 0 deletions duralex/old_alinea_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,42 @@
# Old unitary grammars now merged/unified into a bigger grammar
#

def parse_sentence_definition(tokens, i, parent):
    """Parse a "{count} phrases ..." sentence definition starting at tokens[i].

    The function matches a number word at ``tokens[i]`` with a token starting
    with "phrase" at ``tokens[i + 2]``. If a token starting with "rédigé" is
    found shortly after (as in "ainsi rédigé", "est rédigé" or
    "est ainsi rédigé"), the following tokens are handed to
    ``parse_for_each`` with ``parse_quote`` and a factory that creates
    sentence-definition nodes under ``parent``. Otherwise a single
    sentence-definition node carrying the parsed count is created.

    Args:
        tokens: sequence of string tokens. The fixed offsets (+2, +4)
            presumably skip separator tokens between words -- TODO confirm
            against the tokenizer.
        i: index in ``tokens`` at which parsing starts.
        parent: node under which new nodes are created.

    Returns:
        The index reached after parsing, or the original ``i`` unchanged when
        ``i`` is out of range or the pattern does not match.
    """
    if i >= len(tokens):
        return i

    LOGGER.debug('parse_sentence_definition %s', tokens[i:i+10])
    start = i

    # Pattern: "{count} phrases". The explicit bounds check prevents an
    # IndexError when the number word is one of the last two tokens.
    if not (is_number_word(tokens[i])
            and i + 2 < len(tokens)
            and tokens[i + 2].startswith(u'phrase')):
        LOGGER.debug('parse_sentence_definition none %s', tokens[i:i+10])
        return start

    count = word_to_number(tokens[i])
    i += 4

    # Accepted wordings: "ainsi rédigé", "est rédigé" (keyword two tokens
    # ahead) and "est ainsi rédigé" (keyword four tokens ahead).
    written_near = i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
    written_far = i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé')

    if written_near or written_far:
        # We expect {count} definitions => {count} quotes, but they do not
        # always match, so for now all of the available contents are parsed.
        # FIXME: issue a warning because the expected count doesn't match?
        i += 3 if written_near else 5
        i = parse_for_each(
            parse_quote,
            tokens,
            i,
            lambda: create_node(parent, {'type': TYPE_SENTENCE_DEFINITION, 'children': []})
        )
    else:
        create_node(parent, {'type': TYPE_SENTENCE_DEFINITION, 'count': count})

    LOGGER.debug('parse_sentence_definition end %s', tokens[i:i+10])

    return i

def parse_article_definition(tokens, i, parent):
# Transfered to parse_definition
if i >= len(tokens):
Expand Down
12 changes: 9 additions & 3 deletions tests/ParseSentenceDefinitionTest.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-

from DuralexTestCase import DuralexTestCase
from DuralexTestCase import DuralexTestCase, main

import duralex.alinea_parser as parser

class ParseSentenceDefinitionTest(DuralexTestCase):
def test_one_sentence_with_quotes(self):
self.assertEqualAST(
self.call_parse_func(
parser.parse_sentence_definition,
parser.parse_definition,
("une phrase ainsi rédigée :\n"
"\"phrase 1\"\n")
),
Expand All @@ -28,7 +28,7 @@ def test_one_sentence_with_quotes(self):
def test_three_sentences_with_quotes(self):
self.assertEqualAST(
self.call_parse_func(
parser.parse_sentence_definition,
parser.parse_definition,
("trois phrases ainsi rédigées :\n"
"\"phrase 1\"\n"
"\"phrase 2\"\n"
Expand Down Expand Up @@ -60,7 +60,13 @@ def test_three_sentences_with_quotes(self):
'words': 'phrase 3'
}
],
'count': 3,
'type': 'sentence-definition'
}
]}
)

if __name__ == '__main__':
main()

# vim: set ts=4 sw=4 sts=4 et:

0 comments on commit 39f2c58

Please sign in to comment.