From c83aa9a7f14d9b12277061c8b8b4bb7db9a982f7 Mon Sep 17 00:00:00 2001 From: Seb35 Date: Mon, 31 Dec 2018 13:31:01 +0100 Subject: [PATCH] In ToSemanticTreeVisitor, create multiple children in the parent node instead of one 'container' child with multiple children MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, when some rule was found multiple times (e.g. quote), ToSemanticTreeVisitor created an untyped container with multiple children, which was likely to cause errors: first, because it was untyped (which is unusual); second, because no DuraLex visitor expects to find such a container with multiple children, but rather expects to find the multiple children directly; and third, because it is ugly. On the other hand, ToSemanticTreeVisitor needs a DuraLex container node when a Parsimonious node contains multiple DuraLex nodes during half-read operations. With this commit, DuraLex container nodes are collected as soon as an upper DuraLex node is able to contain them, and in this case the children of the container node are put directly as children of the DuraLex parent node. In the case where the parent Parsimonious node is still not a DuraLex node, all container nodes and non-container nodes are flattened into a single container node. A (welcome) side effect of this new behaviour is that there is no longer any danger of seeing container-of-containers DuraLex nodes, given that they are always flattened. At the upper level, when a local DuraLex tree is attached to the global DuraLex tree, container nodes are flattened again. With this mechanism, the global DuraLex tree should never have any container nodes; they should only remain in half-read trees in ToSemanticTreeVisitor. To still be able to continue parsing even if container nodes remain, they now have the type 'parsimonious-list-container'. Obviously, if such nodes are found, ToSemanticTreeVisitor will need to be debugged. 
As a result of this operation, the DuraLex trees created by this visitor are a bit different from the previous ones in the case of multiple values. For instance the following amendment: À l’article L100-3 du code de l’énergie est ajouté deux phrases ainsi rédigées : "Phrase 1." "Phrase 2." was previously the following DuraLex tree: type: edit; editType: replace ╠═ type: code-reference; id: code de l'énergie ║ ╚═ type: article-reference; id: L100-3 ╠═ type: sentence-reference ║ ╚═ type: quote; words: Phrase 1. ╚═ type: sentence-definition ╚═ type: quote; words: Phrase 2. and it is now: type: edit; editType: replace ╠═ type: code-reference; id: code de l'énergie ║ ╚═ type: article-reference; id: L100-3 ╚═ type: sentence-reference; count: 2 ╠═ type: quote; words: Phrase 1. ╚═ type: quote; words: Phrase 2. I find this even more meaningful, given that the number of sentences matches the number of quotes. (I know that in some amendments both sentences are grouped in the same quote; I rather think they could be split by a later visitor, to be discussed and evaluated.) From a reuse point of view (in SedLex), both could be easily read (currently SedLex only reads the last one in both cases; I will shortly push a small change so that SedLex always shows all quotes). 
Issue: #9 --- duralex/ToSemanticTreeVisitor.py | 20 ++++++++++++++++---- tests/ParseAlineaDefinitionTest.py | 21 +++------------------ tests/ParseSentenceDefinitionTest.py | 14 ++------------ 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/duralex/ToSemanticTreeVisitor.py b/duralex/ToSemanticTreeVisitor.py index 93febcc..3b57274 100644 --- a/duralex/ToSemanticTreeVisitor.py +++ b/duralex/ToSemanticTreeVisitor.py @@ -60,7 +60,11 @@ def attach(self, dparent, ptree): dtree, properties = self.visit(ptree) dparent.update(properties) if dtree: - push_node(dparent, dtree) + if 'type' in dtree and dtree['type'] == 'parsimonious-list-container': + for dnode in list(dtree['children']): + push_node(dparent, dnode) + else: + push_node(dparent, dtree) return dtree def generic_visit(self, pnode, children): @@ -101,12 +105,16 @@ def generic_visit(self, pnode, children): else: if hierarchical: raise Exception('base item after a hierarchical item: unknown behaviour') - push_node(dnode, dchild) + if 'type' in dchild and dchild['type'] == 'parsimonious-list-container': + for dsubchild in list(dchild['children']): + push_node(dnode, dsubchild) + else: + push_node(dnode, dchild) first = False dproperties = {} elif len(dchildren) > 1: - dnode = create_node(None, {}) + dnode = create_node(None, {'type': 'parsimonious-list-container'}) first = True hierarchical = False for dchild in dchildren: @@ -119,7 +127,11 @@ def generic_visit(self, pnode, children): else: if hierarchical: raise Exception('base item after a hierarchical item: unknown behaviour') - push_node(dnode, dchild) + if 'type' in dchild and dchild['type'] == 'parsimonious-list-container': + for dsubchild in list(dchild['children']): + push_node(dnode, dsubchild) + else: + push_node(dnode, dchild) first = False elif len(dchildren) == 1: diff --git a/tests/ParseAlineaDefinitionTest.py b/tests/ParseAlineaDefinitionTest.py index 8e0e07c..0a4b0b3 100644 --- a/tests/ParseAlineaDefinitionTest.py +++ 
b/tests/ParseAlineaDefinitionTest.py @@ -106,30 +106,15 @@ def test_n_alineas_with_n_quotes(self): { 'type': 'quote', 'words': 'alinéa 1' - } - ], - }, - { - 'type': 'alinea-definition', - 'children': [ + }, { 'type': 'quote', 'words': 'alinéa 2' - } - ], - }, - { - 'type': 'alinea-definition', - 'children': [ + }, { 'type': 'quote', 'words': 'alinéa 3' - } - ], - }, - { - 'type': 'alinea-definition', - 'children': [ + }, { 'type': 'quote', 'words': 'alinéa 4' diff --git a/tests/ParseSentenceDefinitionTest.py b/tests/ParseSentenceDefinitionTest.py index c41c6d9..1ad79a3 100644 --- a/tests/ParseSentenceDefinitionTest.py +++ b/tests/ParseSentenceDefinitionTest.py @@ -40,21 +40,11 @@ def test_three_sentences_with_quotes(self): { 'type': 'quote', 'words': 'phrase 1' - } - ], - 'type': 'sentence-definition' - }, - { - 'children': [ + }, { 'type': 'quote', 'words': 'phrase 2' - } - ], - 'type': 'sentence-definition' - }, - { - 'children': [ + }, { 'type': 'quote', 'words': 'phrase 3'