From da0ca44f91e6b16a49a16df9893c6c344131d2c7 Mon Sep 17 00:00:00 2001 From: CM Lubinski Date: Thu, 1 Mar 2018 18:12:36 -0500 Subject: [PATCH 1/5] Add navigation and append_to methods to DocCursor. It'll be helpful to poke around the document tree as we make document migrations. These changes allow us to 1) find the left and right siblings of a node (i.e. nodes which share the same parent, but come before or after the current node) 2) move a subtree to a different part of the document 3) create a new child at a specific offset within its parent --- api/document/tests/tree_test.py | 73 +++++++++++++++++++++++++++++++ api/document/tree.py | 76 +++++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 3 deletions(-) diff --git a/api/document/tests/tree_test.py b/api/document/tests/tree_test.py index 57935c31f..6294292d6 100644 --- a/api/document/tests/tree_test.py +++ b/api/document/tests/tree_test.py @@ -238,3 +238,76 @@ def test_jump_to(): assert root.jump_to('root_1').identifier == 'root_1' assert para_a.jump_to('root_1').identifier == 'root_1' assert para_a.jump_to('root_1__sec_1').identifier == 'root_1__sec_1' + + +def test_add_child_insert_pos(): + root = tree.DocCursor.new_tree('root') + root.add_child('sec', '1') + root.add_child('sec', '2') + root.add_child('sec', '3') + + assert [n.identifier for n in root.walk()] == [ + 'root_1', 'root_1__sec_1', 'root_1__sec_2', 'root_1__sec_3'] + + root.add_child('sec', '4', insert_pos=1) + assert [n.identifier for n in root.walk()] == [ + 'root_1', 'root_1__sec_1', 'root_1__sec_4', 'root_1__sec_2', + 'root_1__sec_3', + ] + + +def test_siblings(): + root = tree.DocCursor.new_tree('root') + s1 = root.add_child('sec', '1') + s2 = root.add_child('sec', '2') + s3 = root.add_child('sec', '3') + s4 = root.add_child('sec', '4') + s1.add_child('para') + s3.add_child('para') + + assert [s.identifier for s in s3.left_siblings()] == [ + s1.identifier, s2.identifier] + assert [s.identifier for s in s3.right_siblings()] == [s4.identifier] + assert s1.left_sibling() is None + assert s1.right_sibling().identifier == s2.identifier + assert s2.left_sibling().identifier == s1.identifier + + +def test_append_to(): + root = tree.DocCursor.new_tree('root') + s1 = root.add_child('sec') + s1.add_child('para') + p12 = s1.add_child('para') + p12.add_child('math') + s2 = root.add_child('sec') + s2.add_child('para') + list_el = s2.add_child('list') + list_el.add_child('listitem') + list_el.add_child('listitem') + + assert [n.identifier for n in root.walk()] == [ + 'root_1', + 'root_1__sec_1', + 'root_1__sec_1__para_1', + 'root_1__sec_1__para_2', + 'root_1__sec_1__para_2__math_1', + 'root_1__sec_2', + 'root_1__sec_2__para_1', + 'root_1__sec_2__list_1', + 'root_1__sec_2__list_1__listitem_1', + 'root_1__sec_2__list_1__listitem_2', + ] + + list_el.append_to(p12) + assert [n.identifier for n in root.walk()] == [ + 'root_1', + 'root_1__sec_1', + 'root_1__sec_1__para_1', + 'root_1__sec_1__para_2', + 'root_1__sec_1__para_2__math_1', + 'root_1__sec_1__para_2__list_1', + 'root_1__sec_1__para_2__list_1__listitem_1', + 'root_1__sec_1__para_2__list_1__listitem_2', + 'root_1__sec_2', + 'root_1__sec_2__para_1', + ] diff --git a/api/document/tree.py b/api/document/tree.py index 89c3ada9a..c462a273d 100644 --- a/api/document/tree.py +++ b/api/document/tree.py @@ -3,6 +3,7 @@ from networkx import DiGraph from networkx.algorithms.dag import descendants +from networkx.relabel import relabel_nodes from document.models import DocNode @@ -126,7 +127,7 @@ def next_emblem(self, 
node_type: str) -> str: return str(child_type_counts[node_type] + 1) def add_child(self: T, node_type: str, type_emblem: Optional[str] = None, - **attrs) -> T: + insert_pos: Optional[int] = None, **attrs) -> T: if type_emblem is None: type_emblem = self.next_emblem(node_type) if 'policy' not in attrs: @@ -138,8 +139,15 @@ def add_child(self: T, node_type: str, type_emblem: Optional[str] = None, type_emblem=type_emblem, depth=self.depth + 1, **attrs )) - self.tree.add_edge(self.identifier, identifier, - sort_order=self.next_sort_order()) + if insert_pos is None: + insert_pos = self.next_sort_order() + else: + # shift children if necessary + edges = self.tree.out_edges(self.identifier, data='sort_order') + for _, child_idx, sort_order in edges: + if sort_order >= insert_pos: + self.tree[self.identifier][child_idx]['sort_order'] += 1 + self.tree.add_edge(self.identifier, identifier, sort_order=insert_pos) return self.__class__(self.tree, identifier=identifier) def subtree_size(self) -> int: @@ -219,6 +227,18 @@ def parent(self: T) -> Optional[T]: return type(self)(self.tree, predecessors[0]) return None + def left_sibling(self: T) -> Optional[T]: + siblings = list(self.left_siblings()) + if siblings: + return siblings[-1] + return None + + def right_sibling(self: T) -> Optional[T]: + siblings = list(self.right_siblings()) + if siblings: + return siblings[0] + return None + def ancestors(self: T, filter_fn: Callable[[T], bool]=None) \ -> Iterator[T]: parent = self.parent() @@ -227,6 +247,26 @@ def ancestors(self: T, filter_fn: Callable[[T], bool]=None) \ if parent: yield from parent.ancestors(filter_fn) + def left_siblings(self: T) -> Iterator[T]: + """What's before this node, sharing the same parent.""" + parent = self.parent() + if parent: + for sibling in parent.children(): + if sibling.identifier == self.identifier: + break + yield sibling + + def right_siblings(self: T) -> Iterator[T]: + """What's after this node, sharing the same parent.""" + parent = self.parent() + seen_self = False + if parent: + for sibling in parent.children(): + if seen_self: + yield sibling + if sibling.identifier == self.identifier: + seen_self = True + def add_models(self: T, models: Iterator[DocNode]) -> T: """Convert a (linear) list of DocNodes into a tree-aware version. We assume that the models are already sorted.""" @@ -257,6 +297,36 @@ def filter(self: T, filter_fn: Callable[[DocNode], bool]) -> Iterator[T]: def jump_to(self: T, identifier: str) -> T: return type(self)(self.tree, identifier) + def append_to(self: T, new_parent: T) -> T: + """Move this node (and its subtree) to be the last child of the + new_parent. 
This requires modifying all of the subtree's + identifiers.""" + prev_parent = self.parent() + if not prev_parent: + raise AssertionError("Can't move a node with no parent") + # remove from previous position + self.tree.remove_edge(prev_parent.identifier, self.identifier) + # add to new position + old_prefix = self.identifier + self.model.type_emblem = new_parent.next_emblem(self.node_type) + new_prefix = ( + new_parent.identifier + + '__' + self.node_type + + '_' + self.type_emblem + ) + self.tree.add_edge(new_parent.identifier, self.identifier, + sort_order=new_parent.next_sort_order()) + + # relabel + label_mapping = {} + for node in self.walk(): + old_label = node.identifier + node.model.identifier = new_prefix + old_label[len(old_prefix):] + label_mapping[old_label] = node.model.identifier + relabel_nodes(self.tree, label_mapping, copy=False) + + return self.jump_to(label_mapping[self.identifier]) + class XMLAwareCursor(DocCursor): """Extension of DocCursor which also tracks the XML which created each From b63591fbc8033f1caa7dd637485853d953503ec3 Mon Sep 17 00:00:00 2001 From: CM Lubinski Date: Thu, 1 Mar 2018 18:34:39 -0500 Subject: [PATCH 2/5] Add two initial document migrations. The PDF parser will sometimes generate documents which aren't valid in our ProseMirror schema. This adds a command to migrate those documents to a format which _is_ valid. Ideally, this would include modifications to the PDF parser to prevent these documents from being created, but we're pressed for time. Also, we're able to cut some corners by ignoring potential changes to inline annotations. As we're only modifying existing docnodes (or adding new ones), we'll never break an FK. That won't be true of future doc migrations. --- .../management/commands/migrate_documents.py | 71 +++++++++++++++++++ ...t_management_commands_migrate_documents.py | 41 +++++++++++ 2 files changed, 112 insertions(+) create mode 100644 api/ombpdf/management/commands/migrate_documents.py create mode 100644 api/ombpdf/tests/test_management_commands_migrate_documents.py diff --git a/api/ombpdf/management/commands/migrate_documents.py b/api/ombpdf/management/commands/migrate_documents.py new file mode 100644 index 000000000..93d4f40ea --- /dev/null +++ b/api/ombpdf/management/commands/migrate_documents.py @@ -0,0 +1,71 @@ +import logging + +from django.core.management.base import BaseCommand +from django.db import transaction +from tqdm import tqdm + +from document.models import DocNode +from document.tree import DocCursor + +logger = logging.getLogger(__name__) + + +def ensure_listitem_in_list(doc: DocCursor) -> DocCursor: + """We expect all listitems to have a parent list. However, the parser has + emitted lists followed by bare listitems. We'll place the listitem in the + list that imemdiately precedes it.""" + for li in doc.filter(lambda n: n.node_type == 'listitem'): + parent = li.parent() + prev_sibling = li.left_sibling() + + if not parent: + logger.warning('Root of %s is an li.', + doc.policy.title_with_number) + continue + if parent.node_type == 'list': # all is well + continue + + if prev_sibling and prev_sibling.node_type == 'list': + li.append_to(prev_sibling) + return ensure_listitem_in_list(doc) + # else: create new list to wrap this one + logger.warning('Could not fix li in %s', doc.policy.title_with_number) + + return doc # no changes needed + + +def ensure_section_has_heading(doc: DocCursor) -> DocCursor: + """We expect all sections to have a heading. 
Fill the missing data with + placeholder text.""" + # materialize so we don't need to worry about a modified iterator + secs = list(doc.filter(lambda n: n.node_type == 'sec')) + for sec in secs: + children = list(sec.children()) + if not children or children[0].node_type != 'heading': + sec.add_child('heading', insert_pos=0, text='--Missing Heading--', + policy_id=doc.policy_id) + return doc + + +transforms = [ + ensure_listitem_in_list, + ensure_section_has_heading, +] + + +class Command(BaseCommand): + help = ( # noqa (overriding a builtin) + "Run through (idempotent) document migrations to mass-fixup docs.") + + def handle(self, *args, **kwargs): + roots = DocNode.objects.filter(depth=0) + with tqdm(total=roots.count()) as pbar: + for root_docnode in roots: + with transaction.atomic(): + doc = DocCursor.load_from_model(root_docnode) + for transform in transforms: + doc = transform(doc) + doc.nested_set_renumber(bulk_create=False) + for node in doc.walk(): + node.save() + pbar.update(1) diff --git a/api/ombpdf/tests/test_management_commands_migrate_documents.py b/api/ombpdf/tests/test_management_commands_migrate_documents.py new file mode 100644 index 000000000..cf0498443 --- /dev/null +++ b/api/ombpdf/tests/test_management_commands_migrate_documents.py @@ -0,0 +1,41 @@ +from document.tree import DocCursor +from ombpdf.management.commands import migrate_documents + + +def test_ensure_listitem_in_list(): + root = DocCursor.new_tree('root') + list_el = root.add_child('list') + list_el.add_child('listitem', '1') + root.add_child('listitem', '2') + root.add_child('listitem', '3') + assert [n.identifier for n in root.walk()] == [ + 'root_1', + 'root_1__list_1', + 'root_1__list_1__listitem_1', + 'root_1__listitem_2', + 'root_1__listitem_3', + ] + + root = migrate_documents.ensure_listitem_in_list(root) + assert [n.identifier for n in root.walk()] == [ + 'root_1', + 'root_1__list_1', + 'root_1__list_1__listitem_1', + 'root_1__list_1__listitem_2', + 'root_1__list_1__listitem_3', + ] + + +def test_ensure_section_has_heading(): + root = DocCursor.new_tree('root') + sec1 = root.add_child('sec') + sec1.add_child('sec') + sec12 = sec1.add_child('sec') + sec12.add_child('heading', text='Subheading') + root.add_child('sec') + + root = migrate_documents.ensure_section_has_heading(root) + assert root['sec_1']['heading_1'].text == '--Missing Heading--' + assert root['sec_1']['sec_1']['heading_1'].text == '--Missing Heading--' + assert root['sec_1']['sec_2']['heading_1'].text == 'Subheading' + assert root['sec_2']['heading_1'].text == '--Missing Heading--' From 3ba4865aa9ac7f01d6fc6bc10fbf43df26600c7b Mon Sep 17 00:00:00 2001 From: CM Lubinski Date: Thu, 1 Mar 2018 19:01:18 -0500 Subject: [PATCH 3/5] Always trim title before it's sent. The text of the heading can be too long. 
--- api/document/js/__tests__/serialize-doc.test.ts | 9 +++++++++ api/document/js/serialize-doc.ts | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/api/document/js/__tests__/serialize-doc.test.ts b/api/document/js/__tests__/serialize-doc.test.ts index 47af75540..58b0c2a50 100644 --- a/api/document/js/__tests__/serialize-doc.test.ts +++ b/api/document/js/__tests__/serialize-doc.test.ts @@ -1,4 +1,5 @@ import { Fragment } from 'prosemirror-model'; +import * as repeatString from 'repeat-string'; import serializeDoc, { apiFactory, convertTexts } from '../serialize-doc'; import schema, { factory } from '../schema'; @@ -53,6 +54,14 @@ describe('serializeDoc()', () => { expect(result.title).toEqual('Stufff'); }); + it('trims heading text to for the section title', () => { + const node = factory.sec([ + factory.heading(repeatString('1', 555), 1), + ]); + const result = serializeDoc(node); + expect(result.title).toEqual(repeatString('1', 128)); + }); + it('converts footnotes', () => { const node = schema.nodes.para.create({}, [ schema.nodes.paraText.create({}, [ diff --git a/api/document/js/serialize-doc.ts b/api/document/js/serialize-doc.ts index e2d699438..6a6c9b08b 100644 --- a/api/document/js/serialize-doc.ts +++ b/api/document/js/serialize-doc.ts @@ -51,9 +51,9 @@ const NODE_CONVERTERS: NodeConverterMap = { }, sec(node) { const headerNode = node.content.child(0); - const title = headerNode.textContent; + const title = headerNode.textContent || ''; const sec = defaultNodeConverter(node); - sec.title = title; + sec.title = title.substr(0, 128); // trim return sec; }, From 742e1aeffe8e0d84daf5b6c41833799c481af897 Mon Sep 17 00:00:00 2001 From: CM Lubinski Date: Thu, 1 Mar 2018 19:01:42 -0500 Subject: [PATCH 4/5] Allow sections with only a heading. This comes up frequently in the imported docs. --- api/document/js/schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/document/js/schema.ts b/api/document/js/schema.ts index 84c3b743a..727f42c44 100644 --- a/api/document/js/schema.ts +++ b/api/document/js/schema.ts @@ -64,7 +64,7 @@ const schema = new Schema({ toDOM: () => ['div', { class: 'node-paragraph' }, 0], }, sec: { - content: 'heading block+', + content: 'heading block*', group: 'block', toDOM: () => ['section', { class: 'node-section' }, 0], }, From df3768a26bd49c1bfbf35221517db949f2fb6d5d Mon Sep 17 00:00:00 2001 From: CM Lubinski Date: Thu, 1 Mar 2018 19:11:15 -0500 Subject: [PATCH 5/5] Set imported docs to be marked as such. --- .../migrations/0016_auto_20180302_0007.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 api/reqs/migrations/0016_auto_20180302_0007.py diff --git a/api/reqs/migrations/0016_auto_20180302_0007.py b/api/reqs/migrations/0016_auto_20180302_0007.py new file mode 100644 index 000000000..13df9c245 --- /dev/null +++ b/api/reqs/migrations/0016_auto_20180302_0007.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.5 on 2018-03-02 00:07 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('reqs', '0015_remove_policy_public'), + ] + + operations = [ + migrations.RunSQL(""" + UPDATE reqs_policy + SET workflow_phase='cleanup' + WHERE workflow_phase='no_doc' + AND id IN (SELECT policy_id from document_docnode) + """) + ]
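
Not part of the patches above — a minimal usage sketch of the cursor navigation introduced in PATCH 1/5, assuming the DocCursor API exactly as added there (left_sibling(), append_to(), and add_child(..., insert_pos=...)); the node types and text below are made up for illustration, mirroring the tests:

```python
# Illustrative only: exercises the navigation added in PATCH 1/5 by
# reparenting a stray listitem under the list that precedes it -- the same
# shape of fix that ensure_listitem_in_list() in PATCH 2/5 performs.
from document.tree import DocCursor

root = DocCursor.new_tree('root')
list_el = root.add_child('list')
list_el.add_child('listitem')
stray = root.add_child('listitem')          # bare listitem, outside the list

assert stray.left_sibling().node_type == 'list'
stray = stray.append_to(stray.left_sibling())   # move the subtree; returns the relabeled cursor

# create a new child at a specific offset within its parent
root.add_child('heading', insert_pos=0, text='--Missing Heading--')

print([n.identifier for n in root.walk()])
# expected to print something like:
# ['root_1', 'root_1__heading_1', 'root_1__list_1',
#  'root_1__list_1__listitem_1', 'root_1__list_1__listitem_2']
```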