Merge pull request #2291 from laws-africa/accented-terms

Sentence caser / accented terms
laws-africa · Nov 20, 2024 · f407cd2 · f407cd2
2 parents d88494d + 3fb45b8
commit f407cd2
Show file tree

Hide file tree

Showing 12 changed files with 335 additions and 1 deletion.
diff --git a/indigo/analysis/__init__.py b/indigo/analysis/__init__.py
@@ -3,3 +3,4 @@
 import indigo.analysis.refs  # noqa
 import indigo.analysis.work_detail  # noqa
 import indigo.analysis.italics_terms  # noqa
+import indigo.analysis.sentence_caser  # noqa
diff --git a/indigo/analysis/sentence_caser.py b/indigo/analysis/sentence_caser.py
@@ -0,0 +1,53 @@
+import re
+
+import unicodedata
+from lxml import etree
+
+from indigo.plugins import LocaleBasedMatcher, plugins
+
+
+@plugins.register('sentence-caser')
+class BaseSentenceCaser(LocaleBasedMatcher):
+    """ Sentence cases headings in a document.
+
+    The text of every heading is lowercased, known accented terms are put back in their
+    canonical (accented / capitalised) form, and the first piece of text in the heading
+    gets a capital letter. Elements nested inside authorial notes are left untouched.
+    """
+    # canonical terms, e.g. 'Tâx'; may be set directly (e.g. by tests), otherwise loaded
+    # from the accented terms of the document's language
+    terms = None
+    # accent-stripped, lowercased versions of `terms`; index-aligned with `terms`
+    normalized_terms = None
+    # whether the heading currently being processed has had its first letter capitalised
+    capitalized = False
+
+    def sentence_case_headings_in_document(self, document):
+        """ Sentence case all the headings in `document`, replacing `document.content`
+        with the adjusted XML.
+        """
+        # allow tests to specify self.terms
+        # NOTE(review): terms loaded here stay on the instance; if a caser instance is ever
+        # reused for a document in another language, the first language's terms are kept -- confirm
+        # that the plugin registry hands out a fresh instance per call.
+        if not self.terms:
+            accented_terms = document.language.accented_terms.first()
+            # AccentedTerms.terms is nullable, so guard against None as well as a missing row
+            self.terms = (accented_terms.terms if accented_terms else None) or []
+        # Longest terms first, so that longer terms are applied before shorter ones.
+        # sorted() builds a new list rather than re-ordering the list owned by the AccentedTerms
+        # object; blank terms (the admin form accepts empty lines) are dropped because they
+        # can never match anything useful.
+        self.terms = sorted((t for t in self.terms if t), key=len, reverse=True)
+        self.normalized_terms = [self._strip_accents(t).lower() for t in self.terms]
+        root = etree.fromstring(document.content.encode('utf-8'))
+        nsmap = {'a': root.nsmap[None]}
+        for heading in root.xpath('//a:heading', namespaces=nsmap):
+            # each heading gets its own initial capital
+            self.capitalized = False
+            # elements nested inside a footnote aren't part of the heading's own text
+            skip_elements = heading.xpath(".//a:*[ancestor::a:authorialNote]", namespaces=nsmap)
+            for elem in heading.iter():
+                if elem in skip_elements:
+                    continue
+                elem.text = self.adjust_heading_text(elem.text)
+                elem.tail = self.adjust_heading_text(elem.tail)
+
+        document.content = etree.tostring(root, encoding='unicode')
+
+    @staticmethod
+    def _strip_accents(text):
+        """ Return `text` with combining marks (Unicode category Mn) removed after NFD decomposition.
+        """
+        return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
+
+    def adjust_heading_text(self, text):
+        """ Lowercase `text`, re-apply the accented terms, and capitalise the first character if
+        nothing in the current heading has been capitalised yet.
+
+        `None` and whitespace-only text is returned unchanged and doesn't count as the start
+        of the heading.
+        """
+        # text may be None or ' ', for example -- ignore in those cases
+        if text and text.strip():
+            text = self.apply_terms(text.lower())
+            if not self.capitalized:
+                # don't use capitalize() on the whole of `text`, as this interferes with capitalised terms
+                # either way, lstrip if capitalizing here since the first letter would be missed otherwise
+                stripped = text.lstrip()
+                text = stripped[0].upper() + stripped[1:]
+                self.capitalized = True
+        return text
+
+    def apply_terms(self, text):
+        """ Replace whole-word occurrences of each normalized term in (already lowercased) `text`
+        with the matching canonical term.
+        """
+        # save a tiny bit of time by checking for any matches first
+        if any(t in text for t in self.normalized_terms):
+            for normalized, term in zip(self.normalized_terms, self.terms):
+                # escape the term so that regex metacharacters in it are matched literally, and use a
+                # function as the replacement so backslashes in the canonical term aren't interpreted
+                text = re.sub(rf'\b{re.escape(normalized)}\b', lambda _match: term, text)
+        return text
diff --git a/indigo/tests/test_sentence_caser.py b/indigo/tests/test_sentence_caser.py
@@ -0,0 +1,165 @@
+from django.test import TestCase
+from lxml import etree
+
+from indigo.analysis.sentence_caser import BaseSentenceCaser
+from indigo_api.models import Document, Work
+from indigo_api.tests.fixtures import document_fixture
+
+
+class SentenceCaserTestCase(TestCase):
+    """ Checks that BaseSentenceCaser sentence cases headings and restores accented terms,
+    including headings whose text is split over inline markup and footnotes.
+    """
+    fixtures = ['languages_data', 'countries']
+
+    def setUp(self):
+        self.work = Work(frbr_uri='/za/act/1991/1')
+        self.sentence_caser = BaseSentenceCaser()
+        # set the terms directly; the caser only loads them from the document's language
+        # when none have been specified
+        self.sentence_caser.terms = ['Tâx', 'táxation', 'in the Höme']
+        self.maxDiff = None
+
+    def test_sentence_case_headings_in_document(self):
+        # input: upper-case headings, some with their text split over <i>/<b>/<sup> and footnotes
+        document = Document(
+            work=self.work,
+            language_id=1,
+            document_xml=document_fixture(
+                xml="""
+        <section eId="sec_0">
+          <num>0</num>
+          <heading><i> TAXATION </i>IN<sup><authorialNote marker="1" placement="bottom" eId="sec_0__authorialNote_1"><p eId="sec_0__authorialNote_1__p_1">OR ON</p></authorialNote></sup> THE HOME<sup><authorialNote marker="1A" placement="bottom" eId="sec_0__authorialNote_2"><p eId="sec_0__authorialNote_2__p_1">FN</p></authorialNote></sup></heading>
+          <content>
+            <p eId="sec_0__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_1">
+          <num>1</num>
+          <heading><i>HELLO</i> NO TAXATION <i>WITHOUT REPRESENTATION</i> <b>(OTHER STUFF)</b></heading>
+          <content>
+            <p eId="sec_1__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_1A">
+          <num>1A</num>
+          <heading><i> </i>HELLO NO TAXATION <i>WITHOUT REPRESENTATION</i> <b>(OTHER STUFF)</b></heading>
+          <content>
+            <p eId="sec_1A__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_2">
+          <num>2</num>
+          <heading>TAXATION AND TAX, AND TAXONOMIES TOO</heading>
+          <content>
+            <p eId="sec_2__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_2A">
+          <num>2A</num>
+          <heading>TAXONOMIES ALL ALONE</heading>
+          <content>
+            <p eId="sec_2A__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_3">
+          <num>3</num>
+          <heading>TAXATION <i>IN THE HOME</i></heading>
+          <content>
+            <p eId="sec_3__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_4">
+          <num>4</num>
+          <heading><i> TAXATION </i>IN THE HOME<sup><authorialNote marker="1" placement="bottom" eId="sec_4__authorialNote_1"><p eId="sec_4__authorialNote_1__p_1">FN</p></authorialNote></sup></heading>
+          <content>
+            <p eId="sec_4__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_5">
+          <num>5</num>
+          <heading>DOUBLE <b>TAXATION</b></heading>
+          <content>
+            <p eId="sec_5__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_6">
+          <num>6</num>
+          <heading>SINGLE <i>TAXATION</i></heading>
+          <content>
+            <p eId="sec_6__p_1">Text</p>
+          </content>
+        </section>
+                """
+            )
+        )
+
+        # expected output; things worth noting:
+        # - sec_0: 'in' and 'the home' are separated by a footnote, so 'in the Höme' is not applied,
+        #   and the footnote text ('OR ON', 'FN') is left exactly as it was
+        # - sec_0 / sec_4: leading whitespace is dropped from the text that receives the capital letter
+        # - sec_2 / sec_2A: only whole words match, so 'taxonomies' doesn't pick up the 'Tâx' term
+        # - sec_5 / sec_6: a term inside inline markup is accented, but isn't capitalised since it
+        #   isn't the first text in the heading
+        expected = Document(
+            work=self.work,
+            document_xml=document_fixture(
+                xml="""
+        <section eId="sec_0">
+          <num>0</num>
+          <heading><i>Táxation </i>in<sup><authorialNote marker="1" placement="bottom" eId="sec_0__authorialNote_1"><p eId="sec_0__authorialNote_1__p_1">OR ON</p></authorialNote></sup> the home<sup><authorialNote marker="1A" placement="bottom" eId="sec_0__authorialNote_2"><p eId="sec_0__authorialNote_2__p_1">FN</p></authorialNote></sup></heading>
+          <content>
+            <p eId="sec_0__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_1">
+          <num>1</num>
+          <heading><i>Hello</i> no táxation <i>without representation</i> <b>(other stuff)</b></heading>
+          <content>
+            <p eId="sec_1__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_1A">
+          <num>1A</num>
+          <heading><i> </i>Hello no táxation <i>without representation</i> <b>(other stuff)</b></heading>
+          <content>
+            <p eId="sec_1A__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_2">
+          <num>2</num>
+          <heading>Táxation and Tâx, and taxonomies too</heading>
+          <content>
+            <p eId="sec_2__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_2A">
+          <num>2A</num>
+          <heading>Taxonomies all alone</heading>
+          <content>
+            <p eId="sec_2A__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_3">
+          <num>3</num>
+          <heading>Táxation <i>in the Höme</i></heading>
+          <content>
+            <p eId="sec_3__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_4">
+          <num>4</num>
+          <heading><i>Táxation </i>in the Höme<sup><authorialNote marker="1" placement="bottom" eId="sec_4__authorialNote_1"><p eId="sec_4__authorialNote_1__p_1">FN</p></authorialNote></sup></heading>
+          <content>
+            <p eId="sec_4__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_5">
+          <num>5</num>
+          <heading>Double <b>táxation</b></heading>
+          <content>
+            <p eId="sec_5__p_1">Text</p>
+          </content>
+        </section>
+        <section eId="sec_6">
+          <num>6</num>
+          <heading>Single <i>táxation</i></heading>
+          <content>
+            <p eId="sec_6__p_1">Text</p>
+          </content>
+        </section>
+                """
+            )
+        )
+
+        self.sentence_caser.sentence_case_headings_in_document(document)
+        # round-trip the expected XML through lxml too, so that both sides are serialised the same way
+        root = etree.fromstring(expected.content.encode('utf-8'))
+        expected.content = etree.tostring(root, encoding='utf-8').decode('utf-8')
+        self.assertEqual(expected.content, document.content)
diff --git a/indigo_api/admin.py b/indigo_api/admin.py
@@ -2,6 +2,7 @@
 
 from django import forms
 from django.contrib import admin
+from django.contrib.postgres.forms import SimpleArrayField
 from django.shortcuts import reverse
 from django.utils.timezone import now
 from django.utils.translation import gettext_lazy as _
@@ -11,7 +12,7 @@
 from treebeard.forms import MoveNodeForm, movenodeform_factory
 from background_task.admin import TaskAdmin
 
-from .models import Document, Subtype, Colophon, Work, TaskLabel, TaxonomyTopic, CitationAlias, SavedSearch
+from .models import Document, Subtype, Colophon, Work, TaskLabel, TaxonomyTopic, CitationAlias, SavedSearch, AccentedTerms
 
 
 admin.site.register(Subtype)
@@ -97,6 +98,19 @@ class CitationAliasAdmin(admin.ModelAdmin):
     list_filter = ('place',)
 
 
+class AccentedTermsForm(forms.ModelForm):
+    """ Admin form for accented terms: the array of terms is edited in a textarea and split
+    on newlines.
+    """
+    terms = SimpleArrayField(
+        base_field=forms.CharField(max_length=1024, required=False),
+        delimiter='\n',
+        required=False,
+        widget=forms.Textarea,
+    )
+
+    class Meta:
+        model = AccentedTerms
+        fields = "__all__"
+
+
+@admin.register(AccentedTerms)
+class AccentedTermsAdmin(admin.ModelAdmin):
+    """ Admin for AccentedTerms, using AccentedTermsForm so that terms are edited in a textarea.
+    """
+    form = AccentedTermsForm
+
+
 def run_now(modeladmin, request, queryset):
     queryset.update(run_at=now())
     messages.success(request, _("Updated run time to now for selected tasks."))

diff --git a/indigo_api/migrations/0049_accentedterms.py b/indigo_api/migrations/0049_accentedterms.py
@@ -0,0 +1,27 @@
+# Generated by Django 4.2.15 on 2024-11-18 13:09
+
+import django.contrib.postgres.fields
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    # Creates the AccentedTerms model: an optional array of terms linked to a language.
+
+    dependencies = [
+        ('indigo_api', '0048_amendment_verb'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='AccentedTerms',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                # nullable array of terms, each up to 1024 characters
+                ('terms', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=1024, verbose_name='terms'), blank=True, null=True, size=None)),
+                # unique foreign key: at most one AccentedTerms row per language
+                ('language', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='accented_terms', to='indigo_api.language', unique=True, verbose_name='language')),
+            ],
+            options={
+                'verbose_name': 'accented terms',
+                'verbose_name_plural': 'accented terms',
+            },
+        ),
+    ]
diff --git a/indigo_api/models/places.py b/indigo_api/models/places.py
@@ -262,3 +262,18 @@ def work_properties(self):
             props['cap'] = "Chapter (Cap.)"
 
         return props
+
+
+class AccentedTerms(models.Model):
+    """ Accented terms for a language.
+
+    Read by the sentence caser (indigo.analysis.sentence_caser), which uses the terms as the
+    canonical spellings to restore after lowercasing a heading.
+    """
+    # unique=True means at most one row per language, but the reverse accessor
+    # (language.accented_terms) is still a manager, which is why callers use .first().
+    # NOTE(review): Django's system checks suggest OneToOneField for a unique ForeignKey; switching
+    # would change the reverse accessor, so callers would need updating too.
+    language = models.ForeignKey(Language, related_name='accented_terms', null=False, blank=False, unique=True,
+                                 on_delete=models.CASCADE, verbose_name=_("language"))
+    # may be NULL in the database, so readers must be prepared for None as well as an empty list
+    terms = ArrayField(models.CharField(_("terms"), max_length=1024), null=True, blank=True)
+
+    class Meta:
+        verbose_name = _("accented terms")
+        verbose_name_plural = _("accented terms")
+
+    def __str__(self):
+        return str(self.language)
diff --git a/indigo_api/urls.py b/indigo_api/urls.py
@@ -28,6 +28,7 @@
     path('documents/<int:document_id>/analysis/link-terms', documents.LinkTermsView.as_view(), name='link-terms'),
     path('documents/<int:document_id>/analysis/link-references', documents.LinkReferencesView.as_view(), name='link-references'),
     path('documents/<int:document_id>/analysis/mark-up-italics', documents.MarkUpItalicsTermsView.as_view(), name='mark-up-italics'),
+    path('documents/<int:document_id>/analysis/sentence-case-headings', documents.SentenceCaseHeadingsView.as_view(), name='sentence-case-headings'),
 
     path('', include(router.urls)),
 ]
diff --git a/indigo_api/views/documents.py b/indigo_api/views/documents.py
@@ -432,6 +432,25 @@ def mark_up_italics(self, document):
             italics_terms_finder.mark_up_italics_in_document(document, italics_terms)
 
 
+class SentenceCaseHeadingsView(DocumentResourceView, APIView):
+    """ Sentence cases the headings of the posted document content, applying accents as
+    needed / relevant, and responds with the resulting XML.
+    """
+    def post(self, request, document_id):
+        api_serializer = DocumentAPISerializer(instance=self.document, data=self.request.data)
+        document_field = api_serializer.fields['document']
+        # the content to work on must be supplied with the request
+        document_field.fields['content'].required = True
+        api_serializer.is_valid(raise_exception=True)
+        posted = api_serializer.validated_data['document']
+        document = document_field.update_document(self.document, posted)
+
+        self.sentence_case(document)
+
+        payload = {'document': {'content': document.document_xml}}
+        return Response(payload)
+
+    def sentence_case(self, document):
+        """ Run the 'sentence-caser' plugin for this document, if one is available.
+        """
+        caser = plugins.for_document('sentence-caser', document)
+        if not caser:
+            return
+        caser.sentence_case_headings_in_document(document)
+
+
 class DocumentDiffView(DocumentResourceView, APIView):
     def post(self, request, document_id):
         serializer = DocumentDiffSerializer(instance=self.document, data=self.request.data)

diff --git a/indigo_app/static/javascript/indigo/views/document.js b/indigo_app/static/javascript/indigo/views/document.js
@@ -164,6 +164,7 @@
       this.definedTermsView = new Indigo.DocumentDefinedTermsView({model: this.documentContent});
       this.referencesView = new Indigo.DocumentReferencesView({model: this.documentContent});
       this.italicsView = new Indigo.DocumentItalicsView({model: this.documentContent});
+      this.sentenceCaseView = new Indigo.DocumentSentenceCaseView({model: this.documentContent});
       this.revisionsView = new Indigo.DocumentRevisionsView({document: this.document, documentContent: this.documentContent});
       this.tocView = new Indigo.DocumentTOCView({model: this.documentContent, document: this.document});
 

diff --git a/indigo_app/static/javascript/indigo/views/document_sentences_caser.js b/indigo_app/static/javascript/indigo/views/document_sentences_caser.js
@@ -0,0 +1,33 @@
+(function(exports) {
+  "use strict";
+
+  if (!exports.Indigo) exports.Indigo = {};
+  Indigo = exports.Indigo;
+
+  /**
+   * Handle the sentence caser view.
+   *
+   * Posts the document, with its current XML content, to the sentence-case-headings
+   * analysis endpoint and replaces the model's content with the XML in the response.
+   */
+  Indigo.DocumentSentenceCaseView = Backbone.View.extend({
+    el: '#sentence-caser',
+    events: {
+      'click .sentence-case-headings': 'sentenceCaseHeadings'
+    },
+
+    sentenceCaseHeadings: function(e) {
+      // the toolbar item is an <a href="#">, so stop the click from also navigating to "#"
+      if (e) e.preventDefault();
+
+      let self = this,
+          data = {'document': this.model.document.toJSON()};
+
+      // send the content as it currently stands in the editor
+      data.document.content = this.model.toXml();
+
+      $.ajax({
+        url: this.model.document.url() + '/analysis/sentence-case-headings',
+        type: "POST",
+        data: JSON.stringify(data),
+        contentType: "application/json; charset=utf-8",
+        dataType: "json"})
+          .then(function(response) {
+            self.model.set('content', response.document.content);
+          });
+    },
+  });
+})(window);
diff --git a/indigo_app/templates/indigo_api/document/_toolbar.html b/indigo_app/templates/indigo_api/document/_toolbar.html
@@ -28,6 +28,10 @@
         <a class="dropdown-item" href="#defined-terms-modal" data-bs-toggle="modal">{% trans "Defined terms..." %}</a>
         <a class="dropdown-item" href="#references-modal" data-bs-toggle="modal">{% trans "References..." %}</a>
         <a class="dropdown-item" href="#italics-modal" data-bs-toggle="modal">{% trans "Italicised terms..." %}</a>
+        <div class="dropdown-divider"></div>
+        <a class="dropdown-item" href="#" id="sentence-caser">
+          <span class="sentence-case-headings">{% trans "Make headings Sentence case" %}</span>
+        </a>
       {% endblock %}
     </div>
   </div>