Skip to content

Commit

Permalink
Merge pull request #2291 from laws-africa/accented-terms
Browse files Browse the repository at this point in the history
Sentence caser / accented terms
  • Loading branch information
goose-life authored Nov 20, 2024
2 parents d88494d + 3fb45b8 commit f407cd2
Show file tree
Hide file tree
Showing 12 changed files with 335 additions and 1 deletion.
1 change: 1 addition & 0 deletions indigo/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
import indigo.analysis.refs # noqa
import indigo.analysis.work_detail # noqa
import indigo.analysis.italics_terms # noqa
import indigo.analysis.sentence_caser # noqa
53 changes: 53 additions & 0 deletions indigo/analysis/sentence_caser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re

import unicodedata
from lxml import etree

from indigo.plugins import LocaleBasedMatcher, plugins


@plugins.register('sentence-caser')
class BaseSentenceCaser(LocaleBasedMatcher):
    """ Sentence cases headings in a document.

    Every heading's text is lowercased, known accented terms are restored to
    their stored spelling (accents and capitals included), and the first
    piece of non-blank text in the heading gets its first character
    uppercased. Anything nested inside an authorialNote is left untouched.
    """
    # Stored spellings of the terms to restore, longest first once prepared.
    # Tests may pre-populate this to bypass the database lookup.
    terms = None
    # Accent-stripped, lowercased versions of `terms`, index-aligned with it.
    normalized_terms = None
    # Whether the heading currently being processed already has its capital.
    capitalized = False

    def sentence_case_headings_in_document(self, document):
        """ Sentence case all headings in `document`, updating `document.content` in place.

        Terms are taken from `self.terms` when already set, otherwise from the
        first accented-terms record linked to the document's language.
        """
        # allow tests to specify self.terms
        if not self.terms:
            accented_terms = document.language.accented_terms.first()
            # the stored array is nullable, so guard against None as well as a missing record
            self.terms = (accented_terms.terms if accented_terms else None) or []

        # Build a fresh list rather than sorting in place, so the list owned by
        # the model instance is never mutated. Blank entries are dropped because
        # they would produce an empty pattern. Longest terms go first so that
        # multi-word terms win over the shorter terms they contain.
        self.terms = sorted((t for t in self.terms if t and t.strip()), key=len, reverse=True)
        self.normalized_terms = [self._normalize(t) for t in self.terms]

        root = etree.fromstring(document.content.encode('utf-8'))
        nsmap = {'a': root.nsmap[None]}
        for heading in root.xpath('//a:heading', namespaces=nsmap):
            # each heading gets exactly one capital letter, on its first real text
            self.capitalized = False
            skip_elements = heading.xpath(".//a:*[ancestor::a:authorialNote]", namespaces=nsmap)
            for elem in heading.iter():
                if elem in skip_elements:
                    continue
                elem.text = self.adjust_heading_text(elem.text)
                elem.tail = self.adjust_heading_text(elem.tail)

        document.content = etree.tostring(root, encoding='unicode')

    @staticmethod
    def _normalize(term):
        """ Return `term` lowercased with combining accent marks removed. """
        decomposed = unicodedata.normalize('NFD', term)
        return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn').lower()

    def adjust_heading_text(self, text):
        """ Return the sentence-cased version of one text/tail fragment of a heading.

        None and whitespace-only fragments are returned unchanged and do not
        use up the heading's single capital letter.
        """
        # text may be None or ' ', for example -- ignore in those cases
        if text and text.strip():
            text = self.apply_terms(text.lower())
            if not self.capitalized:
                # don't use capitalize() on the whole of `text`, as this interferes with capitalised terms
                # either way, lstrip if capitalizing here since the first letter would be missed otherwise
                stripped = text.lstrip()
                text = stripped[0].upper() + stripped[1:]
                self.capitalized = True
        return text

    def apply_terms(self, text):
        """ Replace whole-word occurrences of each normalized term in `text` with its stored spelling. """
        if not self.normalized_terms:
            return text
        # save a tiny bit of time by checking for any matches first
        if any(t in text for t in self.normalized_terms):
            for normalized, term in zip(self.normalized_terms, self.terms):
                # Escape the term so regex metacharacters in it are matched
                # literally, and use a callable replacement so backslashes in
                # the stored spelling aren't treated as group references.
                text = re.sub(rf'\b{re.escape(normalized)}\b',
                              lambda _match, replacement=term: replacement,
                              text)
        return text
165 changes: 165 additions & 0 deletions indigo/tests/test_sentence_caser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from django.test import TestCase
from lxml import etree

from indigo.analysis.sentence_caser import BaseSentenceCaser
from indigo_api.models import Document, Work
from indigo_api.tests.fixtures import document_fixture


class SentenceCaserTestCase(TestCase):
    """ Tests for BaseSentenceCaser using a fixed list of accented terms. """
    fixtures = ['languages_data', 'countries']

    def setUp(self):
        self.work = Work(frbr_uri='/za/act/1991/1')
        self.sentence_caser = BaseSentenceCaser()
        # set the terms directly so the caser doesn't use terms stored for the language
        self.sentence_caser.terms = ['Tâx', 'táxation', 'in the Höme']
        # show full diffs when the XML comparison fails
        self.maxDiff = None

    def test_sentence_case_headings_in_document(self):
        """ Headings are sentence-cased, accented terms applied, and footnotes left alone. """
        # Cases covered by the input headings:
        # - sec_0: a multi-word term interrupted by a footnote is not applied; footnote text is untouched
        # - sec_1: the capital lands inside the leading inline element
        # - sec_1A: a whitespace-only leading inline element does not use up the capital
        # - sec_2 / sec_2A: terms match whole words only ('taxonomies' is not affected by 'Tâx')
        # - sec_3 / sec_4: the multi-word term is applied, keeping its own capital letter
        # - sec_5 / sec_6: a term inside a trailing inline element is still applied
        document = Document(
            work=self.work,
            language_id=1,
            document_xml=document_fixture(
                xml="""
<section eId="sec_0">
<num>0</num>
<heading><i> TAXATION </i>IN<sup><authorialNote marker="1" placement="bottom" eId="sec_0__authorialNote_1"><p eId="sec_0__authorialNote_1__p_1">OR ON</p></authorialNote></sup> THE HOME<sup><authorialNote marker="1A" placement="bottom" eId="sec_0__authorialNote_2"><p eId="sec_0__authorialNote_2__p_1">FN</p></authorialNote></sup></heading>
<content>
<p eId="sec_0__p_1">Text</p>
</content>
</section>
<section eId="sec_1">
<num>1</num>
<heading><i>HELLO</i> NO TAXATION <i>WITHOUT REPRESENTATION</i> <b>(OTHER STUFF)</b></heading>
<content>
<p eId="sec_1__p_1">Text</p>
</content>
</section>
<section eId="sec_1A">
<num>1A</num>
<heading><i> </i>HELLO NO TAXATION <i>WITHOUT REPRESENTATION</i> <b>(OTHER STUFF)</b></heading>
<content>
<p eId="sec_1A__p_1">Text</p>
</content>
</section>
<section eId="sec_2">
<num>2</num>
<heading>TAXATION AND TAX, AND TAXONOMIES TOO</heading>
<content>
<p eId="sec_2__p_1">Text</p>
</content>
</section>
<section eId="sec_2A">
<num>2A</num>
<heading>TAXONOMIES ALL ALONE</heading>
<content>
<p eId="sec_2A__p_1">Text</p>
</content>
</section>
<section eId="sec_3">
<num>3</num>
<heading>TAXATION <i>IN THE HOME</i></heading>
<content>
<p eId="sec_3__p_1">Text</p>
</content>
</section>
<section eId="sec_4">
<num>4</num>
<heading><i> TAXATION </i>IN THE HOME<sup><authorialNote marker="1" placement="bottom" eId="sec_4__authorialNote_1"><p eId="sec_4__authorialNote_1__p_1">FN</p></authorialNote></sup></heading>
<content>
<p eId="sec_4__p_1">Text</p>
</content>
</section>
<section eId="sec_5">
<num>5</num>
<heading>DOUBLE <b>TAXATION</b></heading>
<content>
<p eId="sec_5__p_1">Text</p>
</content>
</section>
<section eId="sec_6">
<num>6</num>
<heading>SINGLE <i>TAXATION</i></heading>
<content>
<p eId="sec_6__p_1">Text</p>
</content>
</section>
"""
            )
        )

        # the same sections as they should look after sentence casing
        expected = Document(
            work=self.work,
            document_xml=document_fixture(
                xml="""
<section eId="sec_0">
<num>0</num>
<heading><i>Táxation </i>in<sup><authorialNote marker="1" placement="bottom" eId="sec_0__authorialNote_1"><p eId="sec_0__authorialNote_1__p_1">OR ON</p></authorialNote></sup> the home<sup><authorialNote marker="1A" placement="bottom" eId="sec_0__authorialNote_2"><p eId="sec_0__authorialNote_2__p_1">FN</p></authorialNote></sup></heading>
<content>
<p eId="sec_0__p_1">Text</p>
</content>
</section>
<section eId="sec_1">
<num>1</num>
<heading><i>Hello</i> no táxation <i>without representation</i> <b>(other stuff)</b></heading>
<content>
<p eId="sec_1__p_1">Text</p>
</content>
</section>
<section eId="sec_1A">
<num>1A</num>
<heading><i> </i>Hello no táxation <i>without representation</i> <b>(other stuff)</b></heading>
<content>
<p eId="sec_1A__p_1">Text</p>
</content>
</section>
<section eId="sec_2">
<num>2</num>
<heading>Táxation and Tâx, and taxonomies too</heading>
<content>
<p eId="sec_2__p_1">Text</p>
</content>
</section>
<section eId="sec_2A">
<num>2A</num>
<heading>Taxonomies all alone</heading>
<content>
<p eId="sec_2A__p_1">Text</p>
</content>
</section>
<section eId="sec_3">
<num>3</num>
<heading>Táxation <i>in the Höme</i></heading>
<content>
<p eId="sec_3__p_1">Text</p>
</content>
</section>
<section eId="sec_4">
<num>4</num>
<heading><i>Táxation </i>in the Höme<sup><authorialNote marker="1" placement="bottom" eId="sec_4__authorialNote_1"><p eId="sec_4__authorialNote_1__p_1">FN</p></authorialNote></sup></heading>
<content>
<p eId="sec_4__p_1">Text</p>
</content>
</section>
<section eId="sec_5">
<num>5</num>
<heading>Double <b>táxation</b></heading>
<content>
<p eId="sec_5__p_1">Text</p>
</content>
</section>
<section eId="sec_6">
<num>6</num>
<heading>Single <i>táxation</i></heading>
<content>
<p eId="sec_6__p_1">Text</p>
</content>
</section>
"""
            )
        )

        self.sentence_caser.sentence_case_headings_in_document(document)
        # round-trip the expected XML through lxml so both sides are serialised the same way
        root = etree.fromstring(expected.content.encode('utf-8'))
        expected.content = etree.tostring(root, encoding='utf-8').decode('utf-8')
        self.assertEqual(expected.content, document.content)
16 changes: 15 additions & 1 deletion indigo_api/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from django import forms
from django.contrib import admin
from django.contrib.postgres.forms import SimpleArrayField
from django.shortcuts import reverse
from django.utils.timezone import now
from django.utils.translation import gettext_lazy as _
Expand All @@ -11,7 +12,7 @@
from treebeard.forms import MoveNodeForm, movenodeform_factory
from background_task.admin import TaskAdmin

from .models import Document, Subtype, Colophon, Work, TaskLabel, TaxonomyTopic, CitationAlias, SavedSearch
from .models import Document, Subtype, Colophon, Work, TaskLabel, TaxonomyTopic, CitationAlias, SavedSearch, AccentedTerms


admin.site.register(Subtype)
Expand Down Expand Up @@ -97,6 +98,19 @@ class CitationAliasAdmin(admin.ModelAdmin):
list_filter = ('place',)


class AccentedTermsForm(forms.ModelForm):
    """ Admin form for AccentedTerms that edits the terms array in a textarea, one term per line. """
    terms = SimpleArrayField(
        forms.CharField(max_length=1024, required=False),
        delimiter='\n',
        required=False,
        widget=forms.Textarea,
    )

    class Meta:
        model = AccentedTerms
        fields = "__all__"


@admin.register(AccentedTerms)
class AccentedTermsAdmin(admin.ModelAdmin):
    # Use the custom form so that terms are edited in a textarea, split on newlines.
    form = AccentedTermsForm


def run_now(modeladmin, request, queryset):
queryset.update(run_at=now())
messages.success(request, _("Updated run time to now for selected tasks."))
Expand Down
27 changes: 27 additions & 0 deletions indigo_api/migrations/0049_accentedterms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Generated by Django 4.2.15 on 2024-11-18 13:09

import django.contrib.postgres.fields
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Creates the AccentedTerms model: an array of terms linked to a language.

    dependencies = [
        ('indigo_api', '0048_amendment_verb'),
    ]

    operations = [
        migrations.CreateModel(
            name='AccentedTerms',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # the terms array itself may be NULL
                ('terms', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=1024, verbose_name='terms'), blank=True, null=True, size=None)),
                # unique=True on the foreign key: at most one AccentedTerms row per language
                ('language', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='accented_terms', to='indigo_api.language', unique=True, verbose_name='language')),
            ],
            options={
                'verbose_name': 'accented terms',
                'verbose_name_plural': 'accented terms',
            },
        ),
    ]
15 changes: 15 additions & 0 deletions indigo_api/models/places.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,3 +262,18 @@ def work_properties(self):
props['cap'] = "Chapter (Cap.)"

return props


class AccentedTerms(models.Model):
    """ The list of accented terms for one language.

    Each language has at most one record (the foreign key is unique); the
    record is reached from a language through the `accented_terms` related name.
    """
    # NOTE(review): a unique ForeignKey is what Django's fields.W342 check suggests
    # replacing with a OneToOneField, but that would change the reverse accessor
    # from a manager to a single object -- confirm callers before changing it.
    language = models.ForeignKey(
        Language,
        verbose_name=_("language"),
        related_name='accented_terms',
        on_delete=models.CASCADE,
        unique=True,
        null=False,
        blank=False,
    )
    # the array as a whole is optional and may be NULL
    terms = ArrayField(
        models.CharField(_("terms"), max_length=1024),
        blank=True,
        null=True,
    )

    class Meta:
        verbose_name = _("accented terms")
        verbose_name_plural = _("accented terms")

    def __str__(self):
        # shown as the name of its language
        language_label = str(self.language)
        return language_label
1 change: 1 addition & 0 deletions indigo_api/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
path('documents/<int:document_id>/analysis/link-terms', documents.LinkTermsView.as_view(), name='link-terms'),
path('documents/<int:document_id>/analysis/link-references', documents.LinkReferencesView.as_view(), name='link-references'),
path('documents/<int:document_id>/analysis/mark-up-italics', documents.MarkUpItalicsTermsView.as_view(), name='mark-up-italics'),
path('documents/<int:document_id>/analysis/sentence-case-headings', documents.SentenceCaseHeadingsView.as_view(), name='sentence-case-headings'),

path('', include(router.urls)),
]
19 changes: 19 additions & 0 deletions indigo_api/views/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,25 @@ def mark_up_italics(self, document):
italics_terms_finder.mark_up_italics_in_document(document, italics_terms)


class SentenceCaseHeadingsView(DocumentResourceView, APIView):
    """ Sentence case headings. Also apply accents as needed / relevant.

    Accepts a POSTed document (content is required), runs the sentence caser
    over it and responds with the resulting document XML.
    """
    def post(self, request, document_id):
        api_serializer = DocumentAPISerializer(instance=self.document, data=self.request.data)
        document_field = api_serializer.fields['document']
        # the caller must send the content that is to be sentence-cased
        document_field.fields['content'].required = True
        api_serializer.is_valid(raise_exception=True)

        updated = document_field.update_document(self.document, api_serializer.validated_data['document'])
        self.sentence_case(updated)

        payload = {'document': {'content': updated.document_xml}}
        return Response(payload)

    def sentence_case(self, document):
        """ Run the sentence-caser plugin for this document, if one is available. """
        caser = plugins.for_document('sentence-caser', document)
        if not caser:
            return
        caser.sentence_case_headings_in_document(document)


class DocumentDiffView(DocumentResourceView, APIView):
def post(self, request, document_id):
serializer = DocumentDiffSerializer(instance=self.document, data=self.request.data)
Expand Down
1 change: 1 addition & 0 deletions indigo_app/static/javascript/indigo/views/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@
this.definedTermsView = new Indigo.DocumentDefinedTermsView({model: this.documentContent});
this.referencesView = new Indigo.DocumentReferencesView({model: this.documentContent});
this.italicsView = new Indigo.DocumentItalicsView({model: this.documentContent});
this.sentenceCaseView = new Indigo.DocumentSentenceCaseView({model: this.documentContent});
this.revisionsView = new Indigo.DocumentRevisionsView({document: this.document, documentContent: this.documentContent});
this.tocView = new Indigo.DocumentTOCView({model: this.documentContent, document: this.document});

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
(function(exports) {
  "use strict";

  if (!exports.Indigo) exports.Indigo = {};
  Indigo = exports.Indigo;

  /**
   * Handle the sentence caser view.
   *
   * Bound to the #sentence-caser toolbar item. Clicking it posts the current
   * document content to the sentence-case-headings analysis endpoint and
   * replaces the model's content with the response.
   */
  Indigo.DocumentSentenceCaseView = Backbone.View.extend({
    el: '#sentence-caser',
    events: {
      'click .sentence-case-headings': 'sentenceCaseHeadings'
    },

    /**
     * Send the document to the server for sentence casing and apply the result.
     *
     * @param {Event} [e] the click event, if triggered from the UI
     */
    sentenceCaseHeadings: function(e) {
      // the trigger sits inside an <a href="#">, so don't let the browser follow the link
      if (e) e.preventDefault();

      let self = this,
          data = {'document': this.model.document.toJSON()};

      // send the editor's current XML, not the last saved content
      data.document.content = this.model.toXml();

      $.ajax({
        url: this.model.document.url() + '/analysis/sentence-case-headings',
        type: "POST",
        data: JSON.stringify(data),
        contentType: "application/json; charset=utf-8",
        dataType: "json"})
        .then(function(response) {
          self.model.set('content', response.document.content);
        });
    },
  });
})(window);
4 changes: 4 additions & 0 deletions indigo_app/templates/indigo_api/document/_toolbar.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
<a class="dropdown-item" href="#defined-terms-modal" data-bs-toggle="modal">{% trans "Defined terms..." %}</a>
<a class="dropdown-item" href="#references-modal" data-bs-toggle="modal">{% trans "References..." %}</a>
<a class="dropdown-item" href="#italics-modal" data-bs-toggle="modal">{% trans "Italicised terms..." %}</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="#" id="sentence-caser">
<span class="sentence-case-headings">{% trans "Make headings Sentence case" %}</span>
</a>
{% endblock %}
</div>
</div>
Expand Down
Loading

0 comments on commit f407cd2

Please sign in to comment.