Skip to content

Commit 541af0c

Browse files
Merge pull request #2216 from laws-africa/subtype-refs
bring subtypes matcher into the new world
2 parents da03922 + dc5c66c commit 541af0c

File tree

3 files changed

+40
-37
lines changed

3 files changed

+40
-37
lines changed

indigo/analysis/refs/base.py

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,19 @@
44
from django.conf import settings
55

66
from docpipe.citations import ActMatcher
7-
from docpipe.matchers import CitationMatcher
7+
from docpipe.matchers import CitationMatcher, ExtractedMatch
88
from indigo.analysis.markup import TextPatternMarker
99
from indigo.analysis.matchers import DocumentPatternMatcherMixin
1010
from indigo.plugins import LocaleBasedMatcher, plugins
1111
from indigo_api.models import Subtype, Work, Country
1212

1313

1414
def markup_document_refs(document):
15-
# TODO: these are both old and should be retired
15+
# TODO: this is old and should be retired
1616
finder = plugins.for_document('refs', document)
1717
if finder:
1818
finder.find_references_in_document(document)
1919

20-
finder = plugins.for_document('refs-subtypes', document)
21-
if finder:
22-
finder.find_references_in_document(document)
23-
2420
# new mechanism for calling locale-based matchers based on DocumentPatternMatcher
2521
for plugin_type in settings.INDIGO['LINK_REFERENCES_PLUGINS']:
2622
matcher = plugins.for_document(plugin_type, document)
@@ -123,9 +119,9 @@ class ActNumberCitationMatcherAFR(ActNumberCitationMatcher):
123119
xml_candidate_xpath = ".//text()[contains(., 'Wet') and not(ancestor::ns:ref)]"
124120

125121

126-
@plugins.register('refs-subtypes')
127-
class RefsFinderSubtypesENG(BaseRefsFinder):
128-
""" Finds references to works other than Acts in documents, of the form:
122+
@plugins.register('refs-subtype-numbers')
123+
class SubtypeNumberCitationMatcherENG(DocumentPatternMatcherMixin, CitationMatcher):
124+
""" Finds references to works based on subtypes, of the form:
129125
130126
P 52 of 2001
131127
Ordinance no. 52 of 1998
@@ -136,49 +132,54 @@ class RefsFinderSubtypesENG(BaseRefsFinder):
136132
# country, language, locality
137133
locale = (None, 'eng', None)
138134

139-
def setup(self, root):
135+
html_candidate_xpath = ".//text()[(PATTERNS) and not(ancestor::a)]"
136+
xml_candidate_xpath = ".//text()[(PATTERNS) and not(ancestor::ns:ref)]"
137+
138+
def setup(self, *args, **kwargs):
140139
self.setup_subtypes()
141-
self.setup_candidate_xpath()
142-
self.setup_pattern_re()
143-
# If we don't have subtypes, don't let the superclass do setup, because it will fail.
144-
# We're going to opt-out of doing any work anyway.
145-
if self.subtypes:
146-
super().setup(root)
140+
super().setup(*args, **kwargs)
147141

148142
def setup_subtypes(self):
149143
self.subtypes = [s for s in Subtype.objects.all()]
150-
self.subtype_names = [s.name for s in self.subtypes]
151-
self.subtype_abbreviations = [s.abbreviation for s in self.subtypes]
144+
subtype_names = [s.name for s in self.subtypes]
145+
subtype_abbreviations = [s.abbreviation for s in self.subtypes]
152146

153-
self.subtypes_string = '|'.join([re.escape(s) for s in self.subtype_names + self.subtype_abbreviations])
147+
# sort, longest first
148+
subtypes = sorted(subtype_names + subtype_abbreviations, key=len, reverse=True)
149+
self.subtypes_string = '|'.join(re.escape(s) for s in subtypes)
154150

155-
def setup_candidate_xpath(self):
156-
xpath_contains = " or ".join([f"contains(translate(., '{subtype.upper()}', '{subtype.lower()}'), "
157-
f"'{subtype.lower()}')"
158-
for subtype in self.subtype_names + self.subtype_abbreviations])
159-
self.candidate_xpath = f".//text()[({xpath_contains}) and not(ancestor::a:ref)]"
151+
# build the xpath; if there are no subtypes, use "false" to not match anything
152+
xpath_contains = " or ".join([
153+
f"contains(translate(., '{subtype.upper()}', '{subtype.lower()}'), '{subtype.lower()}')"
154+
for subtype in subtypes
155+
]) or "false"
156+
self.candidate_xpath = self.candidate_xpath.replace('PATTERNS', xpath_contains)
160157

161-
def setup_pattern_re(self):
162158
# TODO: disregard e.g. "6 May" in "GN 34 of 6 May 2020", but catch reference
163159
self.pattern_re = re.compile(
164160
fr'''
165161
(?P<ref>
166162
(?P<subtype>{self.subtypes_string})\s*
167163
(No\.?\s*)?
168-
(?P<num>\d+)
164+
(?P<num>[a-z0-9-]+)
169165
(\s+of\s+|/)
170166
(?P<year>\d{{4}})
171167
)
172168
''', re.X | re.I)
173169

174-
def markup_patterns(self, root):
170+
def extract_paged_text_matches(self):
175171
# don't do anything if there are no subtypes
176172
if self.subtypes:
177-
super().markup_patterns(root)
173+
super().extract_paged_text_matches()
178174

179-
def make_href(self, match):
175+
def run_dom_matching(self):
176+
# don't do anything if there are no subtypes
177+
if self.subtypes:
178+
super().run_dom_matching()
179+
180+
def make_href(self, match: ExtractedMatch):
180181
# use correct subtype for FRBR URI
181-
subtype = match.group('subtype')
182+
subtype = match.groups['subtype']
182183
for s in self.subtypes:
183184
if subtype.lower() == s.name.lower() or subtype.lower() == s.abbreviation.lower():
184185
subtype = s.abbreviation
@@ -188,7 +189,7 @@ def make_href(self, match):
188189
if self.frbr_uri.locality:
189190
place = f'{self.frbr_uri.country}-{self.frbr_uri.locality}'
190191

191-
return f'/akn/{place}/act/{subtype}/{match.group("year")}/{match.group("num")}'
192+
return f'/akn/{place}/act/{subtype}/{match.groups["year"]}/{match.groups["num"].lower()}'
192193

193194

194195
@plugins.register('refs-cap')

indigo/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,9 @@
166166
'JS_I18N_NAMESPACES': ['indigo_app'],
167167

168168
# Plugins to use for linking references. The internal-refs must come after work-level plugins.
169-
'LINK_REFERENCES_PLUGINS': ['refs-act-numbers', 'refs-act-names', 'refs-aliases', 'refs-cap', 'internal-refs'],
169+
'LINK_REFERENCES_PLUGINS': [
170+
'refs-act-numbers', 'refs-act-names', 'refs-subtype-numbers', 'refs-aliases', 'refs-cap', 'internal-refs'
171+
],
170172
}
171173

172174
# Database

indigo/tests/test_refs.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from cobalt import FrbrUri
77

8-
from indigo.analysis.refs.base import RefsFinderSubtypesENG, RefsFinderCapENG, ActNumberCitationMatcherFRA, \
8+
from indigo.analysis.refs.base import SubtypeNumberCitationMatcherENG, RefsFinderCapENG, ActNumberCitationMatcherFRA, \
99
ActNumberCitationMatcherAFR
1010

1111
from indigo_api.models import Document, Language, Work, Country, User
@@ -17,7 +17,7 @@ class RefsFinderSubtypesENGTestCase(TestCase):
1717

1818
def setUp(self):
1919
self.work = Work(frbr_uri='/akn/za/act/1991/1')
20-
self.finder = RefsFinderSubtypesENG()
20+
self.finder = SubtypeNumberCitationMatcherENG()
2121
self.eng = Language.for_code('eng')
2222
self.maxDiff = None
2323

@@ -32,7 +32,7 @@ def test_find_simple(self):
3232
<paragraph eId="sec_1.paragraph-0">
3333
<content>
3434
<p>Something to do with GN no 102 of 2012.</p>
35-
<p>And another thing about SI 4 of 1998.</p>
35+
<p>And another thing about SI 4a of 1998.</p>
3636
</content>
3737
</paragraph>
3838
</section>"""
@@ -49,14 +49,14 @@ def test_find_simple(self):
4949
<paragraph eId="sec_1.paragraph-0">
5050
<content>
5151
<p>Something to do with <ref href="/akn/za/act/gn/2012/102">GN no 102 of 2012</ref>.</p>
52-
<p>And another thing about <ref href="/akn/za/act/si/1998/4">SI 4 of 1998</ref>.</p>
52+
<p>And another thing about <ref href="/akn/za/act/si/1998/4a">SI 4a of 1998</ref>.</p>
5353
</content>
5454
</paragraph>
5555
</section>"""
5656
),
5757
language=self.eng)
5858

59-
self.finder.find_references_in_document(document)
59+
self.finder.markup_document_matches(document)
6060
root = etree.fromstring(expected.content.encode('utf-8'))
6161
expected.content = etree.tostring(root, encoding='utf-8').decode('utf-8')
6262
self.assertEqual(expected.content, document.content)

0 commit comments

Comments
 (0)