4
4
from django .conf import settings
5
5
6
6
from docpipe .citations import ActMatcher
7
- from docpipe .matchers import CitationMatcher
7
+ from docpipe .matchers import CitationMatcher , ExtractedMatch
8
8
from indigo .analysis .markup import TextPatternMarker
9
9
from indigo .analysis .matchers import DocumentPatternMatcherMixin
10
10
from indigo .plugins import LocaleBasedMatcher , plugins
11
11
from indigo_api .models import Subtype , Work , Country
12
12
13
13
14
14
def markup_document_refs (document ):
15
- # TODO: these are both old and should be retired
15
+ # TODO: this is old and should be retired
16
16
finder = plugins .for_document ('refs' , document )
17
17
if finder :
18
18
finder .find_references_in_document (document )
19
19
20
- finder = plugins .for_document ('refs-subtypes' , document )
21
- if finder :
22
- finder .find_references_in_document (document )
23
-
24
20
# new mechanism for calling locale-based matchers based on DocumentPatternMatcher
25
21
for plugin_type in settings .INDIGO ['LINK_REFERENCES_PLUGINS' ]:
26
22
matcher = plugins .for_document (plugin_type , document )
@@ -123,9 +119,9 @@ class ActNumberCitationMatcherAFR(ActNumberCitationMatcher):
123
119
xml_candidate_xpath = ".//text()[contains(., 'Wet') and not(ancestor::ns:ref)]"
124
120
125
121
126
- @plugins .register ('refs-subtypes ' )
127
- class RefsFinderSubtypesENG ( BaseRefsFinder ):
128
- """ Finds references to works other than Acts in documents , of the form:
122
+ @plugins .register ('refs-subtype-numbers ' )
123
+ class SubtypeNumberCitationMatcherENG ( DocumentPatternMatcherMixin , CitationMatcher ):
124
+ """ Finds references to works based on subtypes , of the form:
129
125
130
126
P 52 of 2001
131
127
Ordinance no. 52 of 1998
@@ -136,49 +132,54 @@ class RefsFinderSubtypesENG(BaseRefsFinder):
136
132
# country, language, locality
137
133
locale = (None , 'eng' , None )
138
134
139
- def setup (self , root ):
135
+ html_candidate_xpath = ".//text()[(PATTERNS) and not(ancestor::a)]"
136
+ xml_candidate_xpath = ".//text()[(PATTERNS) and not(ancestor::ns:ref)]"
137
+
138
+ def setup (self , * args , ** kwargs ):
140
139
self .setup_subtypes ()
141
- self .setup_candidate_xpath ()
142
- self .setup_pattern_re ()
143
- # If we don't have subtypes, don't let the superclass do setup, because it will fail.
144
- # We're going to opt-out of doing any work anyway.
145
- if self .subtypes :
146
- super ().setup (root )
140
+ super ().setup (* args , ** kwargs )
147
141
148
142
def setup_subtypes (self ):
149
143
self .subtypes = [s for s in Subtype .objects .all ()]
150
- self . subtype_names = [s .name for s in self .subtypes ]
151
- self . subtype_abbreviations = [s .abbreviation for s in self .subtypes ]
144
+ subtype_names = [s .name for s in self .subtypes ]
145
+ subtype_abbreviations = [s .abbreviation for s in self .subtypes ]
152
146
153
- self .subtypes_string = '|' .join ([re .escape (s ) for s in self .subtype_names + self .subtype_abbreviations ])
147
+ # sort, longest first
148
+ subtypes = sorted (subtype_names + subtype_abbreviations , key = len , reverse = True )
149
+ self .subtypes_string = '|' .join (re .escape (s ) for s in subtypes )
154
150
155
- def setup_candidate_xpath (self ):
156
- xpath_contains = " or " .join ([f"contains(translate(., '{ subtype .upper ()} ', '{ subtype .lower ()} '), "
157
- f"'{ subtype .lower ()} ')"
158
- for subtype in self .subtype_names + self .subtype_abbreviations ])
159
- self .candidate_xpath = f".//text()[({ xpath_contains } ) and not(ancestor::a:ref)]"
151
+ # build the xpath; if there are no subtypes, use "false" to not match anything
152
+ xpath_contains = " or " .join ([
153
+ f"contains(translate(., '{ subtype .upper ()} ', '{ subtype .lower ()} '), '{ subtype .lower ()} ')"
154
+ for subtype in subtypes
155
+ ]) or "false"
156
+ self .candidate_xpath = self .candidate_xpath .replace ('PATTERNS' , xpath_contains )
160
157
161
- def setup_pattern_re (self ):
162
158
# TODO: disregard e.g. "6 May" in "GN 34 of 6 May 2020", but catch reference
163
159
self .pattern_re = re .compile (
164
160
fr'''
165
161
(?P<ref>
166
162
(?P<subtype>{ self .subtypes_string } )\s*
167
163
(No\.?\s*)?
168
- (?P<num>\d +)
164
+ (?P<num>[a-z0-9-] +)
169
165
(\s+of\s+|/)
170
166
(?P<year>\d{{4}})
171
167
)
172
168
''' , re .X | re .I )
173
169
174
- def markup_patterns (self , root ):
170
+ def extract_paged_text_matches (self ):
175
171
# don't do anything if there are no subtypes
176
172
if self .subtypes :
177
- super ().markup_patterns ( root )
173
+ super ().extract_paged_text_matches ( )
178
174
179
- def make_href (self , match ):
175
+ def run_dom_matching (self ):
176
+ # don't do anything if there are no subtypes
177
+ if self .subtypes :
178
+ super ().run_dom_matching ()
179
+
180
+ def make_href (self , match : ExtractedMatch ):
180
181
# use correct subtype for FRBR URI
181
- subtype = match .group ( 'subtype' )
182
+ subtype = match .groups [ 'subtype' ]
182
183
for s in self .subtypes :
183
184
if subtype .lower () == s .name .lower () or subtype .lower () == s .abbreviation .lower ():
184
185
subtype = s .abbreviation
@@ -188,7 +189,7 @@ def make_href(self, match):
188
189
if self .frbr_uri .locality :
189
190
place = f'{ self .frbr_uri .country } -{ self .frbr_uri .locality } '
190
191
191
- return f'/akn/{ place } /act/{ subtype } /{ match .group ( "year" ) } /{ match .group ( "num" )} '
192
+ return f'/akn/{ place } /act/{ subtype } /{ match .groups [ "year" ] } /{ match .groups [ "num" ]. lower ( )} '
192
193
193
194
194
195
@plugins .register ('refs-cap' )
0 commit comments