-
-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integra base para segmentadores e segmentador da AMA (#64)
O desenvolvimento deste PR só foi possível graças ao trabalho de @alex-custodio, @emanuelucas04 e @danielfireman no desenvolvimento do [Exoonero](https://exoonero.org/) sob supervisão de @Luisa-Coelho no escopo do programa Querido Diário nas Universidades. Assim como o esforço de @Jefersonalves, @Winzen e @ogecece para integrar o que foi desenvolvido a esta base de código.
- Loading branch information
Showing
18 changed files
with
547 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .factory import get_segmenter | ||
|
||
__all__ = [ | ||
"get_segmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .gazette_segment import GazetteSegment | ||
from .association_segmenter import AssociationSegmenter | ||
|
||
__all__ = [ | ||
"GazetteSegment", | ||
"AssociationSegmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import Any, Dict, Iterable, List, Union | ||
from segmentation.base import GazetteSegment | ||
|
||
|
||
class AssociationSegmenter:
    """
    Base interface for segmenters of association gazettes.

    The constructor stores the territories it receives; every other
    method only raises ``NotImplementedError`` and is meant to be
    overridden by a concrete segmenter.
    """

    def __init__(self, territories: Iterable[Dict[str, Any]]):
        # Stored as received (no copy, no validation).
        self.territories = territories

    def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
        """
        Return the list of segments (``GazetteSegment`` objects or dicts)
        extracted from a gazette.
        """
        raise NotImplementedError()

    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
        """
        Split an association gazette text by territory and return the
        resulting text segments, either as a mapping or as a list.
        """
        raise NotImplementedError()

    def build_segment(self, *args, **kwargs) -> GazetteSegment:
        """
        Build and return a single ``GazetteSegment``.
        """
        raise NotImplementedError()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from datetime import date, datetime | ||
from dataclasses import dataclass | ||
|
||
|
||
@dataclass
class GazetteSegment:
    """
    One segment of an association gazette, related to a single city.

    All fields are required (none has a default). The declaration order
    below is also the positional order of the generated ``__init__``,
    so it must not be rearranged.
    """

    id: str
    territory_name: str
    source_text: str
    date: date
    edition_number: str
    is_extra_edition: bool
    power: str
    file_checksum: str
    scraped_at: datetime
    created_at: datetime
    processed: bool
    file_path: str
    file_url: str
    state_code: str
    territory_id: str
    file_raw_txt: str
    url: str
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from typing import Any, Dict, Iterable | ||
|
||
from segmentation.base import AssociationSegmenter | ||
from segmentation import segmenters | ||
|
||
|
||
# Cache of segmenter instances already built by get_segmenter, keyed by
# territory id.
_segmenter_instances = {}


def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
    """
    Factory method to return the AssociationSegmenter registered for a
    territory.

    Instances are cached per ``territory_id``: the first call for a given
    id builds the segmenter with ``territories``; later calls for the same
    id return that same instance and ignore the ``territories`` argument.

    Parameters
    ----------
    territory_id
        Id of the association territory; must be a key of the registry
        defined in this function.
    territories
        Territory records handed to the segmenter constructor.

    Raises
    ------
    KeyError
        If no segmenter is registered for ``territory_id``.

    Example
    -------
    >>> territory_id = "2700000"
    >>> territories = [
    ...     {
    ...         "id": "9999999",
    ...         "territory_name": "Bairro do Limoeiro",
    ...         "state_code": "ZZ",
    ...         "state": "Limoeirolândia",
    ...     }, {
    ...         "id": "0000000",
    ...         "territory_name": "Castelo Rá-Tim-Bum",
    ...         "state_code": "SP",
    ...         "state": "São Paulo",
    ...     },
    ... ]
    >>> from segmentation import get_segmenter
    >>> segmenter = get_segmenter(territory_id, territories)
    >>> segments = segmenter.get_gazette_segments(gazette)

    Notes
    -----
    This method implements a factory method pattern.
    See: https://github.com/faif/python-patterns/blob/master/patterns/creational/factory.py
    """

    # Registry: territory id -> name of the class exported by the
    # ``segmenters`` package.
    territory_to_segmenter_class = {
        "2700000": "ALAssociacaoMunicipiosSegmenter",
    }

    cached_segmenter = _segmenter_instances.get(territory_id)
    if cached_segmenter is not None:
        return cached_segmenter

    try:
        segmenter_class_name = territory_to_segmenter_class[territory_id]
    except KeyError:
        # Same exception type as before, but with a message that says what
        # is actually wrong instead of just echoing the id.
        raise KeyError(
            f"No segmenter registered for territory_id {territory_id!r}"
        ) from None

    segmenter_class = getattr(segmenters, segmenter_class_name)
    segmenter = segmenter_class(territories)
    _segmenter_instances[territory_id] = segmenter
    return segmenter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .al_associacao_municipios import ALAssociacaoMunicipiosSegmenter | ||
|
||
__all__ = [ | ||
"ALAssociacaoMunicipiosSegmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import re | ||
import logging | ||
|
||
from typing import Any, Dict, List | ||
from segmentation.base import AssociationSegmenter, GazetteSegment | ||
from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug | ||
|
||
|
||
class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
    """
    Segmenter for gazettes whose publications open with an
    "ESTADO DE ALAGOAS / PREFEITURA MUNICIPAL DE <name>" header.

    The gazette text is split at those headers and the publications are
    grouped by territory slug, one text segment per territory.
    """

    # Header that opens each municipality's publication. The pattern has
    # three capture groups, so splitting on it yields four items per
    # publication: the three groups followed by the publication body.
    #   1. start marker of the municipality publication header;
    #   2. municipality name (may span up to two lines), with negative
    #      lookaheads for the notable exceptions "EDUCAÇÃO" and "VAMOS";
    #   3. end marker of the header, with the notable exceptions
    #      "SECRETARIA" / "Secretaria".
    # The inline comments are part of the verbose pattern text and are kept
    # as written.
    RE_NOMES_MUNICIPIOS = re.compile(
        r"""
        (ESTADO\sDE\sALAGOAS(?:|\s)\n{1,2}PREFEITURA\sMUNICIPAL\sDE\s) # Marcador de início do cabeçalho de publicação do município
        ((?!EDUCAÇÃO).*?\n{0,2}(?!VAMOS).*?$) # Nome do município (pode estar presente em até duas linhas). Exceções Notáveis: VAMOS, Poço das Trincheiras, 06/01/2022, ato CCB3A6AB; EDUCAÇÃO, Dois Riachos, 07/12/2023, ato ABCCE576
        (\n\s(?:\s|SECRETARIA|Secretaria)) # Marcador de fim do cabeçalho (pula mais de duas linhas). Exceções Notáveis: SECRETARIA, Coité do Nóia, 02/10/2018, ato 12F7DE15; Secretaria, Qubrângulo, 18/07/2023, atos 27FB2D83 a 1FAF9421
        """,
        re.MULTILINE | re.VERBOSE,
    )

    # Identifier line that closes each act ("Código Identificador: XXXX",
    # tolerating a stray space inside "Identificador").
    RE_ACT_IDENTIFIER = re.compile(r"(Código Ide ?ntificador:\s*\w+)")

    # Trailing text that sometimes gets captured together with the
    # municipality name. Some names carry a "/AL" suffix (example: Viçosa,
    # gazette of 2022-01-17, act 8496EC0A); without this cleanup the slug
    # would come out as something like
    # "vicosa-/al-secretaria-municipal...".
    # Raw string on purpose: "\s" and "\/" are invalid escape sequences in
    # a regular string literal.
    RE_TERRITORY_NAME_NOISE = re.compile(
        r"\s*(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*|EXTRATO.*|SÚMULA.*|RATIFICAÇÃO.*)"
    )

    # Spellings found in the gazette mapped to the spelling used downstream.
    TERRITORY_NAME_FIXES = {
        "MAJOR IZIDORO": "MAJOR ISIDORO",
    }

    def get_gazette_segments(self, gazette: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Return one dict of gazette metadata per territory found in
        ``gazette["source_text"]``.

        Each dict is the attribute dict of the ``GazetteSegment`` built by
        ``build_segment`` for that territory.
        """
        territory_to_text_map = self.split_text_by_territory(gazette["source_text"])
        return [
            vars(self.build_segment(territory_slug, segment_text, gazette))
            for territory_slug, segment_text in territory_to_text_map.items()
        ]

    def split_text_by_territory(self, text: str) -> Dict[str, str]:
        """
        Segment an association text by territory.

        Returns a dict mapping each territory slug to its text segment.
        Every segment starts with the gazette header line, followed by all
        publications of that territory in order of appearance.
        """
        # The first non-blank line is taken as the gazette header.
        ama_header = text.lstrip().split("\n", maxsplit=1)[0].rstrip()
        # Clean headers: every occurrence of the header becomes a newline.
        clean_text = "\n".join(re.split(re.escape(ama_header), text))
        # Clean final lines: drop whatever follows the last act identifier.
        # When no identifier is present the split yields a single item, so
        # nothing is left after the slice.
        clean_text = "\n".join(self.RE_ACT_IDENTIFIER.split(clean_text)[:-1])

        # Discard the text before the first header; what remains comes in
        # groups of four (see RE_NOMES_MUNICIPIOS).
        raw_segments = self.RE_NOMES_MUNICIPIOS.split(clean_text)[1:]

        territory_to_text_map = {}
        for pattern_batch in batched(raw_segments, 4):
            # Index 1 is the municipality-name capture group.
            clean_territory_name = self._normalize_territory_name(pattern_batch[1])
            territory_slug = get_territory_slug(clean_territory_name, "AL")
            # First publication of a territory starts from the header line.
            previous_text_or_header = territory_to_text_map.setdefault(
                territory_slug, f"{ama_header}\n"
            )
            raw_batch_text = "".join(pattern_batch)
            territory_to_text_map[territory_slug] = (
                f"{previous_text_or_header}\n{raw_batch_text}"
            )

        return territory_to_text_map

    def build_segment(
        self, territory_slug: str, segment_text: str, gazette: Dict
    ) -> GazetteSegment:
        """
        Build the ``GazetteSegment`` of one territory.

        The segment copies every key of ``gazette`` and overrides the
        segment-specific ones: ``processed``, ``file_checksum``,
        ``source_text``, ``territory_name`` and ``territory_id``.
        """
        logging.debug(
            'Creating segment for territory "%s" from %s file.',
            territory_slug,
            gazette["file_path"],
        )
        territory_data = get_territory_data(territory_slug, self.territories)

        return GazetteSegment(**{
            **gazette,
            # segment specific values
            "processed": True,
            # The checksum is computed on the unstripped segment text.
            "file_checksum": get_checksum(segment_text),
            "source_text": segment_text.strip(),
            "territory_name": territory_data["territory_name"],
            "territory_id": territory_data["id"],
        })

    def _normalize_territory_name(self, territory_name: str) -> str:
        """
        Clean a municipality name captured from a publication header.

        Strips surrounding whitespace, removes line breaks, cuts known
        trailing noise and applies the known spelling fixes.
        """
        clean_name = territory_name.strip().replace("\n", "")
        clean_name = self.RE_TERRITORY_NAME_NOISE.sub("", clean_name)
        return self.TERRITORY_NAME_FIXES.get(clean_name, clean_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.