Integrates the base for segmenters and the AMA segmenter (#64)
The development of this PR was only possible thanks to the work of
@alex-custodio, @emanuelucas04, and @danielfireman on
[Exoonero](https://exoonero.org/), under the supervision of @Luisa-Coelho
within the scope of the Querido Diário nas Universidades program, as well
as the effort of @Jefersonalves, @Winzen, and @ogecece to integrate that
work into this codebase.
Giulio Carvalho authored Dec 13, 2023
2 parents 2272001 + 299d323 commit cc54af2
Showing 18 changed files with 547 additions and 182 deletions.
9 changes: 8 additions & 1 deletion main/__main__.py
@@ -6,11 +6,14 @@
from storage import create_storage_interface
from index import create_index_interface
from tasks import (
create_gazettes_index,
create_themed_excerpts_index,
embedding_rerank_excerpts,
extract_text_from_gazettes,
extract_themed_excerpts_from_gazettes,
get_gazettes_to_be_processed,
get_themes,
get_territories,
tag_entities_in_excerpts,
)

@@ -42,11 +45,15 @@ def execute_pipeline():
text_extractor = create_apache_tika_text_extraction()
themes = get_themes()

create_gazettes_index(index)
territories = get_territories(database)
gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
indexed_gazette_ids = extract_text_from_gazettes(
gazettes_to_be_processed, database, storage, index, text_extractor
gazettes_to_be_processed, territories, database, storage, index, text_extractor
)

for theme in themes:
create_themed_excerpts_index(theme, index)
themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
theme, indexed_gazette_ids, index
)
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ requests==2.25.0
scikit-learn==1.0.2
sentence-transformers==2.2.0
huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762
python-slugify[unidecode]==8.0.1
5 changes: 5 additions & 0 deletions segmentation/__init__.py
@@ -0,0 +1,5 @@
from .factory import get_segmenter

__all__ = [
"get_segmenter",
]
7 changes: 7 additions & 0 deletions segmentation/base/__init__.py
@@ -0,0 +1,7 @@
from .gazette_segment import GazetteSegment
from .association_segmenter import AssociationSegmenter

__all__ = [
"GazetteSegment",
"AssociationSegmenter",
]
27 changes: 27 additions & 0 deletions segmentation/base/association_segmenter.py
@@ -0,0 +1,27 @@
from typing import Any, Dict, Iterable, List, Union
from segmentation.base import GazetteSegment


class AssociationSegmenter:
def __init__(self, territories: Iterable[Dict[str, Any]]):
self.territories = territories

def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
"""
Returns a list of GazetteSegment
"""
raise NotImplementedError

def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
"""
Segment an association text by territory
and return a list of text segments
"""
raise NotImplementedError

def build_segment(self, *args, **kwargs) -> GazetteSegment:
"""
Returns a GazetteSegment
"""
raise NotImplementedError
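
For orientation, a minimal concrete subclass could wire these three methods together as sketched below. This is a hypothetical example, not part of this PR: the class name and the single-territory behavior are illustrative, and the real implementation is ALAssociacaoMunicipiosSegmenter shown further down.

from typing import Any, Dict, List

from segmentation.base import AssociationSegmenter, GazetteSegment


class SingleTerritorySegmenter(AssociationSegmenter):
    """Hypothetical segmenter that treats the whole gazette as one segment."""

    def get_gazette_segments(self, gazette: Dict[str, Any]) -> List[Dict[str, Any]]:
        territory_to_text = self.split_text_by_territory(gazette["source_text"])
        return [
            self.build_segment(slug, text, gazette).__dict__
            for slug, text in territory_to_text.items()
        ]

    def split_text_by_territory(self, text: str) -> Dict[str, str]:
        # A real segmenter would split on association-specific markers here.
        return {"example-territory": text}

    def build_segment(self, territory_slug: str, segment_text: str, gazette: Dict) -> GazetteSegment:
        # Assumes the gazette dict carries the same fields as GazetteSegment.
        return GazetteSegment(**{**gazette, "source_text": segment_text})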

27 changes: 27 additions & 0 deletions segmentation/base/gazette_segment.py
@@ -0,0 +1,27 @@
from datetime import date, datetime
from dataclasses import dataclass


@dataclass
class GazetteSegment:
"""
Dataclass to represent a gazette segment of an association
related to a city
"""
id: str
territory_name: str
source_text: str
date: date
edition_number: str
is_extra_edition: bool
power: str
file_checksum: str
scraped_at: datetime
created_at: datetime
processed: bool
file_path: str
file_url: str
state_code: str
territory_id: str
file_raw_txt: str
url: str
49 changes: 49 additions & 0 deletions segmentation/factory.py
@@ -0,0 +1,49 @@
from typing import Any, Dict, Iterable

from segmentation.base import AssociationSegmenter
from segmentation import segmenters


_segmenter_instances = {}


def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
"""
Factory method to return an AssociationSegmenter
Example
-------
>>> territory_id = "9999999"
>>> territories = [
{
"id": "9999999",
"territory_name": "Bairro do Limoeiro",
"state_code": "ZZ",
"state": "Limoeirolândia",
}, {
"id": "0000000",
"territory_name": "Castelo Rá-Tim-Bum",
"state_code": "SP",
"state": "São Paulo",
},
]
>>> from segmentation import get_segmenter
>>> segmenter = get_segmenter(territory_id, territories)
>>> segments = segmenter.get_gazette_segments()
Notes
-----
This method implements a factory method pattern.
See: https://github.com/faif/python-patterns/blob/master/patterns/creational/factory.py
"""

territory_to_segmenter_class = {
"2700000": "ALAssociacaoMunicipiosSegmenter",
}

if territory_id not in _segmenter_instances:
segmenter_class_name = territory_to_segmenter_class[territory_id]
segmenter_class = getattr(segmenters, segmenter_class_name)
_segmenter_instances[territory_id] = segmenter_class(territories)

return _segmenter_instances[territory_id]
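
A brief usage note on the factory above: segmenter instances are memoized in the module-level _segmenter_instances dict, so repeated lookups for the same territory_id reuse one object. A minimal sketch, with an illustrative territory list (only the "2700000" id is taken from the mapping above; the other values are placeholders):

from segmentation import get_segmenter

# Illustrative territory data; in the pipeline this list comes from get_territories(database).
territories = [
    {
        "id": "2700000",
        "territory_name": "Estado de Alagoas",
        "state_code": "AL",
        "state": "Alagoas",
    },
]

segmenter_a = get_segmenter("2700000", territories)
segmenter_b = get_segmenter("2700000", territories)
assert segmenter_a is segmenter_b  # same cached ALAssociacaoMunicipiosSegmenter instance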
5 changes: 5 additions & 0 deletions segmentation/segmenters/__init__.py
@@ -0,0 +1,5 @@
from .al_associacao_municipios import ALAssociacaoMunicipiosSegmenter

__all__ = [
"ALAssociacaoMunicipiosSegmenter",
]
88 changes: 88 additions & 0 deletions segmentation/segmenters/al_associacao_municipios.py
@@ -0,0 +1,88 @@
import re
import logging

from typing import Any, Dict, List
from segmentation.base import AssociationSegmenter, GazetteSegment
from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug


class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
RE_NOMES_MUNICIPIOS = re.compile(
r"""
(ESTADO\sDE\sALAGOAS(?:|\s)\n{1,2}PREFEITURA\sMUNICIPAL\sDE\s) # Start marker of the municipality's publication header
((?!EDUCAÇÃO).*?\n{0,2}(?!VAMOS).*?$) # Municipality name (may span up to two lines). Notable exceptions: VAMOS, Poço das Trincheiras, 06/01/2022, act CCB3A6AB; EDUCAÇÃO, Dois Riachos, 07/12/2023, act ABCCE576
(\n\s(?:\s|SECRETARIA|Secretaria)) # End-of-header marker (skips more than two lines). Notable exceptions: SECRETARIA, Coité do Nóia, 02/10/2018, act 12F7DE15; Secretaria, Quebrângulo, 18/07/2023, acts 27FB2D83 to 1FAF9421
""",
re.MULTILINE | re.VERBOSE,
)

def get_gazette_segments(self, gazette: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Returns a list of dicts with the metadata of each gazette segment
"""
territory_to_text_map = self.split_text_by_territory(gazette["source_text"])
gazette_segments = [
self.build_segment(territory_slug, segment_text, gazette).__dict__
for territory_slug, segment_text in territory_to_text_map.items()
]
return gazette_segments

def split_text_by_territory(self, text: str) -> Dict[str, str]:
"""
Segment an association text by territory
and return a dict mapping each territory slug to its text segment
"""
ama_header = text.lstrip().split("\n", maxsplit=1)[0].rstrip()
# clean headers
clean_text = "\n".join(re.split(re.escape(ama_header), text))
# clean final lines
clean_text = "\n".join(
re.split(r"(Código Ide ?ntificador:\s*\w+)", clean_text)[:-1]
)

raw_segments = re.split(self.RE_NOMES_MUNICIPIOS, clean_text)[1:]

territory_to_text_map = {}
for pattern_batch in batched(raw_segments, 4):
territory_name = pattern_batch[1]
clean_territory_name = self._normalize_territory_name(territory_name)
territory_slug = get_territory_slug(clean_territory_name, "AL")
previous_text_or_header = territory_to_text_map.setdefault(
territory_slug, f"{ama_header}\n"
)
raw_batch_text = "".join(pattern_batch)
new_territory_text = f"{previous_text_or_header}\n{raw_batch_text}"
territory_to_text_map[territory_slug] = new_territory_text

return territory_to_text_map

def build_segment(
self, territory_slug: str, segment_text: str, gazette: Dict
) -> GazetteSegment:
logging.debug(
f"Creating segment for territory \"{territory_slug}\" from {gazette['file_path']} file."
)
territory_data = get_territory_data(territory_slug, self.territories)

return GazetteSegment(**{
**gazette,
# segment specific values
"processed": True,
"file_checksum": get_checksum(segment_text),
"source_text": segment_text.strip(),
"territory_name": territory_data["territory_name"],
"territory_id": territory_data["id"],
})

def _normalize_territory_name(self, territory_name: str) -> str:
clean_name = territory_name.strip().replace("\n", "")
# Some municipality names have a trailing /AL, e.g. Viçosa in the 2022-01-17 gazette, act 8496EC0A. To avoid errors such as "vicosa-/al-secretaria-municipal...", the next line removes it.
clean_name = re.sub(
"\s*(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*|EXTRATO.*|SÚMULA.*|RATIFICAÇÃO.*)",
"",
clean_name,
)
name_to_fixed = {
"MAJOR IZIDORO": "MAJOR ISIDORO",
}
return name_to_fixed.get(clean_name, clean_name)
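
To situate this segmenter in the pipeline, the sketch below shows the expected call flow. It is a hedged outline, not an excerpt from this PR: database and gazette stand in for objects produced elsewhere (see main/__main__.py), and only the "2700000" mapping comes from the factory.

from segmentation import get_segmenter
from tasks import get_territories

territories = get_territories(database)  # `database`: the database interface built in main/__main__.py
segmenter = get_segmenter("2700000", territories)  # id mapped to ALAssociacaoMunicipiosSegmenter

# `gazette` is a dict whose keys mirror the GazetteSegment fields
# (source_text, date, edition_number, file_path, ...).
segments = segmenter.get_gazette_segments(gazette)
for segment in segments:
    print(segment["territory_id"], segment["territory_name"])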
2 changes: 2 additions & 0 deletions tasks/__init__.py
@@ -1,3 +1,4 @@
from .create_index import create_gazettes_index, create_themed_excerpts_index
from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts
from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts
from .gazette_text_extraction import extract_text_from_gazettes
@@ -10,3 +11,4 @@
TextExtractorInterface,
)
from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
from .list_territories import get_territories