diff --git a/serenata_toolbox/chamber_of_deputies/__init__.py b/serenata_toolbox/chamber_of_deputies/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/serenata_toolbox/federal_senate/__init__.py b/serenata_toolbox/federal_senate/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/serenata_toolbox/federal_senate/federal_senate_dataset.py b/serenata_toolbox/federal_senate/federal_senate_dataset.py
new file mode 100644
index 0000000..1b1932f
--- /dev/null
+++ b/serenata_toolbox/federal_senate/federal_senate_dataset.py
@@ -0,0 +1,142 @@
+import os.path
+from datetime import date
+from urllib.request import urlretrieve
+
+import numpy as np
+import pandas as pd
+
+
+class FederalSenateDataset:
+    """Fetch, translate and consolidate Federal Senate reimbursement data.
+
+    Yearly CSV files are downloaded from ``URL`` into ``path``; each one is
+    translated to English and saved xz-compressed, and all years are then
+    merged into ``federal-senate-reimbursements.xz``.
+    """
+
+    URL = 'http://www.senado.gov.br/transparencia/LAI/verba/{}.csv'
+    FIRST_YEAR = 2008
+    # Exclusive upper bound of YEAR_RANGE; computed once, when the class is defined.
+    NEXT_YEAR = date.today().year + 1
+
+    YEAR_RANGE = range(FIRST_YEAR, NEXT_YEAR)
+
+    def __init__(self, path):
+        """Remember ``path``, the directory all files are read from and written to."""
+        self.path = path
+
+    def fetch(self):
+        """Download the raw CSV of every year in ``YEAR_RANGE`` into ``path``."""
+        for year in self.YEAR_RANGE:
+            url = self.URL.format(year)
+            file_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
+            urlretrieve(url, file_path)
+
+    def translate(self):
+        """Translate every yearly CSV found in ``path`` into an English ``.xz`` file."""
+        for year in self.YEAR_RANGE:
+            csv_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
+            self.__translate_file(csv_path)
+
+    def clean(self):
+        """Merge the translated yearly files into one normalised dataset.
+
+        Unparseable dates become ``NaT`` and every non-digit character is
+        removed from ``cnpj_cpf``.
+
+        Returns:
+            Path of the written ``federal-senate-reimbursements.xz`` file.
+        """
+        reimbursement_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')
+
+        # ``cnpj_cpf`` is read as text so that digit-only ids keep their leading
+        # zeros and the ``.str`` accessor below always applies.
+        yearly_data = [
+            pd.read_csv(os.path.join(self.path, 'federal-senate-{}.xz'.format(year)),
+                        encoding='utf-8',
+                        dtype={'cnpj_cpf': str})
+            for year in self.YEAR_RANGE
+        ]
+        # One concat at the end avoids re-copying the accumulated frame per year.
+        dataset = pd.concat(yearly_data)
+
+        # NOTE(review): if the raw files use day-first dates, ``dayfirst=True`` is
+        # needed here - confirm against the downloaded data.
+        dataset['date'] = pd.to_datetime(dataset['date'], errors='coerce')
+        # ``regex=True`` is required: pandas >= 2.0 treats the pattern as a literal
+        # string by default, which would leave the punctuation untouched.
+        dataset['cnpj_cpf'] = dataset['cnpj_cpf'].str.replace(r'\D', '', regex=True)
+
+        dataset.to_csv(reimbursement_path, compression='xz', index=False, encoding='utf-8')
+
+        return reimbursement_path
+
+    def __translate_file(self, csv_path):
+        """Translate one raw yearly CSV and save it beside it as ``.xz``.
+
+        Column names and expense types are translated to English.
+
+        Args:
+            csv_path: Path of a raw ``federal-senate-<year>.csv`` file.
+
+        Returns:
+            Path of the written xz-compressed CSV.
+
+        Raises:
+            KeyError: If an expense type in the file has no known translation.
+        """
+        # Swap only the extension; ``str.replace`` would also rewrite a '.csv'
+        # occurring in a directory name.
+        output_file_path = os.path.splitext(csv_path)[0] + '.xz'
+
+        # skiprows=1: the first line of the raw file is dropped before the header is read.
+        data = pd.read_csv(csv_path,
+                           sep=';',
+                           encoding='ISO-8859-1',
+                           skiprows=1)
+
+        data.columns = [column.lower() for column in data.columns]
+
+        data = data.rename(columns={
+            'ano': 'year',
+            'mes': 'month',
+            'senador': 'congressperson_name',
+            'tipo_despesa': 'expense_type',
+            'cnpj_cpf': 'cnpj_cpf',
+            'fornecedor': 'supplier',
+            'documento': 'document_id',
+            'data': 'date',
+            'detalhamento': 'expense_details',
+            'valor_reembolsado': 'reimbursement_value',
+        })
+
+        data['expense_type'] = data['expense_type'].astype('category')
+
+        translations = {
+            'Aluguel de imóveis para escritório político, compreendendo despesas concernentes a eles.':
+                'Rent of real estate for political office, comprising expenses concerning them',
+            'Aquisição de material de consumo para uso no escritório político, inclusive aquisição ou locação de software, despesas postais, aquisição de publicações, locação de móveis e de equipamentos. ':
+                'Acquisition of consumables for use in the political office, including acquisition or leasing of software, postal expenses, acquisition of publications, rental of furniture and equipment',
+            'Contratação de consultorias, assessorias, pesquisas, trabalhos técnicos e outros serviços de apoio ao exercício do mandato parlamentar':
+                'Recruitment of consultancies, advisory services, research, technical work and other services in support of the exercise of the parliamentary mandate',
+            'Divulgação da atividade parlamentar':
+                'Publicity of parliamentary activity',
+            'Locomoção, hospedagem, alimentação, combustíveis e lubrificantes':
+                'Locomotion, lodging, food, fuels and lubricants',
+            'Passagens aéreas, aquáticas e terrestres nacionais':
+                'National air, water and land transport',
+            'Serviços de Segurança Privada':
+                'Private Security Services',
+        }
+
+        # Strict lookup: an unknown expense type fails loudly instead of being
+        # written out untranslated.
+        translated = [translations[cat] for cat in data['expense_type'].cat.categories]
+
+        # ``rename_categories`` no longer accepts ``inplace`` (removed in pandas 2.0),
+        # so the renamed column is assigned back.
+        data['expense_type'] = data['expense_type'].cat.rename_categories(translated)
+
+        data.to_csv(output_file_path, compression='xz', index=False, encoding='utf-8')
+
+        return output_file_path
diff --git a/tests/test_federal_senate_dataset.py b/tests/test_federal_senate_dataset.py
new file mode 100644
index 0000000..b3ec83c
--- /dev/null
+++ b/tests/test_federal_senate_dataset.py
@@ -0,0 +1,45 @@
+import os
+from tempfile import gettempdir
+from unittest import main, skipIf, TestCase, TestLoader
+
+from serenata_toolbox.federal_senate.federal_senate_dataset import FederalSenateDataset
+
+
+class TestFederalSenateDataset(TestCase):
+    """Integration tests; they hit the network and only run when RUN_INTEGRATION_TESTS=1."""
+
+    def setUp(self):
+        self.path = gettempdir()
+        self.subject = FederalSenateDataset(self.path)
+
+    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
+            'Skipping integration test')
+    def test_fetch_saves_raw_files(self):
+        self.subject.fetch()
+        names = ['federal-senate-{}.csv'.format(year) for year in range(self.subject.FIRST_YEAR, self.subject.NEXT_YEAR)]
+        for name in names:
+            file_path = os.path.join(self.path, name)
+            self.assertTrue(os.path.exists(file_path))
+
+    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
+            'Skipping integration test')
+    def test_translate_creates_english_versions_for_every_csv(self):
+        self.subject.fetch()
+        self.subject.translate()
+        names = ['federal-senate-{}.xz'.format(year) for year in range(self.subject.FIRST_YEAR, self.subject.NEXT_YEAR)]
+        for name in names:
+            file_path = os.path.join(self.path, name)
+            self.assertTrue(os.path.exists(file_path))
+
+    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
+            'Skipping integration test')
+    def test_clean_creates_a_reimbursements_file(self):
+        self.subject.fetch()
+        self.subject.translate()
+        self.subject.clean()
+        file_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')
+        self.assertTrue(os.path.exists(file_path))
+
+
+if __name__ == '__main__':
+    main()