-
Notifications
You must be signed in to change notification settings - Fork 69
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #53 from datasciencebr/anaschwendler-introduce-federal-senate-script: Introducing federal senate script
- Loading branch information
Showing
4 changed files
with
141 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
100 changes: 100 additions & 0 deletions
100
serenata_toolbox/federal_senate/federal_senate_dataset.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import os.path | ||
from urllib.request import urlretrieve | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from datetime import date | ||
|
||
class FederalSenateDataset:
    """Download, translate and consolidate the Federal Senate yearly CSV files.

    The workflow is three steps that all operate on files inside ``path``:

    1. ``fetch()`` downloads one ``federal-senate-<year>.csv`` per year.
    2. ``translate()`` rewrites each CSV as ``federal-senate-<year>.xz`` with
       English column names and English expense types.
    3. ``clean()`` merges the yearly ``.xz`` files into a single
       ``federal-senate-reimbursements.xz``.
    """

    URL = 'http://www.senado.gov.br/transparencia/LAI/verba/{}.csv'
    FIRST_YEAR = 2008
    # Evaluated once, when the class body runs (i.e. at import time).
    NEXT_YEAR = date.today().year + 1

    YEAR_RANGE = range(FIRST_YEAR, NEXT_YEAR)

    # Lower-cased source column name -> English column name.
    COLUMN_TRANSLATIONS = {
        'ano': 'year',
        'mes': 'month',
        'senador': 'congressperson_name',
        'tipo_despesa': 'expense_type',
        'cnpj_cpf': 'cnpj_cpf',
        'fornecedor': 'supplier',
        'documento': 'document_id',
        'data': 'date',
        'detalhamento': 'expense_details',
        'valor_reembolsado': 'reimbursement_value',
    }

    # Source expense type -> English expense type. Keys are matched exactly,
    # so the trailing space in the second key is significant.
    EXPENSE_TYPE_TRANSLATIONS = {
        'Aluguel de imóveis para escritório político, compreendendo despesas concernentes a eles.':
            'Rent of real estate for political office, comprising expenses concerning them',
        'Aquisição de material de consumo para uso no escritório político, inclusive aquisição ou locação de software, despesas postais, aquisição de publicações, locação de móveis e de equipamentos. ':
            'Acquisition of consumables for use in the political office, including acquisition or leasing of software, postal expenses, acquisition of publications, rental of furniture and equipment',
        'Contratação de consultorias, assessorias, pesquisas, trabalhos técnicos e outros serviços de apoio ao exercício do mandato parlamentar':
            'Recruitment of consultancies, advisory services, research, technical work and other services in support of the exercise of the parliamentary mandate',
        'Divulgação da atividade parlamentar':
            'Publicity of parliamentary activity',
        'Locomoção, hospedagem, alimentação, combustíveis e lubrificantes':
            'Locomotion, lodging, food, fuels and lubricants',
        'Passagens aéreas, aquáticas e terrestres nacionais':
            'National air, water and land transport',
        'Serviços de Segurança Privada':
            'Private Security Services',
    }

    def __init__(self, path):
        """Store ``path``, the directory every file is read from and written to."""
        self.path = path

    def fetch(self):
        """Download the raw CSV of every year in ``YEAR_RANGE`` into ``path``.

        Any error raised by ``urlretrieve`` (network failure, missing file on
        the server, unwritable destination) propagates to the caller.
        """
        for year in self.YEAR_RANGE:
            url = self.URL.format(year)
            file_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
            urlretrieve(url, file_path)

    def translate(self):
        """Translate every yearly CSV in ``path`` into an xz-compressed CSV."""
        for year in self.YEAR_RANGE:
            csv_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
            self.__translate_file(csv_path)

    def clean(self):
        """Merge the translated yearly files into one reimbursements file.

        The ``date`` column is parsed as datetimes (unparseable values become
        ``NaT``) and ``cnpj_cpf`` is reduced to its digits.

        Returns:
            The path of the written ``federal-senate-reimbursements.xz`` file.
        """
        reimbursement_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')

        # Read everything first and concatenate once: concatenating inside the
        # loop copies the accumulated frame on every iteration.
        # ``cnpj_cpf`` is forced to text so the ``.str`` accessor below always
        # applies and leading zeros are never lost to numeric inference.
        yearly_frames = [
            pd.read_csv(os.path.join(self.path, 'federal-senate-{}.xz'.format(year)),
                        encoding='utf-8',
                        dtype={'cnpj_cpf': str})
            for year in self.YEAR_RANGE
        ]
        dataset = pd.concat(yearly_frames)

        # NOTE(review): no explicit format/dayfirst is given; if the source
        # dates are day-first this parse may be ambiguous - confirm.
        dataset['date'] = pd.to_datetime(dataset['date'], errors='coerce')
        # regex=True is required: recent pandas treats the pattern as a literal
        # string by default, which would leave the punctuation in place.
        dataset['cnpj_cpf'] = dataset['cnpj_cpf'].str.replace(r'\D', '', regex=True)

        dataset.to_csv(reimbursement_path, compression='xz', index=False, encoding='utf-8')

        return reimbursement_path

    def __translate_file(self, csv_path):
        """Translate one raw CSV and write it next to the original as ``.xz``.

        Args:
            csv_path: path of a ``;``-separated, ISO-8859-1 encoded CSV whose
                first line is skipped and whose second line is the header.

        Returns:
            The path of the written xz-compressed CSV.
        """
        # Swap only the extension; str.replace would also rewrite any '.csv'
        # appearing elsewhere in the path (e.g. in a directory name).
        output_file_path = os.path.splitext(csv_path)[0] + '.xz'

        data = pd.read_csv(csv_path,
                           sep=';',
                           encoding='ISO-8859-1',
                           skiprows=1)

        data = data.rename(columns=str.lower).rename(columns=self.COLUMN_TRANSLATIONS)

        expense_types = data['expense_type'].astype('category')
        # Expense types missing from the table are kept as they are instead of
        # aborting the whole translation with a KeyError.
        translated = [self.EXPENSE_TYPE_TRANSLATIONS.get(category, category)
                      for category in expense_types.cat.categories]
        # rename_categories returns a new object; its in-place variant no
        # longer exists in recent pandas.
        data['expense_type'] = expense_types.cat.rename_categories(translated)

        data.to_csv(output_file_path, compression='xz', index=False, encoding='utf-8')

        return output_file_path
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import os | ||
from tempfile import gettempdir | ||
from unittest import main, skipIf, TestCase, TestLoader | ||
|
||
from serenata_toolbox.federal_senate.federal_senate_dataset import FederalSenateDataset | ||
|
||
class TestFederalSenateDataset(TestCase):
    """Integration tests for FederalSenateDataset.

    Every test is skipped unless the RUN_INTEGRATION_TESTS environment
    variable is set to '1'. Files are written to the system temporary
    directory.
    """

    def setUp(self):
        self.path = gettempdir()
        self.subject = FederalSenateDataset(self.path)

    def _assert_yearly_files_exist(self, extension):
        """Assert that one 'federal-senate-<year>.<extension>' exists per year."""
        for year in range(self.subject.FIRST_YEAR, self.subject.NEXT_YEAR):
            name = 'federal-senate-{}.{}'.format(year, extension)
            file_path = os.path.join(self.path, name)
            # assertTrue instead of a bare assert: it survives `python -O`
            # and reports which file is missing.
            self.assertTrue(os.path.exists(file_path),
                            '{} was not created'.format(file_path))

    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
            'Skipping integration test')
    def test_fetch_saves_raw_files(self):
        self.subject.fetch()
        self._assert_yearly_files_exist('csv')

    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
            'Skipping integration test')
    def test_translate_creates_english_versions_for_every_csv(self):
        self.subject.fetch()
        self.subject.translate()
        self._assert_yearly_files_exist('xz')

    @skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
            'Skipping integration test')
    def test_clean_creates_a_reimbursements_file(self):
        self.subject.fetch()
        self.subject.translate()
        self.subject.clean()
        file_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')
        self.assertTrue(os.path.exists(file_path),
                        '{} was not created'.format(file_path))
|
||
# Run the unittest command-line runner when this file is executed as a script.
if __name__ == '__main__':
    main()