Merge pull request #53 from datasciencebr/anaschwendler-introduce-federal-senate-script

Introducing federal senate script
okfn-brasil · May 16, 2017 · 65ef7bf · 65ef7bf
2 parents 13b1b42 + b1c7b7d
commit 65ef7bf
Show file tree

Hide file tree

Showing 4 changed files with 141 additions and 0 deletions.
diff --git a/serenata_toolbox/chamber_of_deputies/__init__.py b/serenata_toolbox/chamber_of_deputies/__init__.py
diff --git a/serenata_toolbox/federal_senate/__init__.py b/serenata_toolbox/federal_senate/__init__.py
diff --git a/serenata_toolbox/federal_senate/federal_senate_dataset.py b/serenata_toolbox/federal_senate/federal_senate_dataset.py
@@ -0,0 +1,100 @@
+import os.path
+from urllib.request import urlretrieve
+import numpy as np
+import pandas as pd
+
+from datetime import date
+
+class FederalSenateDataset:
+    """Fetch, translate and consolidate Federal Senate reimbursement data.
+
+    One CSV per year in ``YEAR_RANGE`` is downloaded from ``URL`` into
+    ``path``; each file is translated to English and stored xz-compressed,
+    and finally all years are merged into a single reimbursements file.
+    """
+
+    URL = 'http://www.senado.gov.br/transparencia/LAI/verba/{}.csv'
+    FIRST_YEAR = 2008
+    # Evaluated once, when the class body runs (i.e. when the module is imported).
+    NEXT_YEAR = date.today().year + 1
+
+    YEAR_RANGE = range(FIRST_YEAR, NEXT_YEAR)
+
+    def __init__(self, path):
+        """Store ``path``, the directory every file is read from / written to."""
+        self.path = path
+
+    def fetch(self):
+        """Download the raw CSV of every year in ``YEAR_RANGE`` into ``path``.
+
+        Files are saved as ``federal-senate-<year>.csv``. Errors raised by
+        ``urlretrieve`` are not caught here.
+        """
+        for year in self.YEAR_RANGE:
+            url = self.URL.format(year)
+            file_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
+            urlretrieve(url, file_path)
+
+    def translate(self):
+        """Translate every yearly CSV found in ``path`` into an ``.xz`` file."""
+        for year in self.YEAR_RANGE:
+            csv_path = os.path.join(self.path, 'federal-senate-{}.csv'.format(year))
+            self.__translate_file(csv_path)
+
+    def clean(self):
+        """Merge the translated yearly files into one reimbursements file.
+
+        Returns:
+            Path of the xz-compressed CSV that was written.
+        """
+        reimbursement_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')
+
+        # Read every year first and concatenate once: growing a DataFrame
+        # inside the loop re-copies all accumulated rows on each iteration.
+        frames = [
+            pd.read_csv(os.path.join(self.path, 'federal-senate-{}.xz'.format(year)),
+                        encoding='utf-8')
+            for year in self.YEAR_RANGE
+        ]
+        dataset = pd.concat(frames)
+
+        # NOTE(review): the source dates are presumably dd/mm/yyyy; confirm
+        # whether ``dayfirst=True`` is needed here.
+        dataset['date'] = pd.to_datetime(dataset['date'], errors='coerce')
+        # ``regex=True`` must be explicit: recent pandas versions treat the
+        # pattern as a literal string by default, which would strip nothing.
+        dataset['cnpj_cpf'] = dataset['cnpj_cpf'].str.replace(r'\D', '', regex=True)
+
+        dataset.to_csv(reimbursement_path, compression='xz', index=False, encoding='utf-8')
+
+        return reimbursement_path
+
+    def __translate_file(self, csv_path):
+        """Translate one raw yearly CSV to English and save it as ``.xz``.
+
+        Args:
+            csv_path: Path of a raw CSV (';'-separated, ISO-8859-1 encoded,
+                whose first line is skipped).
+
+        Returns:
+            Path of the xz-compressed, UTF-8 CSV that was written.
+        """
+        # Swap only the extension; ``str.replace`` would also rewrite a
+        # '.csv' occurring earlier in the path (e.g. in a directory name).
+        output_file_path = os.path.splitext(csv_path)[0] + '.xz'
+
+        data = pd.read_csv(csv_path,
+                           sep=';',
+                           encoding='ISO-8859-1',
+                           skiprows=1)
+
+        data.columns = data.columns.str.lower()
+
+        data.rename(columns={
+            'ano': 'year',
+            'mes': 'month',
+            'senador': 'congressperson_name',
+            'tipo_despesa': 'expense_type',
+            'cnpj_cpf': 'cnpj_cpf',
+            'fornecedor': 'supplier',
+            'documento': 'document_id',
+            'data': 'date',
+            'detalhamento': 'expense_details',
+            'valor_reembolsado': 'reimbursement_value',
+        }, inplace=True)
+
+        categories = {
+            'Aluguel de imóveis para escritório político, compreendendo despesas concernentes a eles.':
+                'Rent of real estate for political office, comprising expenses concerning them',
+            'Aquisição de material de consumo para uso no escritório político, inclusive aquisição ou locação de software, despesas postais, aquisição de publicações, locação de móveis e de equipamentos. ':
+                'Acquisition of consumables for use in the political office, including acquisition or leasing of software, postal expenses, acquisition of publications, rental of furniture and equipment',
+            'Contratação de consultorias, assessorias, pesquisas, trabalhos técnicos e outros serviços de apoio ao exercício do mandato parlamentar':
+                'Recruitment of consultancies, advisory services, research, technical work and other services in support of the exercise of the parliamentary mandate',
+            'Divulgação da atividade parlamentar':
+                'Publicity of parliamentary activity',
+            'Locomoção, hospedagem, alimentação, combustíveis e lubrificantes':
+                'Locomotion, lodging, food, fuels and lubricants',
+            'Passagens aéreas, aquáticas e terrestres nacionais':
+                'National air, water and land transport',
+            'Serviços de Segurança Privada':
+                'Private Security Services'
+        }
+
+        # Compare labels with surrounding whitespace removed, so a stray
+        # trailing space in the data (or in a key above) cannot break the lookup.
+        translations = {label.strip(): english for label, english in categories.items()}
+
+        def translate_expense_type(label):
+            # Missing values (NaN) and labels without a known translation
+            # are passed through unchanged instead of aborting the whole file.
+            if not isinstance(label, str):
+                return label
+            return translations.get(label.strip(), label)
+
+        # Mapping the values (rather than renaming categories in place) also
+        # works when two raw labels collapse into the same translation.
+        data['expense_type'] = \
+            data['expense_type'].map(translate_expense_type).astype('category')
+
+        data.to_csv(output_file_path, compression='xz', index=False, encoding='utf-8')
+
+        return output_file_path
+
diff --git a/tests/test_federal_senate_dataset.py b/tests/test_federal_senate_dataset.py
@@ -0,0 +1,41 @@
+import os
+from tempfile import gettempdir
+from unittest import main, skipIf, TestCase, TestLoader
+
+from serenata_toolbox.federal_senate.federal_senate_dataset import FederalSenateDataset
+
+@skipIf(os.environ.get('RUN_INTEGRATION_TESTS') != '1',
+        'Skipping integration test')
+class TestFederalSenateDataset(TestCase):
+    """Integration tests for ``FederalSenateDataset``.
+
+    Every test calls ``fetch()``, so the whole class is skipped unless the
+    ``RUN_INTEGRATION_TESTS`` environment variable is set to ``'1'``.
+    """
+
+    def setUp(self):
+        self.path = gettempdir()
+        self.subject = FederalSenateDataset(self.path)
+
+    def _assert_yearly_files_exist(self, extension):
+        """Assert that ``federal-senate-<year>.<extension>`` exists for every year."""
+        for year in self.subject.YEAR_RANGE:
+            file_path = os.path.join(
+                self.path, 'federal-senate-{}.{}'.format(year, extension))
+            # TestCase assertions survive ``python -O`` (bare ``assert``
+            # statements do not) and report which file is missing.
+            self.assertTrue(os.path.exists(file_path),
+                            '{} was not created'.format(file_path))
+
+    def test_fetch_saves_raw_files(self):
+        self.subject.fetch()
+        self._assert_yearly_files_exist('csv')
+
+    def test_translate_creates_english_versions_for_every_csv(self):
+        self.subject.fetch()
+        self.subject.translate()
+        self._assert_yearly_files_exist('xz')
+
+    def test_clean_creates_a_reimbursements_file(self):
+        self.subject.fetch()
+        self.subject.translate()
+        self.subject.clean()
+        file_path = os.path.join(self.path, 'federal-senate-reimbursements.xz')
+        self.assertTrue(os.path.exists(file_path),
+                        '{} was not created'.format(file_path))
+
+if __name__ == '__main__':  # true only when this file is executed as a script
+    main()  # unittest's ``main``: runs the tests defined in this module