diff --git a/liiatools/common/stream_filters.py b/liiatools/common/stream_filters.py index 895ab8ce..5b3b79e4 100644 --- a/liiatools/common/stream_filters.py +++ b/liiatools/common/stream_filters.py @@ -1,11 +1,12 @@ import logging import xmlschema import tablib +import pandas as pd import xml.etree.ElementTree as ET from io import BytesIO, StringIO from typing import Iterable, Union, Any, Dict, List from pathlib import Path -from tablib import import_book, import_set +from tablib import import_book, import_set, UnsupportedFormat from sfdata_stream_parser import events, collectors from sfdata_stream_parser.checks import type_check @@ -31,6 +32,17 @@ logger = logging.getLogger(__name__) +def _import_set_workaround(data): + """ + Workaround for a bug in tablib that causes it to fail to import + sets of data. + """ + try: + return import_set(data) + except UnsupportedFormat: + return pd.read_csv(data) + + def tablib_parse(source: FileLocator): """ Parse any of the tabular formats supported by TabLib @@ -58,7 +70,7 @@ def tablib_parse(source: FileLocator): pass try: - dataset = import_set(data) + dataset = _import_set_workaround(data) logger.debug("Opened %s as a sheet", filename) return tablib_to_stream(dataset, filename=filename) except Exception as e: @@ -86,6 +98,24 @@ def _tablib_dataset_to_stream(dataset: tablib.Dataset, **kwargs): yield events.EndContainer() +def _pandas_dataframe_to_stream(dataset: pd.DataFrame, **kwargs): + params = {k: v for k, v in kwargs.items() if v is not None} + yield events.StartContainer(**params) + yield events.StartTable(headers=dataset.columns.tolist()) + for r_ix, row in enumerate(dataset.itertuples(index=False)): + yield events.StartRow() + for c_ix, cell in enumerate(row[0:]): + yield events.Cell( + r_ix=r_ix, + c_ix=c_ix, + header=dataset.columns.tolist()[c_ix], + cell=cell, + ) + yield events.EndRow() + yield events.EndTable() + yield events.EndContainer() + + def tablib_to_stream( data: Union[tablib.Dataset, tablib.Databook], filename: str = None ): @@ -105,6 +135,9 @@ def tablib_to_stream( sheet, filename=filename, sheetname=sheet.title ) + elif isinstance(data, pd.DataFrame): + yield from _pandas_dataframe_to_stream(data, filename=filename) + def inherit_property(stream, prop_name: Union[str, Iterable[str]], override=False): """ diff --git a/liiatools/school_census_pipeline/pipeline.py b/liiatools/school_census_pipeline/pipeline.py index bc35fe11..c129f728 100644 --- a/liiatools/school_census_pipeline/pipeline.py +++ b/liiatools/school_census_pipeline/pipeline.py @@ -25,7 +25,6 @@ def process_file( pipeline_config: PipelineConfig, la_code: str, ) -> ProcessResult: - print(f"{file_locator=}") errors = ErrorContainer() year = pl.discover_year(file_locator) if year is None: