Skip to content

Commit

Permalink
fix tablib error
Browse files Browse the repository at this point in the history
  • Loading branch information
Patrick Troy committed Jun 27, 2024
1 parent 30a36e2 commit 449919b
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
37 changes: 35 additions & 2 deletions liiatools/common/stream_filters.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import logging
import xmlschema
import tablib
import pandas as pd
import xml.etree.ElementTree as ET
from io import BytesIO, StringIO
from typing import Iterable, Union, Any, Dict, List
from pathlib import Path
from tablib import import_book, import_set
from tablib import import_book, import_set, UnsupportedFormat

from sfdata_stream_parser import events, collectors
from sfdata_stream_parser.checks import type_check
Expand All @@ -31,6 +32,17 @@
logger = logging.getLogger(__name__)


def _import_set_workaround(data):
    """
    Import tabular data with tablib, falling back to pandas' CSV reader.

    Workaround for a bug in tablib that causes it to fail to import
    sets of data.

    :param data: The input handed to ``import_set``. For the pandas fallback,
        raw ``bytes``/``bytearray`` content is wrapped in a ``BytesIO`` and
        seekable file-like objects are rewound; any other value (including a
        ``str``, which pandas interprets as a path or URL) is passed to
        ``pd.read_csv`` unchanged.
    :return: Whatever ``import_set`` returns when tablib recognises the
        format, otherwise the ``pandas.DataFrame`` built by ``pd.read_csv``.
    :raises Exception: Only ``UnsupportedFormat`` triggers the fallback; any
        other tablib error, and any error raised by ``pd.read_csv``, propagates
        to the caller.
    """
    try:
        return import_set(data)
    except UnsupportedFormat:
        if isinstance(data, (bytes, bytearray)):
            # read_csv expects a path or a file-like object, not raw content,
            # so give it an in-memory binary stream over the same bytes.
            data = BytesIO(data)
        else:
            # Defensive rewind: do not rely on the failed tablib attempt
            # having left a stream positioned at its start.
            seekable = getattr(data, "seekable", None)
            if callable(seekable) and seekable():
                data.seek(0)
        return pd.read_csv(data)


def tablib_parse(source: FileLocator):
"""
Parse any of the tabular formats supported by TabLib
Expand Down Expand Up @@ -58,7 +70,7 @@ def tablib_parse(source: FileLocator):
pass

try:
dataset = import_set(data)
dataset = _import_set_workaround(data)
logger.debug("Opened %s as a sheet", filename)
return tablib_to_stream(dataset, filename=filename)
except Exception as e:
Expand Down Expand Up @@ -86,6 +98,24 @@ def _tablib_dataset_to_stream(dataset: tablib.Dataset, **kwargs):
yield events.EndContainer()


def _pandas_dataframe_to_stream(dataset: pd.DataFrame, **kwargs):
    """
    Convert a pandas DataFrame into a stream of table events.

    The emitted sequence is: ``StartContainer``, ``StartTable`` (carrying the
    column labels as ``headers``), then for every row a ``StartRow``, one
    ``Cell`` per column and an ``EndRow``, and finally ``EndTable`` and
    ``EndContainer``.

    :param dataset: The DataFrame whose rows are emitted. The index is not
        included in the output.
    :param kwargs: Properties attached to the ``StartContainer`` event;
        entries whose value is ``None`` are dropped.
    :return: A generator of events.
    """
    params = {k: v for k, v in kwargs.items() if v is not None}

    # Build the header list once instead of once per cell; the column labels
    # do not change while iterating.
    headers = dataset.columns.tolist()

    yield events.StartContainer(**params)
    # Hand StartTable its own copy so the list used for cell lookups below is
    # never shared with an emitted event.
    yield events.StartTable(headers=list(headers))
    for r_ix, row in enumerate(dataset.itertuples(index=False)):
        yield events.StartRow()
        # With index=False each row tuple has exactly one value per column,
        # so zipping with the headers pairs every cell with its label.
        for c_ix, (header, cell) in enumerate(zip(headers, row)):
            yield events.Cell(
                r_ix=r_ix,
                c_ix=c_ix,
                header=header,
                cell=cell,
            )
        yield events.EndRow()
    yield events.EndTable()
    yield events.EndContainer()


def tablib_to_stream(
data: Union[tablib.Dataset, tablib.Databook], filename: str = None
):
Expand All @@ -105,6 +135,9 @@ def tablib_to_stream(
sheet, filename=filename, sheetname=sheet.title
)

elif isinstance(data, pd.DataFrame):
yield from _pandas_dataframe_to_stream(data, filename=filename)


def inherit_property(stream, prop_name: Union[str, Iterable[str]], override=False):
"""
Expand Down
1 change: 0 additions & 1 deletion liiatools/school_census_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def process_file(
pipeline_config: PipelineConfig,
la_code: str,
) -> ProcessResult:
print(f"{file_locator=}")
errors = ErrorContainer()
year = pl.discover_year(file_locator)
if year is None:
Expand Down

0 comments on commit 449919b

Please sign in to comment.