|
17 | 17 | opt for the lazy approach (as in Dask), and don't materialize inside either the `read_single` or
|
18 | 18 | `concat` methods.
|
19 | 19 |
|
| 20 | +Existing readers such as PandasReader allow customisation by passing any kwargs through to the underlying |
| 21 | +pandas read method. |
| 22 | +
|
20 | 23 | The user should *not* bake any specific business logic in here -- the preferred approach is to
|
21 |
| -return an object such as data frame as early as possible, and apply any transformations later on. |
| 24 | +return an object such as a (lazy) data frame as early as possible, and apply any transformations later on. |
22 | 25 | """
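For illustration, a sketch of the kwargs pass-through and the transform-late pattern described above. `PandasReader`, `InputFormat`, and the `read_single` signature come from this diff; the `Partition` construction, the file path, and the local fsspec filesystem are assumptions for the example:

```python
import fsspec

# Any kwargs given here override the per-format defaults and are forwarded
# verbatim to the underlying pandas call (pd.read_csv for a CSV partition).
reader = PandasReader(InputFormat.CSV, sep=";", usecols=["id", "amount"])

fs = fsspec.filesystem("file")  # assumed: plain local filesystem
part = Partition(url="/data/2024-01.csv", columns={"month": "2024-01"})  # hypothetical

df = reader.read_single(part, fs)  # return the frame as early as possible...
df = df[df["amount"] > 0]          # ...and apply business logic later, in the caller
```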
|
23 | 26 | from __future__ import annotations
|
24 | 27 |
|
25 | 28 | import json
|
26 | 29 | import logging
|
27 | 30 | from abc import ABC, abstractmethod
|
| 31 | +from collections import defaultdict |
28 | 32 | from collections.abc import Iterable
|
29 | 33 | from concurrent.futures import ThreadPoolExecutor
|
30 | 34 | from enum import Enum, auto, unique
|
@@ -87,31 +91,51 @@ def read_and_concat(
|
87 | 91 |
|
88 | 92 |
|
89 | 93 | class PandasReader(DataReader):
|
| 94 | + """Wraps various pandas read methods (parquet, json, csv, excel) into a single interface. |
| 95 | + Behaviour can be customised by passing any kwargs to the constructor; they are forwarded to the pandas read call. |
| 96 | + """ |
| 97 | + |
| 98 | + def __init__(self, input_format=InputFormat.AUTO, **pdread_kwargs): |
| 99 | + super().__init__(input_format=input_format) |
| 100 | + self.pdread_user_kwargs = pdread_kwargs |
| 101 | + self.pdread_default_kwargs = defaultdict(dict) |
| 102 | + self.pdread_default_kwargs[InputFormat.PARQUET] = { |
| 103 | + "engine": "fastparquet", |
| 104 | + } |
| 105 | + self.pdread_default_kwargs[InputFormat.JSON] = { |
| 106 | + "lines": True, |
| 107 | + } |
| 108 | + self.pdread_default_kwargs[InputFormat.XLSX] = { |
| 109 | + "engine": "openpyxl", |
| 110 | + } |
| 111 | + |
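A minimal sketch of the precedence `read_single` applies below when it merges these dicts -- constructor kwargs win over the per-format defaults ("pyarrow" here is just an illustrative override):

```python
defaults = {"engine": "fastparquet"}            # per-format default (parquet)
user = {"engine": "pyarrow", "columns": ["a"]}  # constructor **pdread_kwargs
merged = {**defaults, **user}                   # later unpacking wins
assert merged == {"engine": "pyarrow", "columns": ["a"]}
```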
90 | 112 | def read_single(self, partition: Partition, fs: AbstractFileSystem) -> pd.DataFrame:
|
91 | 113 | logger.debug(f"read dataframe for partition {partition}")
|
92 | 114 | input_format = self.detect_format(partition.url)
|
93 |
| - # TODO allow for user spec of engine and other params, essentially any quark |
| 115 | + logger.debug(f"detected format {input_format} for partition {partition}") |
94 | 116 | if input_format is InputFormat.PARQUET:
|
95 |
| - reader = lambda fd: pd.read_parquet(fd, engine="fastparquet") # noqa: E731 |
| 117 | + reader = pd.read_parquet |
96 | 118 | elif input_format is InputFormat.JSON:
|
97 |
| - reader = lambda fd: pd.read_json(fd, lines=True) # noqa: E731 |
| 119 | + reader = pd.read_json |
98 | 120 | elif input_format is InputFormat.CSV:
|
99 | 121 | reader = pd.read_csv
|
100 | 122 | elif input_format is InputFormat.XLSX:
|
101 |
| - reader = lambda fd: pd.read_excel(fd, engine="openpyxl") # noqa: E731 |
| 123 | + reader = pd.read_excel |
102 | 124 | elif input_format is InputFormat.AUTO:
|
103 | 125 | raise ValueError(f"partition had format detected as auto -> invalid state. Partition: {partition}")
|
104 | 126 | else:
|
105 | 127 | assert_exhaustive_enum(input_format)
|
106 | 128 |
|
| 129 | + pdread_kwargs = {**self.pdread_default_kwargs[input_format], **self.pdread_user_kwargs} |
| 130 | + logger.debug(f"reader kwargs {pdread_kwargs} for partition {partition}") |
107 | 131 | try:
|
108 | 132 | with fs.open(partition.url, "rb") as fd:
|
109 |
| - df = reader(fd) |
| 133 | + df = reader(fd, **pdread_kwargs) |
110 | 134 | except FileNotFoundError as e:
|
111 | 135 | logger.warning(f"file {partition} reading exception {type(e)}, attempting cache invalidation and reread")
|
112 | 136 | fs.invalidate_cache()
|
113 | 137 | with fs.open(partition.url, "rb") as fd:
|
114 |
| - df = reader(fd) |
| 138 | + df = reader(fd, **pdread_kwargs) |
115 | 139 |
|
116 | 140 | for key, value in partition.columns.items():
|
117 | 141 | df[key] = value
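The loop above broadcasts each partition key to a constant column on the frame; a minimal self-contained illustration (the partition values are made up):

```python
import pandas as pd

df = pd.DataFrame({"amount": [1, 2]})
for key, value in {"month": "2024-01"}.items():  # stand-in for partition.columns
    df[key] = value  # scalar assignment broadcasts to every row
# df now carries a constant "month" column alongside "amount"
```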
|
|