From b6e4e025bc2e41ae733f5477801c101e2590eec5 Mon Sep 17 00:00:00 2001 From: asakho8 Date: Fri, 7 Nov 2025 16:39:44 -0500 Subject: [PATCH 1/2] ENH: add dtype_from_format option to preserve Excel text formatting --- doc/source/user_guide/io.rst | 33 ++++ pandas/io/excel/_base.py | 233 +++++++++++++++++++++++++- pandas/io/excel/_calamine.py | 10 +- pandas/io/excel/_odfreader.py | 10 +- pandas/io/excel/_openpyxl.py | 26 ++- pandas/io/excel/_pyxlsb.py | 6 +- pandas/io/excel/_xlrd.py | 74 +++++++- pandas/tests/io/excel/test_readers.py | 60 +++++++ tmp.xlsx | Bin 0 -> 4889 bytes 9 files changed, 428 insertions(+), 24 deletions(-) create mode 100644 tmp.xlsx diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5b25462568cfa..3e166248c27a5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3378,6 +3378,39 @@ Reading Excel files In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. +Text-formatted cells +++++++++++++++++++++ + +Excel workbooks often contain values that are stored as numbers but formatted as +``Text`` to preserve literal strings such as postal codes or account numbers +with leading zeros. By default, :func:`~pandas.read_excel` still converts those +cells to numeric types, which can alter the original representation. Pass +``dtype_from_format=True`` to maintain the Excel text formatting when parsing +each sheet. When enabled, pandas forces any columns or index levels that are +formatted as text in the source workbook to use string dtypes in the resulting +``Series``/``Index``. + +This behavior currently applies to the ``openpyxl`` and ``xlrd`` engines. Other +engines simply ignore the flag until text format detection is implemented for +them. + +.. ipython:: python + + df = pd.DataFrame({"zip_code": ["00601", "02108", "10118"]}) + with pd.ExcelWriter("zips.xlsx", engine="openpyxl") as writer: + df.to_excel(writer, index=False) + for cell in writer.sheets["Sheet1"]["A"]: + cell.number_format = "@" # Excel's Text format + + parsed = pd.read_excel("zips.xlsx", dtype_from_format=True) + parsed.dtypes + +.. ipython:: python + :suppress: + + import os + os.remove("zips.xlsx") + When using the ``engine_kwargs`` parameter, pandas will pass these arguments to the engine. For this, it is important to know which function pandas is using internally. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 137ce208c5da1..ecba3ccfb3e62 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,6 @@ from __future__ import annotations +from bisect import bisect_left from collections.abc import ( Callable, Hashable, @@ -29,7 +30,10 @@ from pandas._config import config -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.parsers import STR_NA_VALUES from pandas.compat._optional import ( get_version, @@ -163,6 +167,9 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. +dtype_from_format : bool, default False + Preserve cells formatted as ``Text`` in Excel as strings instead of + coercing them to numeric dtypes. engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. Engine compatibility : @@ -376,6 +383,7 @@ def read_excel( | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., + dtype_from_format: bool = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., @@ -413,6 +421,7 @@ def read_excel( | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., + dtype_from_format: bool = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., @@ -450,6 +459,7 @@ def read_excel( | Callable[[HashableT], bool] | None = None, dtype: DtypeArg | None = None, + dtype_from_format: bool = False, engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, @@ -497,6 +507,7 @@ def read_excel( index_col=index_col, usecols=usecols, dtype=dtype, + dtype_from_format=dtype_from_format, converters=converters, true_values=true_values, false_values=false_values, @@ -589,7 +600,13 @@ def get_sheet_by_name(self, name: str): def get_sheet_by_index(self, index: int): raise NotImplementedError - def get_sheet_data(self, sheet, rows: int | None = None): + def get_sheet_data( + self, + sheet, + rows: int | None = None, + *, + dtype_from_format: bool = False, + ): raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -704,6 +721,7 @@ def parse( index_col: int | Sequence[int] | None = None, usecols=None, dtype: DtypeArg | None = None, + dtype_from_format: bool = False, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, skiprows: Sequence[int] | int | Callable[[int], object] | None = None, @@ -754,7 +772,11 @@ def parse( sheet = self.get_sheet_by_index(asheetname) file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) - data = self.get_sheet_data(sheet, file_rows_needed) + data, text_formatted_cols = self.get_sheet_data( + sheet, + file_rows_needed, + dtype_from_format=dtype_from_format, + ) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() @@ -766,6 +788,9 @@ def parse( output = self._parse_sheet( data=data, + text_formatted_cols=( + text_formatted_cols if dtype_from_format else None + ), output=output, asheetname=asheetname, header=header, @@ -785,6 +810,7 @@ def parse( comment=comment, skipfooter=skipfooter, dtype_backend=dtype_backend, + dtype_from_format=dtype_from_format, **kwds, ) @@ -817,11 +843,16 @@ def _parse_sheet( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, + text_formatted_cols: set[int] | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + dtype_from_format: bool = False, **kwds, ): is_list_header = False is_len_one_list_header = False + if dtype_from_format and text_formatted_cols: + self._coerce_text_data(data, text_formatted_cols) + if is_list_like(header): assert isinstance(header, Sequence) is_list_header = True @@ -933,6 +964,14 @@ def _parse_sheet( **kwds, ) + text_positions: set[int] = set() + if dtype_from_format and text_formatted_cols: + text_positions = self._resolve_text_positions( + parser, text_formatted_cols + ) + if text_positions: + self._inject_text_converters(parser, text_positions) + output[asheetname] = parser.read(nrows=nrows) if header_names: @@ -940,6 +979,9 @@ def _parse_sheet( header_names ) + if dtype_from_format and text_positions: + self._finalize_text_columns(output[asheetname], parser, text_positions) + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() @@ -950,6 +992,186 @@ def _parse_sheet( return output + @staticmethod + def _text_format_converter(value): + if libmissing.checknull(value): + return value + return str(value) + + @staticmethod + def _parser_engine(parser): + return getattr(parser, "_engine", parser) + + @classmethod + def _parser_attr(cls, parser, attribute: str): + if hasattr(parser, attribute): + return getattr(parser, attribute) + engine = cls._parser_engine(parser) + if engine is not parser and hasattr(engine, attribute): + return getattr(engine, attribute) + return None + + def _resolve_text_positions( + self, parser, text_formatted_cols: set[int] + ) -> set[int]: + if not text_formatted_cols: + return set() + + orig_names = self._parser_attr(parser, "orig_names") or [] + col_indices = self._parser_attr(parser, "_col_indices") + if col_indices is None: + max_pos = len(orig_names) + return {idx for idx in text_formatted_cols if idx < max_pos} + + positions: set[int] = set() + for idx in text_formatted_cols: + pos = bisect_left(col_indices, idx) + if pos < len(col_indices) and col_indices[pos] == idx: + positions.add(pos) + return positions + + def _inject_text_converters(self, parser, text_positions: set[int]) -> None: + if not text_positions: + return + + target = self._parser_engine(parser) + existing_converters = getattr(target, "converters", None) + if existing_converters is None: + target.converters = {} + existing_clean: dict = {} + else: + target.converters = dict(existing_converters) + existing_clean = target._clean_mapping(existing_converters) + + orig_names = self._parser_attr(parser, "orig_names") or [] + + for pos in text_positions: + if pos >= len(orig_names): + continue + label = orig_names[pos] + if existing_clean and label in existing_clean: + continue + target.converters[pos] = self._text_format_converter + + def _finalize_text_columns( + self, + frame: DataFrame, + parser, + text_positions: set[int], + ) -> None: + if not text_positions or frame.empty: + return + + orig_names = self._parser_attr(parser, "orig_names") or [] + total_positions = len(orig_names) + if total_positions == 0: + return + + index_positions = self._resolve_index_positions(parser, total_positions) + + data_position_map: dict[int, int] = {} + df_col_index = 0 + for pos in range(total_positions): + if pos in index_positions: + continue + if df_col_index >= frame.shape[1]: + break + data_position_map[pos] = df_col_index + df_col_index += 1 + + for pos in text_positions: + if pos in index_positions: + continue + df_pos = data_position_map.get(pos) + if df_pos is None: + continue + frame.iloc[:, df_pos] = frame.iloc[:, df_pos].map( + self._text_format_converter + ) + + index_levels = self._index_levels_for_positions( + index_positions, text_positions, total_positions + ) + if index_levels: + frame.index = self._convert_index_labels(frame.index, index_levels) + + def _coerce_text_data(self, data: list, text_formatted_cols: set[int]) -> None: + if not text_formatted_cols or not data: + return + + for row in data: + if not row: + continue + for col_idx in text_formatted_cols: + if col_idx >= len(row): + continue + row[col_idx] = self._text_format_converter(row[col_idx]) + + def _resolve_index_positions(self, parser, total_positions: int) -> set[int]: + index_col = self._parser_attr(parser, "index_col") + if index_col is None or index_col is False: + return set() + + if is_list_like(index_col) and not isinstance(index_col, (str, bytes)): + entries = list(index_col) + else: + entries = [index_col] + + orig_names = self._parser_attr(parser, "orig_names") + positions: set[int] = set() + for entry in entries: + if isinstance(entry, int): + if 0 <= entry < total_positions: + positions.add(entry) + elif orig_names is not None: + try: + pos = orig_names.index(entry) + except ValueError: + continue + positions.add(pos) + return positions + + def _index_levels_for_positions( + self, + index_positions: set[int], + text_positions: set[int], + total_positions: int, + ) -> list[int]: + if not index_positions: + return [] + + position_to_level: dict[int, int] = {} + level = 0 + for pos in range(total_positions): + if pos in index_positions: + position_to_level[pos] = level + level += 1 + + ordered_levels: list[int] = [] + for pos in sorted(text_positions): + level_idx = position_to_level.get(pos) + if level_idx is not None and level_idx not in ordered_levels: + ordered_levels.append(level_idx) + return ordered_levels + + def _convert_index_labels(self, index, levels_to_convert: list[int]): + if not levels_to_convert: + return index + + converter = self._text_format_converter + if getattr(index, "nlevels", 1) == 1: + return index.map(converter) + + levels_set = set(levels_to_convert) + tuples = [] + for value in index.tolist(): + mutable = list(value) + for level in levels_set: + if level < len(mutable): + mutable[level] = converter(mutable[level]) + tuples.append(tuple(mutable)) + return type(index).from_tuples(tuples, names=index.names) + @doc(storage_options=_shared_docs["storage_options"]) class ExcelWriter(Generic[_WorkbookT]): @@ -1624,6 +1846,7 @@ def parse( index_col: int | Sequence[int] | None = None, usecols=None, converters=None, + dtype_from_format: bool = False, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, skiprows: Sequence[int] | int | Callable[[int], object] | None = None, @@ -1686,6 +1909,9 @@ def parse( either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. + dtype_from_format : bool, default False + Preserve cells formatted as ``Text`` in Excel as strings instead of + coercing them to numeric dtypes. true_values : list, default None Values to consider as True. false_values : list, default None @@ -1773,6 +1999,7 @@ def parse( index_col=index_col, usecols=usecols, converters=converters, + dtype_from_format=dtype_from_format, true_values=true_values, false_values=false_values, skiprows=skiprows, diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 0bdd2b42aad51..a68e93e0e9c20 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -98,8 +98,12 @@ def get_sheet_by_index(self, index: int) -> CalamineSheet: return self.book.get_sheet_by_index(index) def get_sheet_data( - self, sheet: CalamineSheet, file_rows_needed: int | None = None - ) -> list[list[Scalar | NaTType | time]]: + self, + sheet: CalamineSheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar | NaTType | time]], set[int]]: def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: if isinstance(value, float): val = int(value) @@ -121,4 +125,4 @@ def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: ) data = [[_convert_cell(cell) for cell in row] for row in rows] - return data + return data, set() diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index f79417d11080d..7df9b30aadfd0 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -100,8 +100,12 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar | NaTType]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar | NaTType]], set[int]]: """ Parse an ODF Table into a list of lists """ @@ -161,7 +165,7 @@ def get_sheet_data( if len(row) < max_row_len: row.extend([self.empty_value] * (max_row_len - len(row))) - return table + return table, set() def _get_row_repeat(self, row) -> int: """ diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 867d11583dcc0..7d26bd0ef4442 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -589,6 +589,15 @@ def get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] + @staticmethod + def _cell_is_text_formatted(cell) -> bool: + number_format = getattr(cell, "number_format", None) + if number_format == "@": + return True + + format_id = getattr(cell, "number_format_id", None) + return format_id == 49 + def _convert_cell(self, cell) -> Scalar: from openpyxl.cell.cell import ( TYPE_ERROR, @@ -608,15 +617,24 @@ def _convert_cell(self, cell) -> Scalar: return cell.value def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: if self.book.read_only: sheet.reset_dimensions() data: list[list[Scalar]] = [] + text_formatted_cols: set[int] = set() last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): - converted_row = [self._convert_cell(cell) for cell in row] + converted_row: list[Scalar] = [] + for col_idx, cell in enumerate(row): + if dtype_from_format and self._cell_is_text_formatted(cell): + text_formatted_cols.add(col_idx) + converted_row.append(self._convert_cell(cell)) while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() @@ -639,4 +657,4 @@ def get_sheet_data( for data_row in data ] - return data + return data, (text_formatted_cols if dtype_from_format else set()) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index a6e42616c2043..a61225e8bb9a1 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -98,7 +98,9 @@ def get_sheet_data( self, sheet, file_rows_needed: int | None = None, - ) -> list[list[Scalar]]: + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: data: list[list[Scalar]] = [] previous_row_number = -1 # When sparse=True the rows can have different lengths and empty rows are @@ -124,4 +126,4 @@ def get_sheet_data( data_row + (max_width - len(data_row)) * empty_cell for data_row in data ] - return data + return data, set() diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 5d39a840336eb..5d1dde880dad3 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -76,9 +76,48 @@ def get_sheet_by_index(self, index): self.raise_if_bad_sheet_by_index(index) return self.book.sheet_by_index(index) + @staticmethod + def _maybe_mark_text_column( + text_columns: set[int], + sheet, + row_idx: int, + col_idx: int, + dtype_from_format: bool, + format_map, + xf_list, + ) -> bool: + if not dtype_from_format or not xf_list: + return False + + try: + xf_index = sheet.cell_xf_index(row_idx, col_idx) + except AttributeError: + return False + + if xf_index is None or xf_index >= len(xf_list): + return False + + xf = xf_list[xf_index] + format_key = getattr(xf, "format_key", None) + if format_key is None: + return False + + cell_format = format_map.get(format_key) + if cell_format is None: + return False + + if getattr(cell_format, "format_str", "") == "@": + text_columns.add(col_idx) + return True + return False + def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -88,6 +127,9 @@ def get_sheet_data( ) epoch1904 = self.book.datemode + text_formatted_cols: set[int] = set() + format_map = getattr(self.book, "format_map", {}) if dtype_from_format else {} + xf_list = getattr(self.book, "xf_list", []) if dtype_from_format else [] def _parse_cell(cell_contents, cell_typ): """ @@ -131,10 +173,24 @@ def _parse_cell(cell_contents, cell_typ): nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - return [ - [ - _parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) - ] - for i in range(nrows) - ] + + rows: list[list[Scalar]] = [] + for i in range(nrows): + parsed_row: list[Scalar] = [] + row_values = sheet.row_values(i) + row_types = sheet.row_types(i) + for j, (value, typ) in enumerate(zip(row_values, row_types)): + if dtype_from_format: + self._maybe_mark_text_column( + text_formatted_cols, + sheet, + i, + j, + dtype_from_format, + format_map, + xf_list, + ) + parsed_row.append(_parse_cell(value, typ)) + rows.append(parsed_row) + + return rows, (text_formatted_cols if dtype_from_format else set()) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fca63b1709dce..6599e5428744a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -157,6 +157,66 @@ def xfail_datetimes_with_pyxlsb(engine, request): ) +@td.skip_if_no("openpyxl") +def test_dtype_from_format_openpyxl(tmp_path): + from openpyxl import Workbook + + path = tmp_path / "text_format.xlsx" + wb = Workbook() + ws = wb.active + ws.title = "Sheet1" + ws.append(["id", "value"]) + ws.append([123, 1]) + ws.append([456, 2]) + + for cell in ws["A"]: + cell.number_format = "@" + + wb.save(path) + + df_default = pd.read_excel(path, engine="openpyxl") + assert df_default["id"].tolist() == [123, 456] + + df_text = pd.read_excel(path, engine="openpyxl", dtype_from_format=True) + assert df_text["id"].tolist() == ["123", "456"] + + df_index = pd.read_excel( + path, engine="openpyxl", dtype_from_format=True, index_col="id" + ) + assert df_index.index.tolist() == ["123", "456"] + + +@td.skip_if_no("xlrd") +@td.skip_if_no("xlwt") +def test_dtype_from_format_xlrd(tmp_path): + import xlwt + + path = tmp_path / "text_format.xls" + wb = xlwt.Workbook() + ws = wb.add_sheet("Sheet1") + text_style = xlwt.easyxf(num_format_str="@") + + ws.write(0, 0, "id") + ws.write(0, 1, "value") + ws.write(1, 0, 123, text_style) + ws.write(1, 1, 1) + ws.write(2, 0, 456, text_style) + ws.write(2, 1, 2) + + wb.save(str(path)) + + df_default = pd.read_excel(path, engine="xlrd") + assert df_default["id"].tolist() == [123, 456] + + df_text = pd.read_excel(path, engine="xlrd", dtype_from_format=True) + assert df_text["id"].tolist() == ["123", "456"] + + df_index = pd.read_excel( + path, engine="xlrd", dtype_from_format=True, index_col="id" + ) + assert df_index.index.tolist() == ["123", "456"] + + class TestReaders: @pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) def test_read_excel_type_check(self, col, tmp_excel, read_ext): diff --git a/tmp.xlsx b/tmp.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..23da70baf534328cead4d978ecf2eabb7a8db6ae GIT binary patch literal 4889 zcmZ`-1yodB*B)9*7>1CcBn3el>6V5eL>L$alvZNsAqNlvDe3MQKw3#bq!ExALO^K= z0R<@?Lcnj-^?$zdzu&#k761TT07M1Z!_@orG*fU-!?;0> z8#b=i2JWtK523qoxFEvWNe`n(-X%;yy!gt~PZmnl3)4h7!}lX>Y?@qMy`7sP6@vTUoa_xt3RlG$MTBAaZs^*GEW}hFn0;!# zt5L{{_e`PiW{V)HW#;$z?7}@=l1Oc289}H+CJe4Q$ZI5wSJ6yuHU+-srJ}~3v#Tz2 zc$jg_XeVy<2`| zSvbPqFCb&8suEM&*0;wG(<+T*^3FJUF+IP{3cUiZtCp(CY|U|4G1u%|xH82SWO|n& zW^M-Q6ad^bKlEalJ4a5&?QBkJVgMlf3IG7Yk%Mp&@^r9uw*5Ve{G?~g9Of}CMH{qT zGlH*CxMC7h|N3fbaeviY3xn?@q@sH&sh%HWU(;)@drM0<8hDsszwl$?YsIB-PMo#Dn>j% zJMvsPt$t9vLGPfVzWmvbhOS{%Y5Xz(3iqCTGe(eV!FTMH41UkjI;VVl2l_?&G4QOf zmvGW?j4@SQ-90Cs=psO}XH?f9phmedYOyPE>~{PlhOfZ@d?E97$Ty8p_TJf=({E|5 z1I1!~Ie{|h2gyjs3cS*-)+|iYiO2d%UTM;lo(dY?CP2FTofCF9;);tb$X$lnUL^p7 z#U_++i(fohp zkB=UsPTxsN)JL(m+GdN=pmca?mlnjIN(l5$B2qebtFq--Ks3Y2r6{`bH6uv3lT%S*1jhg4OHI;RG-A zWWq&YqJ<>xU37Fj@D*#gly>G;p&OL&G}oeAt8+bc$nLZ`D11W9Ng(b(<1K^s3KdVO z;)bX=Nl~U5>B}&a`*}|z#1+a|MahVETUZrV{Sai5seNq*)uH})KSrk|yDAPtJEW0< zftZr*map;!V@;Brf%cu_(Xe@S7g*2tYLV$FaY0^dQ}^N5{l+y(eD9?5IyO%o6moaB zj^A^I)nuAo|YAP(=pwu5x)MG>Gmmo2I92NPL*5&>P=S7Q@aARi#Ar*@<|@n7~=04 z-Leva^%3yL7Y69YsI9}-(r#NWk0vh9pllmlYWRE@ojHu8yByO&_7#KkwsoDiSTMI> zd$sNNL zzur-;4#LhYAG;*UYU*bD_5$%d)$ln}n2G5bL{|*HB*qhi@P?BIZtUQP+Mt+&(~rqt z?oqVx(V?ppq>*X@55plAW85^fQMD9A>GpHUxpw8pjKy2|dIov*{i2LT*5?r(S$@r3YL!2r zldMOg3)1iznRx<>k01fn;-JpJsHk9I#l7|yHTp-~*|PQiyCIjeP9|rHv2%+Q0dFhJ zOKkXoiaI0#Y>C9yF|e=VmQQH!RiWyt`ldxFe1oC;6a{ut3lVCWV6lnlO6gcKSLF$% zRmGXNlnz)*apnXf3z3Y*>qRRNGP&8aZOYoNp12QN(Xa6S&aU~H_Xl0^0Du)-cD?jF zf%v$(Kk#s{we|E6`nCMZB3?!_E)Rt%PE&Z<@m)7_b9{>}< z!EJVlPzTjq<_M;$tJ1QbE2NchqO7Yjnmwsh{-jXg;YYf8!>oX+R@?fdo-_aM7p zJ@qzkS=ESyhymAEw{~+41*GO7^~lU)y%R2ysweZl!yxcsxROeEZb)nCLrTq!$G4d* z?IAMS$6e)tcaf}iKF^t(AH}BTar9p;ph{jb*fCjx90ZE%EusGg(1cv!E3OZcM2$0WbU{G*PlI9AzeX zi?nn-@(xg)^X}J(9hbT`_81-qgz9S^*}VhgGK#i=n}*C3^yUjga{4$#|KcaR5m|>D zRm9KT7pPtyTkGr!N7%tXxy zQ=ig(pejpjmT-H#Z=jk#lKN;aqWx-J_61KjTVJi?ZxfnvIh*(CUwkdU-dQ9qV=o9^ zcJlK{IN9xK;k4)Wvb4Rj_5zcPjR2We{;-m%mLCIGr0?ta7YEu)Kk&QEr)3(grc4Kt znQ{4=?>+c!#fo_AsZ+w}7D~boEAV`M2bQt4k}+_#B7@eSzsmv@)^_87jG2TpmK$B# z6|yORx_v=tnd}*}SMo?-Xj0#W52OBu?5oKgSEJ*$P}XR-3ZEdLROb;HN71S0MgXzK zkl@fnsS{6F1l2Ri#t+zP^!l_5mut=ysGD5pct#xldbZ)b1EDN}iVYBWm>0y5kF20?aJ4brPUOx(T>mn(STTC{^nq}kB6L2= z1DZF(cb0S9zh^l#UYDO$J@?INk0$-))$e$m*X&83v4wf6JgG=>09RW!poe4|54PXr zv@a~|q*9K_>sh4?V}0f$l~cy6Nq8U3O3W-+-AL^fKU7E^p;X*Qv8G|0sd#q7pU7A! zAs%mbrKk%-dd4nTL74>zVnAll>>uU166X3nwX>62HnHxLyjPwXX|lKoKr(M0^WSka z`{-+E#ZvXzhFD^QSELe>XQ^Wo0ZXMXvs#j+Oy`J$+s#7C?2ed=c+0gMhxpER%+96 zeeRU&7u{Kznlza@{`nE1wlWzRwuAKpGx8`%4|1taDPk`t>%&$D7b}5Zn^H>F8?L&7 zeR2^C-bk<3#DSLCH|qDO1fXXjJ}(FKVLf<2SO8QaG0PVfD&d~|;oeBor(HrnIhULf zkPMy$EW{pF-3wp06<714vtGZOY5kb|Oh49m3fh%9h*piS(;%K=qNwY7;Rh3X7ufaXm114nBm|8782v%f0*PZ151*Fc zf44DkHuLKW#_{|An;PfQ=Q{|$u>e3YUiUxI|JP4A4?lm#{sUjceg0og+~)*1m~gWqxK)D0mWZ2{5J>)&J?R z=fUT_{0}(w;xF+3@cHv{&O6c{IsUlF{4Z{0phJXfi2wkkxLW}izk7dnz5xFLZ-HY* literal 0 HcmV?d00001 From 5e7496c2e83af01be9faa56cdf5f2a857c420f2a Mon Sep 17 00:00:00 2001 From: asakho8 Date: Fri, 7 Nov 2025 16:39:44 -0500 Subject: [PATCH 2/2] ENH: add dtype_from_format option to preserve Excel text formatting --- doc/source/user_guide/io.rst | 33 ++++ pandas/io/excel/_base.py | 233 +++++++++++++++++++++++++- pandas/io/excel/_calamine.py | 10 +- pandas/io/excel/_odfreader.py | 10 +- pandas/io/excel/_openpyxl.py | 26 ++- pandas/io/excel/_pyxlsb.py | 6 +- pandas/io/excel/_xlrd.py | 76 +++++++-- pandas/tests/io/excel/test_readers.py | 60 +++++++ tmp.xlsx | Bin 0 -> 4889 bytes 9 files changed, 428 insertions(+), 26 deletions(-) create mode 100644 tmp.xlsx diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7092a0f8fa8d8..2ea81b0cb3080 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3327,6 +3327,39 @@ Reading Excel files In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. +Text-formatted cells +++++++++++++++++++++ + +Excel workbooks often contain values that are stored as numbers but formatted as +``Text`` to preserve literal strings such as postal codes or account numbers +with leading zeros. By default, :func:`~pandas.read_excel` still converts those +cells to numeric types, which can alter the original representation. Pass +``dtype_from_format=True`` to maintain the Excel text formatting when parsing +each sheet. When enabled, pandas forces any columns or index levels that are +formatted as text in the source workbook to use string dtypes in the resulting +``Series``/``Index``. + +This behavior currently applies to the ``openpyxl`` and ``xlrd`` engines. Other +engines simply ignore the flag until text format detection is implemented for +them. + +.. ipython:: python + + df = pd.DataFrame({"zip_code": ["00601", "02108", "10118"]}) + with pd.ExcelWriter("zips.xlsx", engine="openpyxl") as writer: + df.to_excel(writer, index=False) + for cell in writer.sheets["Sheet1"]["A"]: + cell.number_format = "@" # Excel's Text format + + parsed = pd.read_excel("zips.xlsx", dtype_from_format=True) + parsed.dtypes + +.. ipython:: python + :suppress: + + import os + os.remove("zips.xlsx") + When using the ``engine_kwargs`` parameter, pandas will pass these arguments to the engine. For this, it is important to know which function pandas is using internally. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d1ae59e0e5866..69b61c1347030 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,6 @@ from __future__ import annotations +from bisect import bisect_left from collections.abc import ( Callable, Hashable, @@ -29,7 +30,10 @@ from pandas._config import config -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.parsers import STR_NA_VALUES from pandas.compat._optional import ( get_version, @@ -164,6 +168,9 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. +dtype_from_format : bool, default False + Preserve cells formatted as ``Text`` in Excel as strings instead of + coercing them to numeric dtypes. engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. Engine compatibility : @@ -377,6 +384,7 @@ def read_excel( | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., + dtype_from_format: bool = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., @@ -414,6 +422,7 @@ def read_excel( | Callable[[HashableT], bool] | None = ..., dtype: DtypeArg | None = ..., + dtype_from_format: bool = ..., engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., @@ -452,6 +461,7 @@ def read_excel( | Callable[[HashableT], bool] | None = None, dtype: DtypeArg | None = None, + dtype_from_format: bool = False, engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, @@ -499,6 +509,7 @@ def read_excel( index_col=index_col, usecols=usecols, dtype=dtype, + dtype_from_format=dtype_from_format, converters=converters, true_values=true_values, false_values=false_values, @@ -591,7 +602,13 @@ def get_sheet_by_name(self, name: str): def get_sheet_by_index(self, index: int): raise NotImplementedError - def get_sheet_data(self, sheet, rows: int | None = None): + def get_sheet_data( + self, + sheet, + rows: int | None = None, + *, + dtype_from_format: bool = False, + ): raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -706,6 +723,7 @@ def parse( index_col: int | Sequence[int] | None = None, usecols=None, dtype: DtypeArg | None = None, + dtype_from_format: bool = False, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, skiprows: Sequence[int] | int | Callable[[int], object] | None = None, @@ -756,7 +774,11 @@ def parse( sheet = self.get_sheet_by_index(asheetname) file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) - data = self.get_sheet_data(sheet, file_rows_needed) + data, text_formatted_cols = self.get_sheet_data( + sheet, + file_rows_needed, + dtype_from_format=dtype_from_format, + ) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() @@ -768,6 +790,9 @@ def parse( output = self._parse_sheet( data=data, + text_formatted_cols=( + text_formatted_cols if dtype_from_format else None + ), output=output, asheetname=asheetname, header=header, @@ -787,6 +812,7 @@ def parse( comment=comment, skipfooter=skipfooter, dtype_backend=dtype_backend, + dtype_from_format=dtype_from_format, **kwds, ) @@ -819,11 +845,16 @@ def _parse_sheet( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, + text_formatted_cols: set[int] | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + dtype_from_format: bool = False, **kwds, ): is_list_header = False is_len_one_list_header = False + if dtype_from_format and text_formatted_cols: + self._coerce_text_data(data, text_formatted_cols) + if is_list_like(header): assert isinstance(header, Sequence) is_list_header = True @@ -935,6 +966,14 @@ def _parse_sheet( **kwds, ) + text_positions: set[int] = set() + if dtype_from_format and text_formatted_cols: + text_positions = self._resolve_text_positions( + parser, text_formatted_cols + ) + if text_positions: + self._inject_text_converters(parser, text_positions) + output[asheetname] = parser.read(nrows=nrows) if header_names: @@ -942,6 +981,9 @@ def _parse_sheet( header_names ) + if dtype_from_format and text_positions: + self._finalize_text_columns(output[asheetname], parser, text_positions) + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() @@ -952,6 +994,186 @@ def _parse_sheet( return output + @staticmethod + def _text_format_converter(value): + if libmissing.checknull(value): + return value + return str(value) + + @staticmethod + def _parser_engine(parser): + return getattr(parser, "_engine", parser) + + @classmethod + def _parser_attr(cls, parser, attribute: str): + if hasattr(parser, attribute): + return getattr(parser, attribute) + engine = cls._parser_engine(parser) + if engine is not parser and hasattr(engine, attribute): + return getattr(engine, attribute) + return None + + def _resolve_text_positions( + self, parser, text_formatted_cols: set[int] + ) -> set[int]: + if not text_formatted_cols: + return set() + + orig_names = self._parser_attr(parser, "orig_names") or [] + col_indices = self._parser_attr(parser, "_col_indices") + if col_indices is None: + max_pos = len(orig_names) + return {idx for idx in text_formatted_cols if idx < max_pos} + + positions: set[int] = set() + for idx in text_formatted_cols: + pos = bisect_left(col_indices, idx) + if pos < len(col_indices) and col_indices[pos] == idx: + positions.add(pos) + return positions + + def _inject_text_converters(self, parser, text_positions: set[int]) -> None: + if not text_positions: + return + + target = self._parser_engine(parser) + existing_converters = getattr(target, "converters", None) + if existing_converters is None: + target.converters = {} + existing_clean: dict = {} + else: + target.converters = dict(existing_converters) + existing_clean = target._clean_mapping(existing_converters) + + orig_names = self._parser_attr(parser, "orig_names") or [] + + for pos in text_positions: + if pos >= len(orig_names): + continue + label = orig_names[pos] + if existing_clean and label in existing_clean: + continue + target.converters[pos] = self._text_format_converter + + def _finalize_text_columns( + self, + frame: DataFrame, + parser, + text_positions: set[int], + ) -> None: + if not text_positions or frame.empty: + return + + orig_names = self._parser_attr(parser, "orig_names") or [] + total_positions = len(orig_names) + if total_positions == 0: + return + + index_positions = self._resolve_index_positions(parser, total_positions) + + data_position_map: dict[int, int] = {} + df_col_index = 0 + for pos in range(total_positions): + if pos in index_positions: + continue + if df_col_index >= frame.shape[1]: + break + data_position_map[pos] = df_col_index + df_col_index += 1 + + for pos in text_positions: + if pos in index_positions: + continue + df_pos = data_position_map.get(pos) + if df_pos is None: + continue + frame.iloc[:, df_pos] = frame.iloc[:, df_pos].map( + self._text_format_converter + ) + + index_levels = self._index_levels_for_positions( + index_positions, text_positions, total_positions + ) + if index_levels: + frame.index = self._convert_index_labels(frame.index, index_levels) + + def _coerce_text_data(self, data: list, text_formatted_cols: set[int]) -> None: + if not text_formatted_cols or not data: + return + + for row in data: + if not row: + continue + for col_idx in text_formatted_cols: + if col_idx >= len(row): + continue + row[col_idx] = self._text_format_converter(row[col_idx]) + + def _resolve_index_positions(self, parser, total_positions: int) -> set[int]: + index_col = self._parser_attr(parser, "index_col") + if index_col is None or index_col is False: + return set() + + if is_list_like(index_col) and not isinstance(index_col, (str, bytes)): + entries = list(index_col) + else: + entries = [index_col] + + orig_names = self._parser_attr(parser, "orig_names") + positions: set[int] = set() + for entry in entries: + if isinstance(entry, int): + if 0 <= entry < total_positions: + positions.add(entry) + elif orig_names is not None: + try: + pos = orig_names.index(entry) + except ValueError: + continue + positions.add(pos) + return positions + + def _index_levels_for_positions( + self, + index_positions: set[int], + text_positions: set[int], + total_positions: int, + ) -> list[int]: + if not index_positions: + return [] + + position_to_level: dict[int, int] = {} + level = 0 + for pos in range(total_positions): + if pos in index_positions: + position_to_level[pos] = level + level += 1 + + ordered_levels: list[int] = [] + for pos in sorted(text_positions): + level_idx = position_to_level.get(pos) + if level_idx is not None and level_idx not in ordered_levels: + ordered_levels.append(level_idx) + return ordered_levels + + def _convert_index_labels(self, index, levels_to_convert: list[int]): + if not levels_to_convert: + return index + + converter = self._text_format_converter + if getattr(index, "nlevels", 1) == 1: + return index.map(converter) + + levels_set = set(levels_to_convert) + tuples = [] + for value in index.tolist(): + mutable = list(value) + for level in levels_set: + if level < len(mutable): + mutable[level] = converter(mutable[level]) + tuples.append(tuple(mutable)) + return type(index).from_tuples(tuples, names=index.names) + @set_module("pandas") @doc(storage_options=_shared_docs["storage_options"]) @@ -1617,6 +1839,7 @@ def parse( index_col: int | Sequence[int] | None = None, usecols=None, converters=None, + dtype_from_format: bool = False, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, skiprows: Sequence[int] | int | Callable[[int], object] | None = None, @@ -1679,6 +1902,9 @@ def parse( either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. + dtype_from_format : bool, default False + Preserve cells formatted as ``Text`` in Excel as strings instead of + coercing them to numeric dtypes. true_values : list, default None Values to consider as True. false_values : list, default None @@ -1766,6 +1992,7 @@ def parse( index_col=index_col, usecols=usecols, converters=converters, + dtype_from_format=dtype_from_format, true_values=true_values, false_values=false_values, skiprows=skiprows, diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 0bdd2b42aad51..a68e93e0e9c20 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -98,8 +98,12 @@ def get_sheet_by_index(self, index: int) -> CalamineSheet: return self.book.get_sheet_by_index(index) def get_sheet_data( - self, sheet: CalamineSheet, file_rows_needed: int | None = None - ) -> list[list[Scalar | NaTType | time]]: + self, + sheet: CalamineSheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar | NaTType | time]], set[int]]: def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: if isinstance(value, float): val = int(value) @@ -121,4 +125,4 @@ def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: ) data = [[_convert_cell(cell) for cell in row] for row in rows] - return data + return data, set() diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index f79417d11080d..7df9b30aadfd0 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -100,8 +100,12 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar | NaTType]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar | NaTType]], set[int]]: """ Parse an ODF Table into a list of lists """ @@ -161,7 +165,7 @@ def get_sheet_data( if len(row) < max_row_len: row.extend([self.empty_value] * (max_row_len - len(row))) - return table + return table, set() def _get_row_repeat(self, row) -> int: """ diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 867d11583dcc0..7d26bd0ef4442 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -589,6 +589,15 @@ def get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] + @staticmethod + def _cell_is_text_formatted(cell) -> bool: + number_format = getattr(cell, "number_format", None) + if number_format == "@": + return True + + format_id = getattr(cell, "number_format_id", None) + return format_id == 49 + def _convert_cell(self, cell) -> Scalar: from openpyxl.cell.cell import ( TYPE_ERROR, @@ -608,15 +617,24 @@ def _convert_cell(self, cell) -> Scalar: return cell.value def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: if self.book.read_only: sheet.reset_dimensions() data: list[list[Scalar]] = [] + text_formatted_cols: set[int] = set() last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): - converted_row = [self._convert_cell(cell) for cell in row] + converted_row: list[Scalar] = [] + for col_idx, cell in enumerate(row): + if dtype_from_format and self._cell_is_text_formatted(cell): + text_formatted_cols.add(col_idx) + converted_row.append(self._convert_cell(cell)) while converted_row and converted_row[-1] == "": # trim trailing empty elements converted_row.pop() @@ -639,4 +657,4 @@ def get_sheet_data( for data_row in data ] - return data + return data, (text_formatted_cols if dtype_from_format else set()) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index a6e42616c2043..a61225e8bb9a1 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -98,7 +98,9 @@ def get_sheet_data( self, sheet, file_rows_needed: int | None = None, - ) -> list[list[Scalar]]: + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: data: list[list[Scalar]] = [] previous_row_number = -1 # When sparse=True the rows can have different lengths and empty rows are @@ -124,4 +126,4 @@ def get_sheet_data( data_row + (max_width - len(data_row)) * empty_cell for data_row in data ] - return data + return data, set() diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 46b0064247096..5d1dde880dad3 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -76,9 +76,48 @@ def get_sheet_by_index(self, index): self.raise_if_bad_sheet_by_index(index) return self.book.sheet_by_index(index) + @staticmethod + def _maybe_mark_text_column( + text_columns: set[int], + sheet, + row_idx: int, + col_idx: int, + dtype_from_format: bool, + format_map, + xf_list, + ) -> bool: + if not dtype_from_format or not xf_list: + return False + + try: + xf_index = sheet.cell_xf_index(row_idx, col_idx) + except AttributeError: + return False + + if xf_index is None or xf_index >= len(xf_list): + return False + + xf = xf_list[xf_index] + format_key = getattr(xf, "format_key", None) + if format_key is None: + return False + + cell_format = format_map.get(format_key) + if cell_format is None: + return False + + if getattr(cell_format, "format_str", "") == "@": + text_columns.add(col_idx) + return True + return False + def get_sheet_data( - self, sheet, file_rows_needed: int | None = None - ) -> list[list[Scalar]]: + self, + sheet, + file_rows_needed: int | None = None, + *, + dtype_from_format: bool = False, + ) -> tuple[list[list[Scalar]], set[int]]: from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -88,6 +127,9 @@ def get_sheet_data( ) epoch1904 = self.book.datemode + text_formatted_cols: set[int] = set() + format_map = getattr(self.book, "format_map", {}) if dtype_from_format else {} + xf_list = getattr(self.book, "xf_list", []) if dtype_from_format else [] def _parse_cell(cell_contents, cell_typ): """ @@ -131,12 +173,24 @@ def _parse_cell(cell_contents, cell_typ): nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - return [ - [ - _parse_cell(value, typ) - for value, typ in zip( - sheet.row_values(i), sheet.row_types(i), strict=True - ) - ] - for i in range(nrows) - ] + + rows: list[list[Scalar]] = [] + for i in range(nrows): + parsed_row: list[Scalar] = [] + row_values = sheet.row_values(i) + row_types = sheet.row_types(i) + for j, (value, typ) in enumerate(zip(row_values, row_types)): + if dtype_from_format: + self._maybe_mark_text_column( + text_formatted_cols, + sheet, + i, + j, + dtype_from_format, + format_map, + xf_list, + ) + parsed_row.append(_parse_cell(value, typ)) + rows.append(parsed_row) + + return rows, (text_formatted_cols if dtype_from_format else set()) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fca63b1709dce..6599e5428744a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -157,6 +157,66 @@ def xfail_datetimes_with_pyxlsb(engine, request): ) +@td.skip_if_no("openpyxl") +def test_dtype_from_format_openpyxl(tmp_path): + from openpyxl import Workbook + + path = tmp_path / "text_format.xlsx" + wb = Workbook() + ws = wb.active + ws.title = "Sheet1" + ws.append(["id", "value"]) + ws.append([123, 1]) + ws.append([456, 2]) + + for cell in ws["A"]: + cell.number_format = "@" + + wb.save(path) + + df_default = pd.read_excel(path, engine="openpyxl") + assert df_default["id"].tolist() == [123, 456] + + df_text = pd.read_excel(path, engine="openpyxl", dtype_from_format=True) + assert df_text["id"].tolist() == ["123", "456"] + + df_index = pd.read_excel( + path, engine="openpyxl", dtype_from_format=True, index_col="id" + ) + assert df_index.index.tolist() == ["123", "456"] + + +@td.skip_if_no("xlrd") +@td.skip_if_no("xlwt") +def test_dtype_from_format_xlrd(tmp_path): + import xlwt + + path = tmp_path / "text_format.xls" + wb = xlwt.Workbook() + ws = wb.add_sheet("Sheet1") + text_style = xlwt.easyxf(num_format_str="@") + + ws.write(0, 0, "id") + ws.write(0, 1, "value") + ws.write(1, 0, 123, text_style) + ws.write(1, 1, 1) + ws.write(2, 0, 456, text_style) + ws.write(2, 1, 2) + + wb.save(str(path)) + + df_default = pd.read_excel(path, engine="xlrd") + assert df_default["id"].tolist() == [123, 456] + + df_text = pd.read_excel(path, engine="xlrd", dtype_from_format=True) + assert df_text["id"].tolist() == ["123", "456"] + + df_index = pd.read_excel( + path, engine="xlrd", dtype_from_format=True, index_col="id" + ) + assert df_index.index.tolist() == ["123", "456"] + + class TestReaders: @pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) def test_read_excel_type_check(self, col, tmp_excel, read_ext): diff --git a/tmp.xlsx b/tmp.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..23da70baf534328cead4d978ecf2eabb7a8db6ae GIT binary patch literal 4889 zcmZ`-1yodB*B)9*7>1CcBn3el>6V5eL>L$alvZNsAqNlvDe3MQKw3#bq!ExALO^K= z0R<@?Lcnj-^?$zdzu&#k761TT07M1Z!_@orG*fU-!?;0> z8#b=i2JWtK523qoxFEvWNe`n(-X%;yy!gt~PZmnl3)4h7!}lX>Y?@qMy`7sP6@vTUoa_xt3RlG$MTBAaZs^*GEW}hFn0;!# zt5L{{_e`PiW{V)HW#;$z?7}@=l1Oc289}H+CJe4Q$ZI5wSJ6yuHU+-srJ}~3v#Tz2 zc$jg_XeVy<2`| zSvbPqFCb&8suEM&*0;wG(<+T*^3FJUF+IP{3cUiZtCp(CY|U|4G1u%|xH82SWO|n& zW^M-Q6ad^bKlEalJ4a5&?QBkJVgMlf3IG7Yk%Mp&@^r9uw*5Ve{G?~g9Of}CMH{qT zGlH*CxMC7h|N3fbaeviY3xn?@q@sH&sh%HWU(;)@drM0<8hDsszwl$?YsIB-PMo#Dn>j% zJMvsPt$t9vLGPfVzWmvbhOS{%Y5Xz(3iqCTGe(eV!FTMH41UkjI;VVl2l_?&G4QOf zmvGW?j4@SQ-90Cs=psO}XH?f9phmedYOyPE>~{PlhOfZ@d?E97$Ty8p_TJf=({E|5 z1I1!~Ie{|h2gyjs3cS*-)+|iYiO2d%UTM;lo(dY?CP2FTofCF9;);tb$X$lnUL^p7 z#U_++i(fohp zkB=UsPTxsN)JL(m+GdN=pmca?mlnjIN(l5$B2qebtFq--Ks3Y2r6{`bH6uv3lT%S*1jhg4OHI;RG-A zWWq&YqJ<>xU37Fj@D*#gly>G;p&OL&G}oeAt8+bc$nLZ`D11W9Ng(b(<1K^s3KdVO z;)bX=Nl~U5>B}&a`*}|z#1+a|MahVETUZrV{Sai5seNq*)uH})KSrk|yDAPtJEW0< zftZr*map;!V@;Brf%cu_(Xe@S7g*2tYLV$FaY0^dQ}^N5{l+y(eD9?5IyO%o6moaB zj^A^I)nuAo|YAP(=pwu5x)MG>Gmmo2I92NPL*5&>P=S7Q@aARi#Ar*@<|@n7~=04 z-Leva^%3yL7Y69YsI9}-(r#NWk0vh9pllmlYWRE@ojHu8yByO&_7#KkwsoDiSTMI> zd$sNNL zzur-;4#LhYAG;*UYU*bD_5$%d)$ln}n2G5bL{|*HB*qhi@P?BIZtUQP+Mt+&(~rqt z?oqVx(V?ppq>*X@55plAW85^fQMD9A>GpHUxpw8pjKy2|dIov*{i2LT*5?r(S$@r3YL!2r zldMOg3)1iznRx<>k01fn;-JpJsHk9I#l7|yHTp-~*|PQiyCIjeP9|rHv2%+Q0dFhJ zOKkXoiaI0#Y>C9yF|e=VmQQH!RiWyt`ldxFe1oC;6a{ut3lVCWV6lnlO6gcKSLF$% zRmGXNlnz)*apnXf3z3Y*>qRRNGP&8aZOYoNp12QN(Xa6S&aU~H_Xl0^0Du)-cD?jF zf%v$(Kk#s{we|E6`nCMZB3?!_E)Rt%PE&Z<@m)7_b9{>}< z!EJVlPzTjq<_M;$tJ1QbE2NchqO7Yjnmwsh{-jXg;YYf8!>oX+R@?fdo-_aM7p zJ@qzkS=ESyhymAEw{~+41*GO7^~lU)y%R2ysweZl!yxcsxROeEZb)nCLrTq!$G4d* z?IAMS$6e)tcaf}iKF^t(AH}BTar9p;ph{jb*fCjx90ZE%EusGg(1cv!E3OZcM2$0WbU{G*PlI9AzeX zi?nn-@(xg)^X}J(9hbT`_81-qgz9S^*}VhgGK#i=n}*C3^yUjga{4$#|KcaR5m|>D zRm9KT7pPtyTkGr!N7%tXxy zQ=ig(pejpjmT-H#Z=jk#lKN;aqWx-J_61KjTVJi?ZxfnvIh*(CUwkdU-dQ9qV=o9^ zcJlK{IN9xK;k4)Wvb4Rj_5zcPjR2We{;-m%mLCIGr0?ta7YEu)Kk&QEr)3(grc4Kt znQ{4=?>+c!#fo_AsZ+w}7D~boEAV`M2bQt4k}+_#B7@eSzsmv@)^_87jG2TpmK$B# z6|yORx_v=tnd}*}SMo?-Xj0#W52OBu?5oKgSEJ*$P}XR-3ZEdLROb;HN71S0MgXzK zkl@fnsS{6F1l2Ri#t+zP^!l_5mut=ysGD5pct#xldbZ)b1EDN}iVYBWm>0y5kF20?aJ4brPUOx(T>mn(STTC{^nq}kB6L2= z1DZF(cb0S9zh^l#UYDO$J@?INk0$-))$e$m*X&83v4wf6JgG=>09RW!poe4|54PXr zv@a~|q*9K_>sh4?V}0f$l~cy6Nq8U3O3W-+-AL^fKU7E^p;X*Qv8G|0sd#q7pU7A! zAs%mbrKk%-dd4nTL74>zVnAll>>uU166X3nwX>62HnHxLyjPwXX|lKoKr(M0^WSka z`{-+E#ZvXzhFD^QSELe>XQ^Wo0ZXMXvs#j+Oy`J$+s#7C?2ed=c+0gMhxpER%+96 zeeRU&7u{Kznlza@{`nE1wlWzRwuAKpGx8`%4|1taDPk`t>%&$D7b}5Zn^H>F8?L&7 zeR2^C-bk<3#DSLCH|qDO1fXXjJ}(FKVLf<2SO8QaG0PVfD&d~|;oeBor(HrnIhULf zkPMy$EW{pF-3wp06<714vtGZOY5kb|Oh49m3fh%9h*piS(;%K=qNwY7;Rh3X7ufaXm114nBm|8782v%f0*PZ151*Fc zf44DkHuLKW#_{|An;PfQ=Q{|$u>e3YUiUxI|JP4A4?lm#{sUjceg0og+~)*1m~gWqxK)D0mWZ2{5J>)&J?R z=fUT_{0}(w;xF+3@cHv{&O6c{IsUlF{4Z{0phJXfi2wkkxLW}izk7dnz5xFLZ-HY* literal 0 HcmV?d00001