Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add chardet encoding detection #193

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ ipython
ipywidgets
sphinx-gallery
sphinx-plotly-directive
sphinxcontrib-mermaid
sphinxcontrib-mermaid
matplotlib
h5py
pyyaml
importlib-resources
rapidfuzz
lark>=1.1.5
pint
pint
chardet
19 changes: 10 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description = "An ellipsometry analysis tool for reproducible and comprehensible
dynamic = ["version"]
authors = [
{ name = "Marius Müller", email = "marius.mueller@physik.uni-giessen.de" },
{ name = "Florian Dobener", email = "pyelli@schroedingerscat.org" }
{ name = "Florian Dobener", email = "pyelli@schroedingerscat.org" },
]
requires-python = ">=3.8"
license = { file = "LICENSE.txt" }
Expand All @@ -19,7 +19,7 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
"Programming Language :: Python :: 3.12",
]
dependencies = [
"scipy",
Expand All @@ -32,6 +32,7 @@ dependencies = [
"rapidfuzz",
"lark>=1.1.5",
"pint",
"chardet",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -75,16 +76,16 @@ indent-width = 4

[tool.ruff.lint]
select = [
"E", # pycodestyle
"W", # pycodestyle
"PL", # pylint
"E", # pycodestyle
"W", # pycodestyle
"PL", # pylint
"NPY201", # numpy
]
ignore = [
"E501", # Line too long ({width} > {limit} characters)
"E701", # Multiple statements on one line (colon)
"E731", # Do not assign a lambda expression, use a def
"E402", # Module level import not at top of file
"E501", # Line too long ({width} > {limit} characters)
"E701", # Multiple statements on one line (colon)
"E731", # Do not assign a lambda expression, use a def
"E402", # Module level import not at top of file
"PLR0911", # Too many return statements
"PLR0912", # Too many branches
"PLR0913", # Too many arguments in function definition
Expand Down
7 changes: 6 additions & 1 deletion requirements/dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ cfgv==3.4.0 \
--hash=sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 \
--hash=sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560
# via pre-commit
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via
# -r requirements/fitting-requirements.txt
# pyelli (pyproject.toml)
comm==0.2.2 \
--hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \
--hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3
Expand Down Expand Up @@ -1278,7 +1284,6 @@ typing-extensions==4.12.2 \
# -r requirements/fitting-requirements.txt
# flexcache
# flexparser
# ipython
# pint
tzdata==2024.1 \
--hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \
Expand Down
7 changes: 6 additions & 1 deletion requirements/fitting-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ asttokens==2.4.1 \
--hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \
--hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0
# via stack-data
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via
# -r requirements/requirements.txt
# pyelli (pyproject.toml)
comm==0.2.2 \
--hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \
--hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3
Expand Down Expand Up @@ -764,7 +770,6 @@ typing-extensions==4.12.2 \
# -r requirements/requirements.txt
# flexcache
# flexparser
# ipython
# pint
tzdata==2024.1 \
--hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \
Expand Down
4 changes: 4 additions & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ appdirs==1.4.4 \
--hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
--hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128
# via pint
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via pyelli (pyproject.toml)
flexcache==0.3 \
--hash=sha256:18743bd5a0621bfe2cf8d519e4c3bfdf57a269c15d1ced3fb4b64e0ff4600656 \
--hash=sha256:d43c9fea82336af6e0115e308d9d33a185390b8346a017564611f1466dcd2e32
Expand Down
14 changes: 14 additions & 0 deletions src/elli/importer/encoding_detection.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would maybe call this file utils.py for future extensions or directly put it in the __init__.py. Seems a bit overkill to have this file for one function and it's unlikely that new functions will be added to this file.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, will do. I tried to avoid repetition and was not entirely sure whether actual code should be put in the __init__.py.

Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import chardet


def detect_encoding(fname: str) -> str:
    r"""Detects the encoding of file fname.

    The whole file is read in binary mode and handed to ``chardet.detect``.
    If no encoding can be determined (e.g. for an empty file), ``"utf-8"``
    is returned, so the result is always a valid ``encoding`` argument
    for ``open`` or ``pandas.read_csv``.

    Args:
        fname (str): Filename

    Returns:
        str: Encoding identifier string.
    """
    with open(fname, "rb") as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    # chardet reports None as encoding when it cannot make a guess;
    # fall back to UTF-8 instead of leaking None to the callers.
    return result["encoding"] or "utf-8"
12 changes: 9 additions & 3 deletions src/elli/importer/spectraray.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from packaging.version import Version, parse

from ..utils import calc_rho
from .encoding_detection import detect_encoding


def read_spectraray_psi_delta(
Expand All @@ -25,10 +26,13 @@ def read_spectraray_psi_delta(
pd.DataFrame: DataFrame containing the psi/delta data in
the format to be further processed inside pyElli.
"""
# detect encoding
encoding = detect_encoding(fname)

# read data and drop empty column
psi_delta_df = pd.read_csv(
fname,
encoding=encoding,
index_col=0,
header=None,
sep=sep,
Expand Down Expand Up @@ -82,9 +86,11 @@ def read_spectraray_mmatrix(
pd.DataFrame: DataFrame containing the psi/delta data in
the format to be further processed inside pyElli.
"""
mueller_matrix = pd.read_csv(fname, sep=sep, decimal=decimal, index_col=0).iloc[
:, -17:-1
]
encoding = detect_encoding(fname)

mueller_matrix = pd.read_csv(
fname, encoding=encoding, sep=sep, decimal=decimal, index_col=0
).iloc[:, -17:-1]
mueller_matrix.index.name = "Wavelength"
mueller_matrix.columns = [
"M11",
Expand Down
5 changes: 4 additions & 1 deletion src/elli/importer/woollam.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ..units import ureg
from ..utils import calc_rho
from .encoding_detection import detect_encoding

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -167,7 +168,9 @@ def read_woollam_psi_delta(fname: str) -> pd.DataFrame:
the format to be further processed inside pyElli.
"""

with open(fname, encoding="utf-8") as fobj:
encoding = detect_encoding(fname)

with open(fname, encoding=encoding) as fobj:
line_number = fobj.tell()
metadata = []
file_format = ""
Expand Down