Skip to content

Commit 635e51a

Browse files
authored
Merge pull request #371 from chezou/fix-encoding
Set encoding on SubprocessTabula initialization
2 parents (76db276 + 782793d), commit 635e51a

File tree

1 file changed

+13
-22
lines changed

1 file changed

+13
-22
lines changed

tabula/io.py

Lines changed: 13 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,8 @@
4444

4545

4646
def _run(
47-
java_options: List[str],
4847
options: TabulaOption,
48+
java_options: Optional[List[str]] = None,
4949
path: Optional[str] = None,
5050
encoding: str = "utf-8",
5151
force_subprocess: bool = False,
@@ -62,6 +62,8 @@ def _run(
6262
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog",
6363
}
6464

65+
java_options = _build_java_options(java_options, encoding)
66+
6567
global _tabula_vm
6668
if force_subprocess:
6769
_tabula_vm = SubprocessTabula(
@@ -381,20 +383,6 @@ def read_pdf(
381383
multiple_tables=multiple_tables,
382384
)
383385

384-
if java_options is None:
385-
java_options = []
386-
elif isinstance(java_options, str):
387-
java_options = shlex.split(java_options)
388-
389-
# to prevent tabula-py from stealing focus on every call on mac
390-
if platform.system() == "Darwin":
391-
if not any("java.awt.headless" in opt for opt in java_options):
392-
java_options += ["-Djava.awt.headless=true"]
393-
394-
if encoding == "utf-8":
395-
if not any("file.encoding" in opt for opt in java_options):
396-
java_options += ["-Dfile.encoding=UTF8"]
397-
398386
path, temporary = localize_file(input_path, user_agent, use_raw_url=use_raw_url)
399387

400388
if not os.path.exists(path):
@@ -405,8 +393,8 @@ def read_pdf(
405393

406394
try:
407395
output = _run(
408-
java_options,
409396
tabula_options,
397+
java_options,
410398
path,
411399
encoding=encoding,
412400
force_subprocess=force_subprocess,
@@ -827,7 +815,6 @@ def convert_into(
827815
output_path=output_path,
828816
options=options,
829817
)
830-
java_options = _build_java_options(java_options)
831818

832819
path, temporary = localize_file(input_path)
833820

@@ -838,7 +825,7 @@ def convert_into(
838825
raise ValueError(f"{path} is empty. Check the file, or download it manually.")
839826

840827
try:
841-
_run(java_options, tabula_options, path, force_subprocess=force_subprocess)
828+
_run(tabula_options, java_options, path, force_subprocess=force_subprocess)
842829
finally:
843830
if temporary:
844831
os.unlink(path)
@@ -948,8 +935,6 @@ def convert_into_by_batch(
948935

949936
format = _extract_format_for_conversion(output_format)
950937

951-
java_options = _build_java_options(java_options)
952-
953938
tabula_options = TabulaOption(
954939
pages=pages,
955940
guess=guess,
@@ -967,10 +952,12 @@ def convert_into_by_batch(
967952
options=options,
968953
)
969954

970-
_run(java_options, tabula_options, force_subprocess=force_subprocess)
955+
_run(tabula_options, java_options, force_subprocess=force_subprocess)
971956

972957

973-
def _build_java_options(_java_options: Optional[List[str]] = None) -> List[str]:
958+
def _build_java_options(
959+
_java_options: Optional[List[str]] = None, encoding: str = "utf-8"
960+
) -> List[str]:
974961
if _java_options is None:
975962
_java_options = []
976963
elif isinstance(_java_options, str):
@@ -982,6 +969,10 @@ def _build_java_options(_java_options: Optional[List[str]] = None) -> List[str]:
982969
if not any(filter(r.find, _java_options)): # type: ignore
983970
_java_options = _java_options + ["-Djava.awt.headless=true"]
984971

972+
if encoding == "utf-8":
973+
if not any("file.encoding" in opt for opt in _java_options):
974+
_java_options += ["-Dfile.encoding=UTF8"]
975+
985976
return _java_options
986977

987978

0 commit comments

Comments (0)