44
44
45
45
46
46
def _run (
47
- java_options : List [str ],
48
47
options : TabulaOption ,
48
+ java_options : Optional [List [str ]] = None ,
49
49
path : Optional [str ] = None ,
50
50
encoding : str = "utf-8" ,
51
51
force_subprocess : bool = False ,
@@ -62,6 +62,8 @@ def _run(
62
62
"-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.NoOpLog" ,
63
63
}
64
64
65
+ java_options = _build_java_options (java_options , encoding )
66
+
65
67
global _tabula_vm
66
68
if force_subprocess :
67
69
_tabula_vm = SubprocessTabula (
@@ -381,20 +383,6 @@ def read_pdf(
381
383
multiple_tables = multiple_tables ,
382
384
)
383
385
384
- if java_options is None :
385
- java_options = []
386
- elif isinstance (java_options , str ):
387
- java_options = shlex .split (java_options )
388
-
389
- # to prevent tabula-py from stealing focus on every call on mac
390
- if platform .system () == "Darwin" :
391
- if not any ("java.awt.headless" in opt for opt in java_options ):
392
- java_options += ["-Djava.awt.headless=true" ]
393
-
394
- if encoding == "utf-8" :
395
- if not any ("file.encoding" in opt for opt in java_options ):
396
- java_options += ["-Dfile.encoding=UTF8" ]
397
-
398
386
path , temporary = localize_file (input_path , user_agent , use_raw_url = use_raw_url )
399
387
400
388
if not os .path .exists (path ):
@@ -405,8 +393,8 @@ def read_pdf(
405
393
406
394
try :
407
395
output = _run (
408
- java_options ,
409
396
tabula_options ,
397
+ java_options ,
410
398
path ,
411
399
encoding = encoding ,
412
400
force_subprocess = force_subprocess ,
@@ -827,7 +815,6 @@ def convert_into(
827
815
output_path = output_path ,
828
816
options = options ,
829
817
)
830
- java_options = _build_java_options (java_options )
831
818
832
819
path , temporary = localize_file (input_path )
833
820
@@ -838,7 +825,7 @@ def convert_into(
838
825
raise ValueError (f"{ path } is empty. Check the file, or download it manually." )
839
826
840
827
try :
841
- _run (java_options , tabula_options , path , force_subprocess = force_subprocess )
828
+ _run (tabula_options , java_options , path , force_subprocess = force_subprocess )
842
829
finally :
843
830
if temporary :
844
831
os .unlink (path )
@@ -948,8 +935,6 @@ def convert_into_by_batch(
948
935
949
936
format = _extract_format_for_conversion (output_format )
950
937
951
- java_options = _build_java_options (java_options )
952
-
953
938
tabula_options = TabulaOption (
954
939
pages = pages ,
955
940
guess = guess ,
@@ -967,10 +952,12 @@ def convert_into_by_batch(
967
952
options = options ,
968
953
)
969
954
970
- _run (java_options , tabula_options , force_subprocess = force_subprocess )
955
+ _run (tabula_options , java_options , force_subprocess = force_subprocess )
971
956
972
957
973
- def _build_java_options (_java_options : Optional [List [str ]] = None ) -> List [str ]:
958
+ def _build_java_options (
959
+ _java_options : Optional [List [str ]] = None , encoding : str = "utf-8"
960
+ ) -> List [str ]:
974
961
if _java_options is None :
975
962
_java_options = []
976
963
elif isinstance (_java_options , str ):
@@ -982,6 +969,10 @@ def _build_java_options(_java_options: Optional[List[str]] = None) -> List[str]:
982
969
if not any (filter (r .find , _java_options )): # type: ignore
983
970
_java_options = _java_options + ["-Djava.awt.headless=true" ]
984
971
972
+ if encoding == "utf-8" :
973
+ if not any ("file.encoding" in opt for opt in _java_options ):
974
+ _java_options += ["-Dfile.encoding=UTF8" ]
975
+
985
976
return _java_options
986
977
987
978
0 commit comments