-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtika-config.xml
87 lines (73 loc) · 5.18 KB
/
tika-config.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<parsers>
<parser class="org.apache.tika.parser.DefaultParser">
  <!-- Not strictly required: excluding the two parsers that are configured explicitly
       elsewhere in this file keeps DefaultParser from loading them unnecessarily. -->
  <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
  <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
</parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
  <!-- Explicit settings for the Tesseract OCR parser, which is excluded from DefaultParser in this file. -->
  <params>
    <param name="enableImagePreprocessing" type="bool">true</param>
    <param name="language" type="string">eng</param>
    <!-- 2147483647 is 2^31 - 1; together with the minimum of 0 this presumably places
         no practical size restriction on OCR input. TODO confirm the unit is bytes. -->
    <param name="maxFileSizeToOcr" type="long">2147483647</param>
    <param name="minFileSizeToOcr" type="long">0</param>
    <param name="timeoutSeconds" type="int">120</param>
  </params>
</parser>
<parser class="org.apache.tika.parser.pdf.PDFParser">
  <params>
    <!-- Only the settings you want to change from the parser defaults need to be listed here.
         NOTE(review): an earlier version of this comment described the values below as "the defaults",
         but several of them (e.g. extractActions, extractFontNames, extractMarkedContent,
         extractInlineImages) look like deliberate overrides. Confirm against the PDFParserConfig
         defaults of the Tika version in use before relying on that statement. -->
    <param name="allowExtractionForAccessibility" type="bool">true</param>
    <param name="extractActions" type="bool">true</param>
    <param name="catchIntermediateIOExceptions" type="bool">true</param>
    <param name="extractAnnotationText" type="bool">true</param>
    <param name="extractFontNames" type="bool">true</param>
    <param name="extractMarkedContent" type="bool">true</param>
    <param name="extractInlineImages" type="bool">true</param>
  </params>
</parser>
</parsers>
<server>
  <params>
    <!-- Port the server listens on. A range such as 9995-9998 makes TikaServerCli start four forked
         servers, one at each port; a comma-delimited value such as 9995,9997 also starts
         multiple forked servers. -->
    <port>9998</port>
    <host>0.0.0.0</host>
    <!-- Optional id used in the /status endpoint and elsewhere. When an id is specified and more than
         one forked process is invoked, each process gets the id followed by its port, e.g. my_id-9998.
         A forked server that has to restart keeps its original id. If no id is specified,
         a UUID is generated. -->
    <id>knowledge-canvas-tika-server</id>
    <!-- CORS policy: 'ALL' allows all CORS requests; NONE or a blank value leaves CORS disabled. -->
    <cors>ALL</cors>
    <!-- Comma-delimited list of digests to calculate (e.g. md5,sha256); an encoding may optionally
         follow each one after a colon (e.g. "sha1:32"). May be empty if no digest is wanted. -->
    <digest>sha256</digest>
    <!-- How much to read into memory during the digest phase before spooling to disk.
         Only relevant when a digest is selected. -->
    <digestMarkLimit>1000000</digestMarkLimit>
    <!-- Log level for request URIs: 'debug' or 'info'. -->
    <logLevel>info</logLevel>
    <!-- Whether the stack trace is included in the data returned to the user when a parse exception happens. -->
    <returnStackTrace>false</returnStackTrace>
    <!-- 'true' runs tika server "in process" in the legacy 1.x mode, which leaves the server
         susceptible to infinite loops and crashes. 'false' makes the server spawn a forked process
         and restart it on catastrophic failures (this was called -spawnChild mode in 1.x).
         noFork=false is the default in 2.x. -->
    <noFork>false</noFork>
    <!-- Maximum time allowed per parse before the forked parser is shut down and restarted.
         Not allowed if noFork=true. -->
    <taskTimeoutMillis>300000</taskTimeoutMillis>
    <!-- Maximum time to wait for a forked process to start up. Not allowed if noFork=true. -->
    <maxForkedStartupMillis>120000</maxForkedStartupMillis>
    <!-- Maximum number of times a specific forked process may be restarted. Not allowed if noFork=true.
         NOTE(review): -1 presumably means "no limit"; confirm for the Tika version in use. -->
    <maxRestarts>-1</maxRestarts>
    <!-- Maximum number of files to parse per forked process before that process is restarted
         to clear potential memory leaks. Not allowed if noFork=true. -->
    <maxFiles>100000</maxFiles>
    <!-- Specific javaPath for the forked process. The path should end with the application 'java',
         e.g. /my/special-java/java. Not allowed if noFork=true. -->
    <javaPath>java</javaPath>
    <!-- Must be true for any handler that uses a fetcher or emitter. These pipes features are
         inherently unsecure because the client has the same read/write access as the tika-server
         process, so implementers must secure Tika server so that only their clients can reach it.
         A byproduct of setting this to true is that the /status endpoint is turned on.
         NOTE(review): this file also sets host to 0.0.0.0 and cors to ALL; confirm that access
         to the server is restricted at the network level. -->
    <enableUnsecureFeatures>true</enableUnsecureFeatures>
  </params>
</server>
</properties>