Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated tika to use sha1 hash instead of md5 for checksum #399

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,18 @@ These are read once, when tika/tika.py is initially loaded and used throughout a

1. `TIKA_VERSION` - set to the version string, e.g., 1.12 or default to current Tika version.
2. `TIKA_SERVER_JAR` - set to the full URL to the remote Tika server jar to download and cache.
3. `TIKA_SERVER_ENDPOINT` - set to the host (local or remote) for the running Tika server jar.
4. `TIKA_CLIENT_ONLY` - if set to True, then `TIKA_SERVER_JAR` is ignored, and relies on the value for `TIKA_SERVER_ENDPOINT` and treats Tika like a REST client.
5. `TIKA_TRANSLATOR` - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
6. `TIKA_SERVER_CLASSPATH` - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
7. `TIKA_LOG_PATH` - set to a directory with write permissions and the `tika.log` and `tika-server.log` files will be placed in this directory.
8. `TIKA_PATH` - set to a directory with write permissions and the `tika_server.jar` file will be placed in this directory.
9. `TIKA_JAVA` - set the Java runtime name, e.g., `java` or `java9`
10. `TIKA_STARTUP_SLEEP` - number of seconds (`float`) to wait per check if Tika server is launched at runtime
11. `TIKA_STARTUP_MAX_RETRY` - number of checks (`int`) to attempt for Tika server startup if launched at runtime
12. `TIKA_JAVA_ARGS` - set java runtime arguments, e.g, `-Xmx4g`
13. `TIKA_LOG_FILE` - set the filename for the log file. default: `tika.log`. if it is an empty string (`''`), no log file is created.
3. `TIKA_JAR_HASH_ALGO` - set to the hash algorithm used to verify the checksum of the downloaded Tika server jar; use `sha1` when running on FIPS-compliant systems. Defaults to `md5`.
4. `TIKA_SERVER_ENDPOINT` - set to the host (local or remote) for the running Tika server jar.
5. `TIKA_CLIENT_ONLY` - if set to True, `TIKA_SERVER_JAR` is ignored; the library relies on the value of `TIKA_SERVER_ENDPOINT` and acts only as a REST client of that server.
6. `TIKA_TRANSLATOR` - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
7. `TIKA_SERVER_CLASSPATH` - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
8. `TIKA_LOG_PATH` - set to a directory with write permissions and the `tika.log` and `tika-server.log` files will be placed in this directory.
9. `TIKA_PATH` - set to a directory with write permissions and the `tika_server.jar` file will be placed in this directory.
10. `TIKA_JAVA` - set the Java runtime name, e.g., `java` or `java9`
11. `TIKA_STARTUP_SLEEP` - number of seconds (`float`) to wait per check if Tika server is launched at runtime
12. `TIKA_STARTUP_MAX_RETRY` - number of checks (`int`) to attempt for Tika server startup if launched at runtime
13. `TIKA_JAVA_ARGS` - set Java runtime arguments, e.g., `-Xmx4g`
14. `TIKA_LOG_FILE` - set the filename for the log file. Default: `tika.log`. If it is an empty string (`''`), no log file is created.

Testing it out
==============
Expand Down
63 changes: 33 additions & 30 deletions tika/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@

Arguments:
urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika

Switches:
--verbose, -v = verbose mode
--encode, -e = encode response in UTF-8
Expand All @@ -106,7 +106,7 @@

import sys, os, getopt, time, codecs, re
try:
unicode_string = unicode
unicode_string = unicode
binary_string = str
except NameError:
unicode_string = str
Expand All @@ -133,7 +133,7 @@ def make_content_disposition_header(fn):
open = codecs.open

import requests
import socket
import socket
import tempfile
import hashlib
import platform
Expand Down Expand Up @@ -173,6 +173,7 @@ def make_content_disposition_header(fn):
TikaServerJar = os.getenv(
'TIKA_SERVER_JAR',
"http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/"+TikaVersion+"/tika-server-standard-"+TikaVersion+".jar")
TikaJarHashAlgo=os.getenv('TIKA_JAR_HASH_ALGO', 'md5')
ServerHost = "localhost"
Port = "9998"
ServerEndpoint = os.getenv(
Expand Down Expand Up @@ -229,7 +230,7 @@ def runCommand(cmd, option, urlOrPaths, port, outDir=None,
elif cmd == "language":
return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
elif cmd == "translate":
return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
elif cmd == "config":
status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
return resp
Expand Down Expand Up @@ -290,7 +291,7 @@ def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint,
return metaPaths


def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='application/json',
services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
'''
Expand Down Expand Up @@ -359,7 +360,7 @@ def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos
return [detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
for path in paths]

def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='text/plain',
services={'file' : '/language/stream'}, requestOptions={}):
'''
Expand All @@ -382,7 +383,7 @@ def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos
{'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
return (status, response)

def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='text/plain',
services={'all': '/translate/all'}):
'''
Expand All @@ -399,9 +400,9 @@ def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbo
paths = getPaths(urlOrPaths)
return [doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
for path in paths]

def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='text/plain',
responseMimeType='text/plain',
services={'all': '/translate/all'}, requestOptions={}):
'''

Expand All @@ -417,7 +418,7 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo
path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
srcLang = ""
destLang = ""

if ":" in option:
options = option.rsplit(':')
srcLang = options[0]
Expand All @@ -427,17 +428,17 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo
raise TikaException('Translate options are specified as srcLang:destLang or as destLang')
else:
destLang = option

if srcLang != "" and destLang != "":
service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
else:
service = services["all"] + "/" + Translator + "/" + destLang
service = services["all"] + "/" + Translator + "/" + destLang
status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
{'Accept' : responseMimeType},
verbose, tikaServerJar, requestOptions=requestOptions)
return (status, response)
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,

def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='text/plain',
services={'type': '/detect/stream'}):
'''
Expand All @@ -455,7 +456,7 @@ def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos
return [detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
for path in paths]

def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
responseMimeType='text/plain',
services={'type': '/detect/stream'}, config_path=None, requestOptions={}):
'''
Expand Down Expand Up @@ -519,14 +520,14 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti
:param classpath:
:return:
'''
parsedUrl = urlparse(serverEndpoint)
parsedUrl = urlparse(serverEndpoint)
serverHost = parsedUrl.hostname
scheme = parsedUrl.scheme

port = parsedUrl.port
if classpath is None:
classpath = TikaServerClasspath

global TikaClientOnly
if not TikaClientOnly:
serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
Expand All @@ -539,7 +540,7 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti

if Windows and hasattr(data, "read"):
data = data.read()

encodedData = data
if type(data) is unicode_string:
encodedData = data.encode('utf-8')
Expand Down Expand Up @@ -609,13 +610,15 @@ def checkJarSig(tikaServerJar, jarPath):
:param jarPath:
:return: ``True`` if the signature of the jar matches
'''
if not os.path.isfile(jarPath + ".md5"):
getRemoteJar(tikaServerJar + ".md5", jarPath + ".md5")
m = hashlib.md5()
localChecksumPath = '.'.join([jarPath, TikaJarHashAlgo])
if not os.path.isfile(localChecksumPath):
remoteChecksum = '.'.join([tikaServerJar, TikaJarHashAlgo])
getRemoteJar(remoteChecksum, localChecksumPath)
m = hashlib.new(TikaJarHashAlgo)
with open(jarPath, 'rb') as f:
binContents = f.read()
m.update(binContents)
with open(jarPath + ".md5", "r") as em:
with open(localChecksumPath, "r") as em:
existingContents = em.read()
return existingContents == m.hexdigest()

Expand Down Expand Up @@ -674,7 +677,7 @@ def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, s
# Patch for Windows support
if Windows:
if sys.version.startswith("2"):
# Python 2.x
# Python 2.x
TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
elif sys.version.startswith("3"):
# Python 3.x
Expand Down Expand Up @@ -710,7 +713,7 @@ def killServer():
try:
os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM)
except:
log.error("Failed to kill the current server session")
log.error("Failed to kill the current server session")
time.sleep(1)
# patch to support subprocess killing for windows
if Windows:
Expand All @@ -729,7 +732,7 @@ def killServer():
try:
os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM)
except:
log.error("Failed to kill the current server session")
log.error("Failed to kill the current server session")
time.sleep(1)
else:
log.error("Server not running, or was already running before")
Expand Down Expand Up @@ -777,7 +780,7 @@ def getRemoteFile(urlOrPath, destPath):
try:
urlretrieve(urlOrPath, destPath)
except IOError:
# monkey patch fix for SSL/Windows per Tika-Python #54
# monkey patch fix for SSL/Windows per Tika-Python #54
# https://github.com/chrismattmann/tika-python/issues/54
import ssl
if hasattr(ssl, '_create_unverified_context'):
Expand All @@ -803,18 +806,18 @@ def getRemoteJar(urlOrPath, destPath):
try:
urlretrieve(urlOrPath, destPath)
except IOError:
# monkey patch fix for SSL/Windows per Tika-Python #54
# monkey patch fix for SSL/Windows per Tika-Python #54
# https://github.com/chrismattmann/tika-python/issues/54
import ssl
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context
# delete whatever we had there
if os.path.exists(destPath) and os.path.isfile(destPath):
os.remove(destPath)
urlretrieve(urlOrPath, destPath)
urlretrieve(urlOrPath, destPath)

return (destPath, 'remote')

def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
'''
Checks if the specified port is open
Expand Down