-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
90 lines (78 loc) · 3.96 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
##Copyright (c) 2022 duncan g. smith
##
##Permission is hereby granted, free of charge, to any person obtaining a
##copy of this software and associated documentation files (the "Software"),
##to deal in the Software without restriction, including without limitation
##the rights to use, copy, modify, merge, publish, distribute, sublicense,
##and/or sell copies of the Software, and to permit persons to whom the
##Software is furnished to do so, subject to the following conditions:
##
##The above copyright notice and this permission notice shall be included
##in all copies or substantial portions of the Software.
##
##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
##OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
##FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
##THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
##OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
##ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
##OTHER DEALINGS IN THE SOFTWARE.
import os
import textract
EXTENSION_SYNONYMS = {} # mapping of alternative extensions to the ones used here (lower case)
def process(filename, input_encoding=None, output_encoding='utf8',
extension=None, **kwargs):
"""Passes arguments directly to textract.process initially. If the file
extension (e.g. pdf) is not provided it is extracted from
filename.
Returns a bytestring encoded with 'output_encoding'.
Supported filetypes and dependencies are listed at
https://github.com/deanmalmgren/textract/blob/05fdc7a08dc3fc52eb519aefac4fcbec8981dd8e/docs/index.rst
If textract does not support the extension, then an appropriate parser is sought in this module.
If no suitable parser exists (in textract or this module) a NotImplementedError is raised.
"""
try:
txt = textract.process(filename, input_encoding=input_encoding,
output_encoding=output_encoding,
extension=extension, **kwargs)
except textract.exceptions.ExtensionNotSupported:
# see if it is supported in this module
# get extension
ext = extension
if ext is None:
_, ext = os.path.splitext(filename)
else:
if not ext.startswith('.'):
ext = '.' + ext
ext = ext.lower()
if ext in EXTENSION_SYNONYMS:
# get the extension that we use here
ext = EXTENSION_SYNONYMS[ext]
# look for relevant class or raise NotImplementedError if not found
try:
parser = globals()[ext[1:].capitalize() + 'Parser']()
except KeyError:
raise NotImplementedError('No available parser for extension {}'.format(extension))
else:
txt = parser.process(filename, input_encoding=input_encoding,
output_encoding=output_encoding, **kwargs)
return txt
# For convenience (and potentially contributing to textract) parsers can be derived
# from the following textract classes.
# Derived classes need only define an 'extract' method taking a filename as argument
# (and which is also passed **kwargs) that returns a byte-encoded or unicode string.
# ShellParser is the better option for parsers that need to run external programs
# See https://github.com/deanmalmgren/textract/tree/master/textract/parsers for details.
from textract.parsers import utils
BaseParser = utils.BaseParser
ShellParser = utils.ShellParser
# Place parser classes here
# For extension e.g. '.ext' the corresponding parser should be named 'ExtParser'
class TestParser(BaseParser):
def extract(self, filename, **kwargs):
# Read file and do whatever is required to
# generate the text.
# To test just call process with the path to a text
# file and supply "extension='.test'".
with open(filename, 'r') as f:
return f.read()