-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxtract_tika_main.py
46 lines (35 loc) · 1.16 KB
/
xtract_tika_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json
import tika
import os
import argparse
tika.TikaClientOnly = True
# tika.TIKA_SERVER_JAR="file://tika-tester/tika-server-1.24-bin/tika-server.jar"
from tika import parser
# tika.tika.TikaServerJar = "file://tika-tester/tika-server-1.24-bin/tika-server.jar"
# tika.tika.TikaServerLogFilePath = ""
# print(tika.tika.TikaLogFile)
# Parses a single file
def parse_file(path):
    """Parse one file with Tika and return the metadata of the result.

    The path and its Python type are printed to stdout before parsing.

    Args:
        path: Location of the file, passed unchanged to ``parser.from_file``.

    Returns:
        The value stored under the ``"metadata"`` key of the parse result.
    """
    print(f"Path: {path}")
    print(type(path))
    return parser.from_file(path)["metadata"]
# Goes through all the files in rootdir and saves the metadata to JSON
def save_output(rootdir, output_dir='output'):
    """Parse every file under ``rootdir`` and write its metadata as JSON.

    Walks ``rootdir`` recursively with ``os.walk``, calls ``parse_file`` on
    each file found, and dumps the result to ``<output_dir>/<filename>.json``.

    Args:
        rootdir: Directory whose files (including those in subdirectories)
            are parsed.
        output_dir: Directory that receives the JSON files. Defaults to
            ``'output'`` relative to the current working directory, which is
            where this function wrote before the parameter existed. The
            directory is created if it does not exist yet.

    Note:
        Output names are derived from the bare file name only, so two files
        with the same name in different subdirectories write to the same
        JSON file and the one visited later wins.
    """
    # open(..., 'w') does not create missing parent directories, so make
    # sure the target exists before the first dump.
    os.makedirs(output_dir, exist_ok=True)

    for subdir, _dirs, files in os.walk(rootdir):
        for filename in files:
            filepath = os.path.join(subdir, filename)
            output = parse_file(filepath)
            with open(os.path.join(output_dir, f'{filename}.json'), 'w') as f:
                json.dump(output, f)
# parse_file('coviddata2021-02-13.csv')
def execute_extractor(path):
    """Return the result of ``parse_file`` for ``path``.

    Args:
        path: Location of the file to parse; forwarded to ``parse_file``.

    Returns:
        Whatever ``parse_file(path)`` returns.
    """
    return parse_file(path)
if __name__ == "__main__":
    # Command-line entry point: parse the file named by --path and print
    # the metadata returned by parse_file.
    ap = argparse.ArgumentParser(
        description="Print the metadata Tika extracts from a single file."
    )
    # --path is required: without it args.path would be None and that None
    # would be handed straight to parse_file instead of producing a usage
    # error.
    ap.add_argument('--path', required=True, help="Path of the file to parse.")
    args = ap.parse_args()
    path = args.path
    x = parse_file(path)
    print(f"Metadata: {x}")