-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_texts_from_path.py
107 lines (98 loc) · 4.07 KB
/
get_texts_from_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import json
import numpy as np
import re
def read_json(json_doc):
    """Load and return the parsed contents of the JSON file at ``json_doc``.

    Args:
        json_doc: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
    # mark; with plain UTF-8 the BOM reaches json.load, which rejects it.
    with open(json_doc, "r", encoding="utf-8-sig") as src:
        return json.load(src)
def _merge_paragraphs(extracted_doc):
    """Merge a document's paragraphs into one string and locate source switches.

    Consecutive paragraphs with the same entry in
    ``extracted_doc['paragraph_source_docs']`` form a segment. Paragraphs are
    joined with newlines, and every segment except the last is followed by a
    newline as well.

    Args:
        extracted_doc: Mapping with the keys ``'paragraph_source_docs'`` and
            ``'paragraphs'``.

    Returns:
        A ``(text, changes, breaches)`` tuple: the merged text, whether any
        two neighbouring paragraphs have different sources, and the
        cumulative lengths of the merged text after each non-final segment
        (i.e. the offset at which the next segment starts).
    """
    sources = np.array(extracted_doc['paragraph_source_docs'])
    # Index of the first paragraph of every segment after the first one.
    starts = np.where(sources[:-1] != sources[1:])[0] + 1
    if starts.size == 0:
        return '\n'.join(extracted_doc['paragraphs']), False, []

    paragraphs = extracted_doc['paragraphs']
    bounds = [0, *starts.tolist(), len(sources)]
    *inner, (last_lo, last_hi) = zip(bounds, bounds[1:])

    # NOTE(review): the reviewed copy of this file had lost its indentation,
    # so it is not certain whether the offset bookkeeping originally also ran
    # for the final segment. Here only switches between segments are
    # recorded; the end of the text is not reported as a breach. Confirm
    # against the consumers of "style_breaches".
    pieces = []
    breaches = []
    offset = 0
    for lo, hi in inner:
        # The extra '' element yields the newline that separates this
        # segment from the next one.
        piece = '\n'.join([*paragraphs[lo:hi], ''])
        pieces.append(piece)
        offset += len(piece)
        breaches.append(offset)
    pieces.append('\n'.join(paragraphs[last_lo:last_hi]))
    return ''.join(pieces), True, breaches


def get_docs_batch(path, start, end):
    """Load a slice of the JSON documents stored in directory ``path``.

    Args:
        path: Directory whose entries are all read as JSON documents.
        start: Start index into the sorted directory listing (slice
            semantics; ``None`` means "from the beginning").
        end: End index into the sorted directory listing (slice semantics;
            ``None`` means "to the end").

    Returns:
        A list of ``(file_name, text, info)`` tuples in sorted file-name
        order, where ``info`` is
        ``{"style_change": bool, "style_breaches": list}``. Documents whose
        merged text is empty are omitted.
    """
    docs = []
    for json_doc in sorted(os.listdir(path))[start:end]:
        extracted_doc = read_json(os.path.join(path, json_doc))
        text, changes, breaches = _merge_paragraphs(extracted_doc)
        if text:
            docs.append((json_doc, text, {"style_change": changes, "style_breaches": breaches}))
    return docs


def get_docs(path):
    """Load every JSON document stored in directory ``path``.

    Equivalent to ``get_docs_batch`` over the complete sorted listing; see
    that function for the shape of the returned tuples.
    """
    return get_docs_batch(path, None, None)


def get_vecs(path, vecpath):
    """Load every document in ``path`` together with its vector from ``vecpath``.

    For each entry of ``path`` a trailing ``json`` is removed from the name
    to obtain a stem (``"doc.json"`` gives ``"doc."``). The vector is read
    from the ``"vector"`` key of ``<stem>truth`` in ``vecpath`` and the
    document from ``<stem>json`` in ``path``. Entries whose names do not end
    in ``json`` therefore resolve to paths other than the listed entry.

    Args:
        path: Directory holding the JSON documents.
        vecpath: Directory holding the matching ``truth`` files.

    Returns:
        A list of ``(stem, text, vector, info)`` tuples in sorted file-name
        order, where ``info`` is
        ``{"style_change": bool, "style_breaches": list}``. Documents whose
        merged text is empty are omitted.
    """
    doc_vecs = []
    for file in sorted(os.listdir(path)):
        fn = re.sub('json$', '', file)
        extracted_vec = read_json(os.path.join(vecpath, fn + 'truth'))["vector"]
        extracted_doc = read_json(os.path.join(path, fn + 'json'))
        text, changes, breaches = _merge_paragraphs(extracted_doc)
        if text:
            doc_vecs.append((fn, text, extracted_vec, {"style_change": changes, "style_breaches": breaches}))
    return doc_vecs