-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_typesense_index.py
98 lines (87 loc) · 2.79 KB
/
make_typesense_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import glob
import os
from typesense.api_call import ObjectNotFound
from acdh_cfts_pyutils import TYPESENSE_CLIENT as client
from acdh_cfts_pyutils import CFTS_COLLECTION
from acdh_tei_pyutils.tei import TeiReader
from tqdm import tqdm
# Every TEI edition file below ./data/editions becomes one search document.
files = glob.glob("./data/editions/*.xml")

# Drop a previously created "hbtv" collection so the index is rebuilt from
# scratch; a missing collection (ObjectNotFound) is simply ignored.
try:
    stale_collection = client.collections["hbtv"]
    stale_collection.delete()
except ObjectNotFound:
    pass
# Field definitions of the "hbtv" collection. The mandatory string fields
# come first; `authors`, `type` and `year` are optional and facetable.
schema_fields = [
    {"name": "id", "type": "string"},
    {"name": "rec_id", "type": "string"},
    {"name": "title", "type": "string"},
    {"name": "full_text", "type": "string"},
    {"name": "authors", "type": "object[]", "facet": True, "optional": True},
    {"name": "type", "type": "string[]", "facet": True, "optional": True},
    {"name": "year", "type": "int32", "optional": True, "facet": True},
]

# `enable_nested_fields` is switched on because `authors` is declared
# as "object[]".
current_schema = {
    "name": "hbtv",
    "enable_nested_fields": True,
    "fields": schema_fields,
}

client.collections.create(current_schema)
# Build one document per TEI file for two targets:
#   * `records`      -> documents for the project-specific "hbtv" collection
#   * `cfts_records` -> documents for the shared CFTS collection
records = []
cfts_records = []
for xml_path in tqdm(files, total=len(files)):
    doc = TeiReader(xml_path)
    # Raises IndexError when the document has no tei:body element.
    body = doc.any_xpath(".//tei:body")[0]

    file_name = os.path.split(xml_path)[-1]
    # Strip only the trailing extension. The former
    # `file_name.replace(".xml", "")` removed *every* ".xml" occurrence in
    # the name, which would corrupt ids (and resolver URLs) for names such as
    # "a.xml_b.xml".
    doc_id = os.path.splitext(file_name)[0]

    record = {"id": doc_id, "rec_id": file_name}
    cfts_record = {
        "project": "hbtv",
        "id": doc_id,
        "resolver": f"https://bahr-textverzeichnis.acdh.oeaw.ac.at/{doc_id}.html",
        "rec_id": file_name,
    }

    # For every typed biblStruct collect its @subtype (when present)
    # followed by its @type.
    types = []
    for bibl in doc.any_xpath(".//tei:biblStruct[@type]"):
        try:
            types.append(bibl.attrib["subtype"])
        except KeyError:
            pass
        types.append(bibl.attrib["type"])
    record["type"] = types

    # Authors are stored as nested objects: the @ref value plus the
    # element's leading text.
    record["authors"] = [
        {"id": author.attrib["ref"], "name": author.text}
        for author in doc.any_xpath(".//tei:author[@ref]")
    ]

    # Title: text nodes of the level="a" title(s), whitespace-normalised.
    title_parts = doc.any_xpath('.//tei:titleStmt/tei:title[@level="a"]/text()')
    record["title"] = " ".join(" ".join(title_parts).split())
    cfts_record["title"] = record["title"]

    # Year: first four characters of the iso-date title; "1000" is the
    # fallback when no such title exists.
    try:
        date_str = doc.any_xpath(
            '//tei:titleStmt/tei:title[@type="iso-date"]/text()'
        )[0]
    except IndexError:
        date_str = "1000"
    try:
        year = int(date_str[:4])
    except ValueError:
        # Not a parseable year: leave the optional field unset.
        pass
    else:
        record["year"] = year
        cfts_record["year"] = year

    # Full text: all text below tei:body with runs of whitespace collapsed.
    record["full_text"] = " ".join("".join(body.itertext()).split())
    cfts_record["full_text"] = record["full_text"]

    records.append(record)
    cfts_records.append(cfts_record)
# Bulk-import the project documents into the "hbtv" collection and echo
# the response returned by the client.
hbtv_documents = client.collections["hbtv"].documents
hbtv_import_result = hbtv_documents.import_(records)
print(hbtv_import_result)
print("done with indexing hbtv")

# Import the same documents (in their CFTS shape) into the shared
# collection, passing the import option action="upsert".
cfts_import_options = {"action": "upsert"}
cfts_import_result = CFTS_COLLECTION.documents.import_(
    cfts_records, cfts_import_options
)
print(cfts_import_result)
print("done with cfts-index hbtv")