-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_ingest.py
93 lines (70 loc) · 2.68 KB
/
data_ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Data ingest
import io
import requests
import docx
if 'data_loader' not in globals():
from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
from mage_ai.data_preparation.decorators import test
@data_loader
def load_data(*args, **kwargs):
    """Download the course FAQ Google Docs and parse them into Q&A records.

    Every configured document is exported as .docx and walked paragraph by
    paragraph: a 'Heading 1' paragraph starts a new section, a 'Heading 2'
    paragraph starts a new question, and any other non-empty paragraph is
    collected as part of the answer to the current question.

    Args:
        *args: Accepted but not used by this block.
        **kwargs: Accepted but not used by this block.

    Returns:
        A one-element list wrapping the list of per-course entries. Each
        entry is ``{'course': <course name>, 'documents': [...]}``, and each
        document is a dict with the keys ``'text'``, ``'section'`` and
        ``'question'``.

    Raises:
        requests.HTTPError: If the export request returns an error status.
        requests.Timeout: If the export request exceeds the timeout.
    """
    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    # Seconds to wait for Google Docs before giving up; without a timeout
    # requests would block forever on a stalled connection.
    request_timeout = 30

    def clean_line(line):
        """Strip surrounding whitespace and byte-order marks from a line."""
        # The final strip() removes whitespace that was hidden behind a
        # leading/trailing BOM (U+FEFF is not whitespace for str.strip()).
        return line.strip().strip('\uFEFF').strip()

    def parse_paragraphs(paragraphs):
        """Turn a sequence of docx paragraphs into question records."""
        questions = []
        section_title = ''
        question_title = ''
        answer_lines = []

        def flush_question():
            """Store the pending question (if complete) and reset the buffer."""
            answer_text = '\n'.join(answer_lines).strip()
            if answer_text and section_title and question_title:
                questions.append({
                    'text': answer_text,
                    'section': section_title,
                    'question': question_title,
                })
            # Always drop buffered text so it cannot leak into the answer of
            # the next question.
            answer_lines.clear()

        for p in paragraphs:
            style = p.style.name.lower()
            p_text = clean_line(p.text)

            if not p_text:
                continue

            if style == section_heading_style:
                # Flush BEFORE switching sections, otherwise the last
                # question of a section is recorded under the next section.
                flush_question()
                section_title = p_text
                question_title = ''
                continue

            if style == question_heading_style:
                flush_question()
                question_title = p_text
                continue

            answer_lines.append(p_text)

        # The final question has no following heading to trigger a flush.
        flush_question()
        return questions

    def read_faq(file_id):
        """Download one Google Doc as .docx and parse its FAQ entries."""
        url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()

        with io.BytesIO(response.content) as f_in:
            doc = docx.Document(f_in)

        return parse_paragraphs(doc.paragraphs)

    faq_documents = {
        'llm-zoomcamp': '1T3MdwUvqCL3jrh3d3VCXQ8xE0UqRzI3bfgpfBq3ZWG0',
    }

    documents = [
        {'course': course, 'documents': read_faq(file_id)}
        for course, file_id in faq_documents.items()
    ]

    print('Length:', len(documents))

    return [documents]
@test
def test_output(output, *args) -> None:
    """
    Check that the block produced an output at all.

    Fails with an AssertionError when `output` is None; extra positional
    arguments are accepted and ignored.
    """
    output_is_defined = output is not None
    assert output_is_defined, 'The output is undefined'