-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_json_files.py
154 lines (127 loc) · 5.21 KB
/
parse_json_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import json
import logging
import os
import random
from pathlib import Path
from typing import List
import genanki
import requests
from requests import HTTPError
DATA_DIR = "./data" # change to json dir, move to settings.py
AUDIO_DIR = "./audio"
logging.basicConfig(filename='info.log', level=logging.INFO)
def load_files() -> List[dict]:
tmp = []
for json_file in os.listdir(DATA_DIR):
with open(f"{DATA_DIR}/{json_file}", "r") as f:
tmp.append(json.load(f))
return tmp
def get_hint_text_and_audio(skill: dict) -> List[dict]:
title = skill.get('title', '')
logging.info(title)
hints = []
# resources to prefetch
resourcesToPrefetch = skill.get('resourcesToPrefetch', [])
for resource in resourcesToPrefetch:
required = resource.get('required')
type = resource.get('type')
url = resource.get('url')
if type != 'mp3':
continue
logging.info((required, type, url))
# elements
elements: List = skill.get('elements', [])
for element in elements:
# we only want elements that are of the type 'text'
element_type: str = element.get('type', '')
if element_type != 'text':
continue
# meta
autoPlayableTTS: List = element.get('meta', {}).get('autoPlayableTTS', [])
sectionIndex: int = element.get('meta', {}).get('sectionIndex')
# element.styledString
text: str = element.get('element', {}).get('styledString', {}).get('text', '')
styling = element.get('element', {}).get('styledString', {}).get('styling', '')
# element.hints
hintLinks = element.get('element', {}).get('hints', {}).get('hintLinks', [])
hints_hints = element.get('element', {}).get('hints', {}).get('hints', [])
for link in hintLinks:
index = link.get('index')
rangeEnd = link.get('rangeEnd')
rangeStart = link.get('rangeStart')
logging.info((index, rangeEnd, rangeStart, text[rangeStart:rangeEnd + 1]))
logging.info(text[rangeStart:rangeEnd + 1], hints_hints[index])
# element.tokenTTS
tokenTTSCollection = element.get('element', {}).get('tokenTTS', {}).get('tokenTTSCollection', [])
for tokenTTS in tokenTTSCollection:
endIndex = tokenTTS.get('endIndex')
startIndex = tokenTTS.get('startIndex')
ttsURL = tokenTTS.get('ttsURL')
logging.info((endIndex, startIndex, ttsURL, text[startIndex:endIndex + 1]))
# find hint link with same start and end as this token
for link in hintLinks:
if link.get("rangeEnd") is endIndex and link.get("rangeStart") is startIndex:
index = link.get('index')
hint = hints_hints[index]
hints.append({
# sometimes startIndex and endIndex are the same, meaning it's one character at that index
'text': text[startIndex:endIndex + 1],
'hint': hint,
'ttsURL': ttsURL
})
return hints
def create_deck():
files = load_files()
if not os.path.exists(AUDIO_DIR):
os.mkdir(AUDIO_DIR)
model_id = random.randrange(1 << 30, 1 << 31)
model = genanki.Model(
model_id,
'Simple Model with Media',
fields=[
{'name': 'Question'},
{'name': 'Answer'},
{'name': 'AnswerSound'}
],
templates=[
{
'name': 'Card 1',
'qfmt': '{{Question}}<br>',
'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}<br>{{AnswerSound}}',
},
])
deck = genanki.Deck(random.randrange(1 << 30, 1 << 31), "Duolingo Mandrin Chinese")
media_files = []
for skill in files:
hints: List = get_hint_text_and_audio(skill)
for hint in hints:
try:
ttsURL = hint.get('ttsURL')
hint_text = hint.get('text')
hint_hint = hint.get('hint')
# download audio
response = requests.get(ttsURL)
response.raise_for_status()
# write audio to file
file_name = f"{hint_text}.mp3"
with open(file_name, "wb") as f:
f.write(response.content)
# add audio file to media files list
media_files.append(file_name)
deck.add_note(genanki.Note(model=model,
fields=[f"{hint_text}",
f"{hint_hint}",
f"[sound:{file_name}]"
]))
logging.info(f"Created note for {hint_text} / {hint_hint}.")
except HTTPError as e:
logging.error(f"Got {str(e)} from {hint}")
package = genanki.Package(deck)
package.media_files = media_files
package.write_to_file('output.apkg')
logging.info("Finished creating deck.")
for p in Path(".").glob("*.mp3"):
p.unlink()
logging.info("Removed .mp3 files")
if __name__ == "__main__":
create_deck()