}-zm}MwDq(^TlJ<1
zU#1uOZFB|nSLjQj-v|A6x)S;;MZX{VtLSRzuh#2e{-FLeU2{~vAhRAy`lR&QTbJco
za(MNyZ5ON!ZEgH@WqEww-QM{4cq{*-QXgM>&&a1rb9~D^BX#~&@kRHH?D7Ak><9fr
z@NI|BItf6f60s#pQc{<6cuq-nBuA!lN;(=~F`#@Ym?zTMlChXWEXO7zGF4I%_0z!1
za>-}=Mid$Z8K3PR3D6J?)5tlQM$aiF#Zg9ruyzRA@C2-ZyF8JSCL_R32IVOja?svb
z4_ge1Faq!qxfH4=?YtqhuAGj#;qCi+`a5^`boaVJOSi0?X|UD6<|=5I8)W)eQMYV2
zSY-KpZeSHt#3a>h&Gi>p&ag89%hv3oi^j_U&jZAcfV)&PtWQREnjk&ql93Q3YVBtRnmD#AMX
z;mGG?1^;riT(cFI?Er8?oATP|0M)h(^N*v~Nf*BuTUN36?lkKM>M;LxjFNud9~
z1O=$mJllYJ7CuV`2obgoHfp?eChjBAK4j7n`uXKV$J|3meg#2@i-kJGG{ESse{g$Mvzte7^e0$6)@%
zWByz9x=sIo6|5dBa-S{vb
z8zy9K)%8LcMaA6+yuATtj$5F-14C}CYxnm4-TQhvdv|yDrDfs6nF^ZA+7KQzc3d^I
zQQcE_X6sRq^z-`q3*-QQwSErZyY=(t3+sfJPr&^BG8iYLJkqdtgHT0gL&$U-;LfhD
z&m0)JPs)vD^QNV%G^gdwp-O(S;T7WWwW-~!#AaUy=}cTJ934^zb9wy%s=8ruvdB`y
z-%Q;m`}y;;Y7U^5nczUtwjn&h{^wB_988C1^kNAd{RToD;S53)0Z$GtRp{V}@(*V%
zZxjbKeM)Z3(^@To5u`LeLH642Y&iR%eHCO`N*l`b2hE&mJ8xpHs8>
zT=wW@{?FOJf^hiuoOSCx?{KSzbVFyFt*e81%^J>X@EZ5vN;4kQf!!W}ZpaA`PL?;W
z3GBzQZ}Q>BJzeL3Oov^?K3>pWzg;YVUkk%$Y@WWUFIXS&tUrPaW%y@}^HLX)6xL7s
zHQNniowwTOK0#JqzP8sDG+VNAJx%=er6fjf^}r&bp!dIca7)pk-YMaB~uOqAh8L9?o_l
zJj|i5c@F){d95w@cHM-fXHA1*)Do2lCfgHAGyK65M-Dd08|SC^!g)>IY$5df(unC0;8EKEUUDv
ztyA^xitH+~XAqu2_%8BAhKlffL`u#i3L3Ljb5OMkx}F_o-$pL+cJ6(mZrB<&$5kxy
zR5(rx^^@cXRAdnnA@h}Fl7yfr3y?r<2&ftb%CzbK7^xvi{{DhJ<&z+lmUrw(S27Jq
zHh_>tz;et>Vq-{QMq-#hyb~2MRfJf4=IsjHL)+~u%
zUD!x;EaLd@7Mg7$IQp0<<*p;EP~s?3LR>7=ad+aU{Me$OdnJRIFmOUpGkh37Kk(d5
zU3~a4n1Amv|8?<-$6$6n=1nbY9)tN09`j2ro$JqoR*+7F3;Q5Vc-~cy`p?AwX(QZ_
zX{eBLn0=d9r#EzK$amjG-N3Pjo;aT6+pxW*01+awB)iTprGHD#bGl^T0(4vWh2TB(
zd(Y4H4-|8G`WM1KSTasBeBaXA<>Co_4`f^)D;f_}7=Drneg97Q#ichY(C1x0%MA_N
zkP>e49m^IpiBjVM#)ij>07^O-6A)NdnyBS
zCb+A+v#YzeFB9y$^Yd{nZ)~5nV;wHqiLea;-z5{z4t4bGn{kI}3*dL$s7Z@?eGA@?
YW%C!>hbvOaWU?%krM~3CWJ{#(e>> get_valid_filename("john's portrait in 2004.jpg")
+ 'johns-portrait-in-2004.jpg'
+ """
+ s = str(name).strip().replace(" ", "-")
+ s = re.sub(r"(?u)[^-\w.]", "", s)
+ if s in {"", ".", ".."}:
+ raise SuspiciousOperation("Could not derive file name from '%s'" % name)
+ return s
+
+
def get_tree(branch, seen, *args, **kwargs):
out = []
for d in branch.find_all("div", class_="cid"):
@@ -50,73 +70,88 @@ def parse(self):
count = 0
sections = self.soup.select(".dsense")
- print(len(sections))
+ for section in sections:
+ section_id = get_tree(section, set())
+ print(section_id)
+ idiom_block = self.soup.select(".idiom-block")
last_true_section_id = None
for section in sections:
section_id = get_tree(section, set())
- more_words = {section_id[0]: {}}
+ more_words = {section_id[0]: {}}
# dphrase_block = section.css(".dphrase-block").extract()
parts_of_speech = section.select(".dsense_pos")
if not parts_of_speech:
in_dsense = False
print('not in_dsense:', section_id)
- word = extract_text(section.select_one(".dphrase-title b"))
- guide_word = ''
- part_of_speech = self.soup.select_one(f"#{section_id[0]} ~ .dpos-h .dpos")
- if not part_of_speech:
- # print("pos None")
- if last_true_section_id.split('-')[0] == section_id[0].split('-')[0]:
- part_of_speech = extract_text(self.soup.select_one(f"#{last_true_section_id} ~ .dsense_h .dsense_pos"))
- # print("last")
- else:
- cid = '-'.join(section_id[0].split('-', 2)[:2])
- part_of_speech = extract_text(self.soup.select_one(f"#{cid} ~ .dpos-h .dpos"))
- # print("last not correct")
- # combinators = ['', '>', '+', '~']
- # for combinator in combinators:
- # part_of_speech = response.css(f"#{cid}{combinator} .dpos-h .dpos").css("::text").extract_first()
- # print(f"#{cid}{combinator} .dpos-h .dpos")
- # if part_of_speech is not None:
- # print("correct")
- # break
- # slice_number = 0
- # while bool(re.findall('[0-9]+', section_id[0].rsplit('-', slice_number)[0])) and part_of_speech is None:
- # combinators = ['', '>', '+', '~']
- # for combinator in combinators:
- # part_of_speech = response.css(f"#{section_id[0].rsplit('-', slice_number)[0]}{combinator} "
- # f".dpos-h .dpos").css("::text").extract_first()
- # # print(f"#{section_id[0][:slice_number]}{combinator} .dpos-h .dpos")
- # if part_of_speech is not None:
- # # print("correct")
- # break
- # if slice_number is None:
- # slice_number = 0
- # slice_number += 1
- if not word:
- word = extract_text(self.soup.select_one(".hw.dhw"))
- domain = extract_text(section.select(".ddomain"), join_char='/')
- word_meaning = extract_text(section.select(".ddef_d"))
- dlu = extract_text(section.select(".dlu"), join_char='/')
- cl = extract_text(section.select(".cl"), join_char=' ')
- if domain:
- word += f" ({domain})"
- if dlu:
- word += f" ({dlu})"
- if cl:
- word += f" ({cl})"
- elif dlu:
- word = f"{dlu}"
- if cl:
- word += f" ({cl})"
- elif cl:
- word = f"{cl}"
- else:
- word += f" ({word_meaning.split(':')[0]})"
+ if idiom_block:
+ # cid = '-'.join(section_ids[0].split('-', 2)[:2])
+ # word = extract_text(self.soup.select(f"#{cid} ~ .idiom-block b"))
+ word = extract_text(self.soup.select_one(f".idiom-block b"))
+ guide_word = '(' + extract_text(section.select(f".dsense_b .ddef_d .query"), join_char=' ') + ')'
+ part_of_speech = 'idiom'
+ else:
+ word = extract_text(section.select_one(".dphrase-title b"))
+ guide_word = ''
+ part_of_speech = self.soup.select_one(f"#{section_id[0]} ~ .dpos-h .dpos")
+ # print("before not pos")
+ if not part_of_speech:
+ # print("pos None")
+ if last_true_section_id is not None:
+ if last_true_section_id.split('-')[0] == section_id[0].split('-')[0]:
+ part_of_speech = extract_text(self.soup.select_one(f"#{last_true_section_id} ~ .dsense_h .dsense_pos"))
+ # print("last")
+ else:
+ cid = '-'.join(section_id[0].split('-', 2)[:2])
+ part_of_speech = extract_text(self.soup.select_one(f"#{cid} ~ .dpos-h .dpos"))
+ # print("last not correct")
+ else:
+ cid = '-'.join(section_id[0].split('-', 2)[:2])
+ part_of_speech = extract_text(self.soup.select_one(f"#{cid} ~ .dpos-h .dpos"))
+ # print("last not correct")
+ # combinators = ['', '>', '+', '~']
+ # for combinator in combinators:
+ # part_of_speech = response.css(f"#{cid}{combinator} .dpos-h .dpos").css("::text").extract_first()
+ # print(f"#{cid}{combinator} .dpos-h .dpos")
+ # if part_of_speech is not None:
+ # print("correct")
+ # break
+ # slice_number = 0
+ # while bool(re.findall('[0-9]+', section_id[0].rsplit('-', slice_number)[0])) and part_of_speech is None:
+ # combinators = ['', '>', '+', '~']
+ # for combinator in combinators:
+ # part_of_speech = response.css(f"#{section_id[0].rsplit('-', slice_number)[0]}{combinator} "
+ # f".dpos-h .dpos").css("::text").extract_first()
+ # # print(f"#{section_id[0][:slice_number]}{combinator} .dpos-h .dpos")
+ # if part_of_speech is not None:
+ # # print("correct")
+ # break
+ # if slice_number is None:
+ # slice_number = 0
+ # slice_number += 1
+ if not word:
+ word = extract_text(self.soup.select_one(".hw.dhw"))
+ domain = extract_text(section.select(".ddomain"), join_char='/')
+ word_meaning = extract_text(section.select(".ddef_d"))
+ dlu = extract_text(section.select(".dlu"), join_char='/')
+ cl = extract_text(section.select(".cl"), join_char=' ')
+ if domain:
+ word += f" ({domain})"
+ if dlu:
+ word += f" ({dlu})"
+ if cl:
+ word += f" ({cl})"
+ elif dlu:
+ word = f"{dlu}"
+ if cl:
+ word += f" ({cl})"
+ elif cl:
+ word = f"{cl}"
+ else:
+ word += f" ({word_meaning.split(':')[0]})"
else:
in_dsense = True
print('in_dsense:', section_id)
-
last_true_section_id = section_id[0]
# if len(section_id) > 1:
@@ -131,38 +166,45 @@ def parse(self):
# ignore this word
# else meaning found then:
# keep this word
-
- extracted_meanings = extract_text(section.select(".dsense_b > .ddef_block .ddef_d"))
- meanings_list = extracted_meanings.split(':')[:-1]
-
- if len(section_id) <= 1:
- if len(meanings_list) > 1:
- for i in range(len(meanings_list)):
- more_words[section_id[0]][i + 1] = meanings_list[i]
+ if idiom_block:
+ # print("IDIOM")
+ # cid = '-'.join(section_ids[0].split('-', 2)[:2])
+ # word = extract_text(self.soup.select(f"#{cid} ~ .idiom-block b"))
+ word = extract_text(self.soup.select_one(f".idiom-block b"))
+ guide_word = ''
+ part_of_speech = 'idiom'
else:
- if meanings_list:
- for i in range(len(meanings_list)):
- more_words[section_id[0]][i + 1] = meanings_list[i]
- for bid in section_id[1:]:
- blue_block_title = extract_text(section.select(f"#{bid} ~ .dphrase_h b"))
- if not blue_block_title:
- blue_block_meaning = extract_text(section.select(f"#{bid} ~ .dphrase_b .ddef_d"))[:-1]
- more_words[section_id[0]][bid] = blue_block_meaning
- else:
- more_words[section_id[0]][bid] = blue_block_title
- # if word has multiple meanings:
- # create another instances of those meanings
- word = extract_text(section.select_one(".dsense_hw"))
- guide_word = '(' + extract_text(section.select_one(".dsense_gw span")) + ')'
- # b = section.css("b").css("::text").extract()
- # if b:
- # if guide_word:
- # guide_word += f" ({' '.join(b)})"
- # else:
- # guide_word = f" ({' '.join(b)})"
- part_of_speech = extract_text(section.select_one(".dsense_pos"))
- # definitions = section.css(".ddef_d").css("::text").extract()
- # sentences = section.css(".deg").css("::text").extract()
+ extracted_meanings = extract_text(section.select(".dsense_b > .ddef_block .ddef_d"))
+ meanings_list = extracted_meanings.split(':')[:-1]
+
+ if len(section_id) <= 1:
+ if len(meanings_list) > 1:
+ for i in range(len(meanings_list)):
+ more_words[section_id[0]][i + 1] = meanings_list[i]
+ else:
+ if meanings_list:
+ for i in range(len(meanings_list)):
+ more_words[section_id[0]][i + 1] = meanings_list[i]
+ for bid in section_id[1:]:
+ blue_block_title = extract_text(section.select(f"#{bid} ~ .dphrase_h b"))
+ if not blue_block_title:
+ blue_block_meaning = extract_text(section.select(f"#{bid} ~ .dphrase_b .ddef_d"))[:-1]
+ more_words[section_id[0]][bid] = blue_block_meaning
+ else:
+ more_words[section_id[0]][bid] = blue_block_title
+ # if word has multiple meanings:
+ # create another instances of those meanings
+ word = extract_text(section.select_one(".dsense_hw"))
+ guide_word = '(' + extract_text(section.select_one(".dsense_gw span")) + ')'
+ # b = section.css("b").css("::text").extract()
+ # if b:
+ # if guide_word:
+ # guide_word += f" ({' '.join(b)})"
+ # else:
+ # guide_word = f" ({' '.join(b)})"
+ part_of_speech = extract_text(section.select_one(".dsense_pos"))
+ # definitions = section.css(".ddef_d").css("::text").extract()
+ # sentences = section.css(".deg").css("::text").extract()
if word:
word = re.sub("\s\s+", " ", word)
if guide_word:
@@ -172,6 +214,7 @@ def parse(self):
count += 1
# print(count)
+ print(meanings)
return meanings
@@ -237,6 +280,7 @@ def parse(self):
# = #cid~ .dsense_b .ddef_d (cbed-1-1, ..., cbed-1-8, )
word = extract_text(self.soup.select_one(f".hw.dhw"))
+
if in_dsense is True:
# word = response.css(f"#{cid}~ .dsense_h .dsense_hw").css("::text").extract_first()
if type(meaning) is tuple:
@@ -253,16 +297,23 @@ def parse(self):
meaning_text = extract_text(self.soup.select(f"#{cid} ~ .dsense_b .ddef_d"))
sentences = self.soup.select(f"#{cid} ~ .dsense_b .dexamp")
else: # in_dsense is False:
- if len(section_ids) > 1:
- cid = section_ids[1]
- meaning_text = extract_text(self.soup.select(f"#{cid} ~ .dphrase_b .ddef_d"))
- sentences = self.soup.select(f"#{cid} ~ .dphrase_b .dexamp")
+ if part_of_speech == 'idiom':
+ word = extract_text(self.soup.select_one(f".idiom-block b"))
+ # cid = '-'.join(section_ids[0].split('-', 2)[:2])
+ # word = extract_text(self.soup.select(f"#{cid} ~ .idiom-block b"))
+ meaning_text = extract_text(self.soup.select(f"#{section_ids[0]} ~ .dsense_b .ddef_d"))
+ sentences = self.soup.select(f"#{section_ids[0]} ~ .dsense_b .dexamp")
else:
- cid = section_ids[0]
- meaning_text = extract_text(self.soup.select(f"#{cid} ~ .dsense_b .ddef_d"))
- sentences = self.soup.select(f"#{cid} ~ .dsense_b .dexamp")
+ if len(section_ids) > 1:
+ cid = section_ids[1]
+ meaning_text = extract_text(self.soup.select(f"#{cid} ~ .dphrase_b .ddef_d"))
+ sentences = self.soup.select(f"#{cid} ~ .dphrase_b .dexamp")
+ else:
+ cid = section_ids[0]
+ meaning_text = extract_text(self.soup.select(f"#{cid} ~ .dsense_b .ddef_d"))
+ sentences = self.soup.select(f"#{cid} ~ .dsense_b .dexamp")
# print("MeaningText:", meaning_text)
- print("Sentences:", type(sentences), len(sentences), type(sentences[0]))
+ # print("Sentences:", type(sentences), len(sentences), type(sentences[0]))
if tld == "co.uk":
accent_tld = "uk"
@@ -314,8 +365,8 @@ def parse(self):
# us_pronunciation = response.css(".us #ampaudio2 source::attr(src)").extract_first() # amp-audio
def download_audio() -> str:
- filename = word + '_' + accent_tld + '.mp3'
-
+ filename = get_valid_filename(word + '_' + accent_tld + '.mp3')
+ # print(filename)
tts = gTTS(word, lang='en', tld=tld)
if not os.path.exists('media'):
os.makedirs('media')
@@ -358,14 +409,16 @@ def download_audio() -> str:
'part_of_speech': part_of_speech,
'meaning': meaning_text.split(':')[0],
'sentences': ''.join(sentences_list),
- 'phonemic_script': '/' + phonemic_script + '/',
+ 'phonemic_script': '' if not phonemic_script else '/' + phonemic_script + '/',
'pronunciation_word': download_audio(),
- 'synonyms': f"Synonyms"
+ 'synonyms': f"Synonyms"
}
# dictionary_item['sentences'] = ''.join(sentences).split('.')[:2] # ''.join(sentences)
# dictionary_item['sentences'] = re.findall('.*?[.!?]', ''.join(sentences))[:2]
# dictionary_item['us_phonemic_script'] = '/' + us_phonemic_script + '/'
# dictionary_item['us_pronunciation'] = download_audio('us', us_pronunciation)
jta = JsonToApkg(dictionary_item)
+ # print(dictionary_item)
jta.generate_apkg()
+ # print("Generated.")
return dictionary_item
diff --git a/src/lib/__pycache__/json_to_apkg.cpython-38.pyc b/src/lib/__pycache__/json_to_apkg.cpython-38.pyc
index d87b2fe1fd004578da39d2a9117d2d18343038ec..977263be3dc565339408e4af10063848a3129d75 100644
GIT binary patch
delta 148
zcmdljdt8<;l$V!_0SFAGJ(6asPUM@yC^d0$7^A?%Uz<2;n2Wtqm}{9fuVKt$WU~S(
zW1cL-Y{bgN#KOcj*`IkOW60*8%o8}d!+}!82@nNSc=dIaco>l(6C=-e7Csh6CPt=z
eY^)HPg@ucSi&2V&g=z8!-ZaLb$>Ds-j0ON}1RWFr
delta 135
zcmX>uyIYnol$V!_0SFfScS|~=Jdtk-qtL{~VT>FTe{I^lg)xhf%?u>VI9Y|+h>>S<
zH1kSEzs;;H6F9j8fx^WJAW5dl3wZT)6?qtuArm9dcNSh2MkYq4e{8G}nuVE*nTt`1
Tg@tkQAKo-ZpUKI5$&6Y6Sppkv
diff --git a/src/lib/json_to_apkg.py b/src/lib/json_to_apkg.py
index 2039687..52aed3c 100644
--- a/src/lib/json_to_apkg.py
+++ b/src/lib/json_to_apkg.py
@@ -32,6 +32,8 @@
def generate_cloze(phrase: str):
+ # print("Starting generate_cloze..")
+ # print(phrase)
n = len(phrase) - phrase.count(' ')
if (n % 2) == 0:
u_count = int(n/2)
@@ -68,8 +70,12 @@ def generate_cloze(phrase: str):
cloze_list[temp_index] = cloze_text
phrase_list.remove(temp_word)
# print(phrase_list)
+ if not phrase_list:
+ # print("Avoiding error")
+ u_count = 0
# print(cloze_list)
# print("end of loop:", temp_len, u_count)
+ # print("No Problem in generate_cloze")
return ' '.join(cloze_list)
@@ -80,6 +86,7 @@ def __init__(self, j_dict):
def generate_apkg(self):
# create/initialize model
+ # print('before my_model')
my_model = genanki.Model(
1646879431108, # todo: change id and also create new customized structure
name='English Vocab',
@@ -112,6 +119,7 @@ def generate_apkg(self):
# just do these steps
# automatic fill
# todo: cloze, picture, synonyms, arrange in order, if sound not there then?
+ # print('before list_of_fields')
list_of_fields = [
self.j_dict.get("word", ""),
self.j_dict.get("part_of_speech", ""),
@@ -125,23 +133,27 @@ def generate_apkg(self):
]
# list_of_fields = [x for x in self.j_dict.values()]
+ # print('Before my_note')
my_note = genanki.Note(
model=my_model,
fields=list_of_fields
)
-
+ # print('Before my_deck')
my_deck = genanki.Deck(
1646145285163, # todo: change id and name
"English Vocabulary (British Accent)")
+ # print('before adding a note to deck')
my_deck.add_note(my_note)
# add media
+ # print('before my_package')
my_package = genanki.Package(my_deck)
my_package.media_files = ['media/' + self.j_dict["pronunciation_word"][7:-1:]]
# generate apkg
# my_package.write_to_file('output-' + self.j_dict["word"] + '.apkg')
# apkg_filename = 'output-' + dt.now().strftime("%Y%m%d%H%M%S") + '.apkg'
apkg_filename = 'output' + '.apkg'
+ # print('before writing')
my_package.write_to_file(apkg_filename)
return apkg_filename