Skip to content

Commit 5051a5c

Browse files
committed
links and docker
1 parent 9b61f5d commit 5051a5c

File tree

6 files changed

+77
-46
lines changed

6 files changed

+77
-46
lines changed

Dockerfile

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Container image for the sefaria_ebooks web app: a Python virtualenv for the
# Flask service plus Calibre for the HTML -> EPUB conversion.
#
# The base image is pinned because python3.12-venv and the *t64 Qt package
# names installed below match Ubuntu 24.04; an unpinned "ubuntu" tag would
# break the build as soon as "latest" moves to another release.
FROM ubuntu:24.04

# Build-time only: keep apt from prompting (e.g. tzdata) during the build.
ARG DEBIAN_FRONTEND=noninteractive

# Docker builds already run as root, so sudo is not needed anywhere.
# "apt-get update" is kept in the same layer as "install" so a cached layer
# can never pair a stale package index with a newer install command.
RUN apt-get update && apt-get install -y \
        wget git xz-utils \
        python3 python3-pip python3.12-venv \
        libegl1 libopengl0 libxcb-cursor0 libfreetype6 \
        libqt6core6 libqt6gui6t64 libqt6widgets6t64 \
    && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/zevisvei/sefaria_ebooks.git

WORKDIR /sefaria_ebooks

# Isolated environment for the application's Python dependencies.
RUN python3 -m venv venv
RUN ./venv/bin/pip install -r requirements.txt

# Official Calibre binary installer (the app shells out to ebook-convert).
RUN wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin

# main.py starts Flask on 0.0.0.0:8000.
EXPOSE 8000

CMD ["./venv/bin/python", "main.py"]

main.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
app = Flask(__name__)
77

8-
# טוען את כל הספרים
98
sefaria_api_instance = sefaria_api.SefariaApi()
109
all_books = sefaria_api_instance.table_of_contents()
1110
list_all_books = utils.recursive_register_categories(all_books)
@@ -34,17 +33,16 @@ def run_script():
3433
book_dir = ' dir="rtl"' if lang == "hebrew" else ""
3534
book_content = result.process_book()
3635
book_content = f'<html lang={lang[:2]}><head><title></title></head><body{book_dir}>{"".join(book_content)}</body></html>'
36+
if "footnote-marker" in book_content:
37+
book_content = utils.footnotes_to_epub(book_content)
3738
metadata = result.get_metadata()
38-
39-
# יצירת קובץ HTML
39+
4040
with open(html_file, "w", encoding="utf-8") as f:
4141
f.write(book_content)
42-
43-
# יצירת קובץ EPUB
42+
4443
utils.to_ebook(html_file, epub_file, metadata)
4544
epub_file = os.path.abspath(f"{file_name}.epub")
46-
47-
# החזרת הקובץ להורדה
45+
4846
return send_file(
4947
epub_file,
5048
as_attachment=True,
@@ -62,4 +60,4 @@ def run_script():
6260

6361

6462
if __name__ == "__main__":
    # The server listens on all interfaces so it is reachable from outside
    # the container. The Werkzeug debugger allows arbitrary code execution
    # from the browser, so it must not be enabled unconditionally on a
    # publicly bound server: it is opt-in via FLASK_DEBUG=1.
    app.run(host='0.0.0.0', port=8000, debug=os.environ.get("FLASK_DEBUG") == "1")

main_from_export.py

+6-34
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,6 @@
11
from sefaria.get_from_export import Book
2-
from sefaria.utils import sanitize_filename
2+
from sefaria.utils import sanitize_filename, to_ebook, footnotes_to_epub
33
import os
4-
import subprocess
5-
6-
7-
def to_ebook(
8-
input_file: str,
9-
output_file: str,
10-
dict_args: dict[str, str],
11-
level1_toc: str = "//h:h1",
12-
level2_toc: str = "//h:h2",
13-
level3_toc: str = "//h:h3",
14-
):
15-
args = [
16-
"ebook-convert",
17-
input_file,
18-
output_file,
19-
f"--level1-toc={level1_toc}",
20-
f"--level2-toc={level2_toc}",
21-
f"--level3-toc={level3_toc}",
22-
]
23-
for key, value in dict_args.items():
24-
args.append(f"--{key}={value}")
25-
subprocess.run(
26-
args,
27-
stdout=subprocess.DEVNULL, # משתיק את הפלט
28-
stderr=subprocess.DEVNULL, # משתיק את השגיאות
29-
check=True,
30-
)
314

325

336
def get_book(book_title: str, text_file_path: str, schema_file_path: str, lang: str):
@@ -41,7 +14,6 @@ def get_book(book_title: str, text_file_path: str, schema_file_path: str, lang:
4114

4215

4316
def main(json_folder, schemas_folder, output_folder, lang: str):
44-
eroor_list = []
4517
"""
4618
Process all books in the given folder whose path ends with 'Hebrew/Merged.json'.
4719
It finds the corresponding schema file in the schemas folder by matching the
@@ -50,7 +22,7 @@ def main(json_folder, schemas_folder, output_folder, lang: str):
5022
:param folder_path: Path to the folder containing the book files.
5123
:param schemas_folder: Path to the folder containing the schema files.
5224
"""
53-
for root, _,files in os.walk(json_folder):
25+
for root, _, files in os.walk(json_folder):
5426
for file in files:
5527
file_path = os.path.join(root, file)
5628
if file_path.lower().endswith(f'{lang}{os.sep}merged.json'):
@@ -66,14 +38,15 @@ def main(json_folder, schemas_folder, output_folder, lang: str):
6638
print(output_file_name)
6739
book_dir = ' dir="rtl"' if lang == "hebrew" else ""
6840
book_content = f'<html lang={lang[:2]}><head><title></title></head><body{book_dir}>{"".join(book_content)}</body></html>'
41+
if "footnote-marker" in book_content:
42+
book_content = footnotes_to_epub(book_content)
6943
with open(f'{output_file_name}.html', 'w', encoding='utf-8') as file:
7044
file.write(book_content)
7145
to_ebook(f"{output_file_name}.html", f"{output_file_name}.epub", metadata)
7246
os.remove(f"{output_file_name}.html")
7347
except Exception as e:
74-
eroor_list.append(f"{file_path} {e}")
75-
with open("error.txt", "w", encoding="utf-8") as f:
76-
f.write("\n".join(eroor_list))
48+
with open("error.txt", "a", encoding="utf-8") as f:
49+
f.write(f"{file_path} {e}\n")
7750

7851

7952
json_folder = "json"
@@ -82,4 +55,3 @@ def main(json_folder, schemas_folder, output_folder, lang: str):
8255
lang = "hebrew"
8356
main(json_folder=json_folder, schemas_folder=schemas_folder,
8457
output_folder=output_folder, lang=lang)
85-

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
python_hebrew_numbers
22
requests
3-
flask
3+
flask
4+
bs4

sefaria/get_from_export.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,12 @@ def process_simple_book(self) -> None:
119119

120120
def process_node(self, node: dict, text: list, level: int = 0) -> None:
121121
node_title = node['heTitle'] if self.section_names_lang == "he" else node["title"]
122-
self.book_content.append(f"<h{min(level, 6)}>{node_title}</h{min(level, 6)}>\n")
122+
if node_title:
123+
self.book_content.append(f"<h{min(level, 6)}>{node_title}</h{min(level, 6)}>\n")
124+
level += 1
123125
if node.get("nodes"):
124126
for sub_node in node['nodes']:
125-
self.process_node(sub_node, text[sub_node['title']] if sub_node['key'] != 'default' else text[''], level=level+1)
127+
self.process_node(sub_node, text[sub_node['title']] if sub_node['key'] != 'default' else text[''], level=level)
126128
else: # Process nested arrays
127129
if self.section_names_lang == "he":
128130
section_names = node.get(
@@ -133,7 +135,7 @@ def process_node(self, node: dict, text: list, level: int = 0) -> None:
133135
"sectionNames"
134136
)
135137
depth = node.get('depth', 1)
136-
self.recursive_sections(section_names, text, depth, level+1)
138+
self.recursive_sections(section_names, text, depth, level)
137139

138140
def recursive_sections(
139141
self,

sefaria/utils.py

+35
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from hebrew_numbers import int_to_gematria
33
import json
44
import subprocess
5+
from bs4 import BeautifulSoup
56

67

78
def recursive_register_categories(
@@ -99,3 +100,37 @@ def to_ebook(
99100
stderr=subprocess.DEVNULL, # משתיק את השגיאות
100101
check=True,
101102
)
103+
104+
105+
def footnotes_to_epub(html_content: str) -> str:
    """Turn inline footnotes into linked endnotes suitable for an EPUB.

    Each ``<sup class="footnote-marker">`` whose next sibling tag is an
    ``<i class="footnote">`` is rewritten so that the marker becomes a link
    (``<a class="noteref" role="doc-noteref">``) pointing at the note, and
    the note text is moved into a ``<p>`` at the end of the document with a
    back-link to the marker. Markers without a matching footnote sibling are
    left untouched.

    :param html_content: The HTML document (or fragment) to process.
    :return: The rewritten HTML serialized back to a string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    notes = []
    for sup_tag in soup.find_all('sup', class_='footnote-marker'):
        next_tag = sup_tag.find_next_sibling()
        if next_tag is None or next_tag.name != 'i' or 'footnote' not in next_tag.get('class', []):
            continue

        # Read the marker text once, before the <sup> is modified below.
        marker_text = sup_tag.text
        note_number = len(notes) + 1
        note_id = f"note_{note_number}"
        back_note_id = f"back_note_{note_number}"

        # Forward link: marker -> note.
        sup_a = soup.new_tag("a", id=back_note_id, href=f"#{note_id}", title=marker_text, role="doc-noteref")
        # new_tag() does not translate a ``class_`` keyword into ``class``
        # (it would emit a literal ``class_`` attribute), so set it by key.
        sup_a["class"] = "noteref"
        sup_a.string = marker_text
        sup_tag.clear()
        sup_tag.append(sup_a)

        # Back link: note -> marker, followed by the note text itself.
        note_a = soup.new_tag("a", href=f"#{back_note_id}", title=marker_text)
        note_a.string = f"←{marker_text}"
        note_span = soup.new_tag("span", id=note_id)
        note_span.string = next_tag.text
        note_p = soup.new_tag("p")
        note_p.append(note_a)
        note_p.append(note_span)
        notes.append(note_p)

        # Remove the inline footnote body; it now lives in the endnotes.
        next_tag.extract()

    if notes:
        h1 = soup.new_tag("h1")
        h1.string = "הערות שוליים"
        # Fall back to the document root when there is no <body>; otherwise
        # the footnotes extracted above would be silently lost.
        container = soup.body if soup.body is not None else soup
        container.append(h1)
        for note in notes:
            container.append(note)

    return str(soup)

0 commit comments

Comments
 (0)