Skip to content

Commit 9f99f72

Browse files
authored
Merge pull request #85 from haragam22/main
Add PDF text extraction feature using PyPDF2
2 parents 769cfa9 + 9c1a7b4 commit 9f99f72

File tree

4 files changed

+48
-0
lines changed

4 files changed

+48
-0
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
from PyPDF2 import PdfReader
3+
4+
# Input and output folders (relative paths, resolved against the current
# working directory of the process).
INPUT_FOLDER = "input"
OUTPUT_FOLDER = "output"

# Ensure output directory exists. This runs at import time, so the folder is
# present before any extracted text is written.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
10+
11+
def extract_text_from_pdf(pdf_path):
    """Extract all text from a single PDF file.

    Each page's text is preceded by a ``--- Page N ---`` separator line
    (1-based page numbers).

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages. If reading fails, the error is
        printed and whatever was collected before the failure is returned
        (an empty string when the file could not be opened at all).
    """
    # Collect fragments and join once instead of growing a string with "+=".
    parts = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PdfReader(file)
            for page_num, page in enumerate(reader.pages, start=1):
                parts.append(f"\n--- Page {page_num} ---\n")
                # extract_text() may yield nothing for a page; fall back to "".
                parts.append(page.extract_text() or "")
    except Exception as e:
        # Deliberate best-effort: report the problem and keep partial output
        # so one unreadable PDF does not abort a whole batch run.
        print(f"Error reading {pdf_path}: {e}")
    return "".join(parts)
23+
24+
def process_all_pdfs():
    """Read all PDFs in the input folder and save their text to the output folder.

    Every file in ``INPUT_FOLDER`` whose name ends with ``.pdf``
    (case-insensitive) is passed to ``extract_text_from_pdf`` and the result
    is written as UTF-8 to ``OUTPUT_FOLDER`` under the same base name with a
    ``.txt`` extension. Files are processed in sorted name order.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    try:
        entries = os.listdir(INPUT_FOLDER)
    except (FileNotFoundError, NotADirectoryError):
        # A missing input folder is reported instead of raising a traceback.
        print(f"Input folder '{INPUT_FOLDER}/' does not exist.")
        return

    # Sort so that runs are deterministic; os.listdir order is arbitrary.
    pdf_files = sorted(f for f in entries if f.lower().endswith(".pdf"))

    if not pdf_files:
        # Report the folder that was actually scanned.
        print(f"No PDF files found in '{INPUT_FOLDER}/' folder.")
        return

    # Make sure the destination exists even if it was removed after import.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    for pdf_file in pdf_files:
        pdf_path = os.path.join(INPUT_FOLDER, pdf_file)
        txt_filename = os.path.splitext(pdf_file)[0] + ".txt"
        txt_path = os.path.join(OUTPUT_FOLDER, txt_filename)

        print(f"Processing {pdf_file}...")
        text = extract_text_from_pdf(pdf_path)

        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Saved extracted text to {txt_path}")
44+
45+
# Run the batch extraction only when executed as a script, not on import.
if __name__ == "__main__":
    process_all_pdfs()
47+
1.99 MB
Binary file not shown.
15.1 MB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
PyPDF2

0 commit comments

Comments
 (0)