1+ import os
2+ from PyPDF2 import PdfReader
3+
# Input and output folders. Both are relative paths, so they resolve against
# the current working directory of the process.
INPUT_FOLDER = "input"
OUTPUT_FOLDER = "output"

# Ensure the output directory exists; exist_ok=True makes this a no-op when
# the directory is already present.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
10+
def extract_text_from_pdf(pdf_path) -> str:
    """Extract all text from a single PDF file.

    The text of each page is preceded by a ``--- Page N ---`` marker, where
    ``N`` is the 1-based page number.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        The page markers and page texts concatenated in page order. If
        anything fails while opening or reading the file, the error is
        printed and whatever was collected before the failure is returned
        (an empty string when nothing was read).
    """
    # Collect pieces in a list and join once at the end instead of growing a
    # string with ``+=`` on every page.
    pieces = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PdfReader(file)
            for page_num, page in enumerate(reader.pages, start=1):
                pieces.append(f"\n --- Page {page_num} ---\n ")
                # A falsy extraction result (e.g. None or "") contributes
                # no text for that page.
                pieces.append(page.extract_text() or "")
    except Exception as e:
        # Deliberately best-effort: report the problem and fall through so the
        # caller still receives the text gathered so far.
        print(f"Error reading {pdf_path} : {e} ")
    return "".join(pieces)
23+
def process_all_pdfs():
    """Read all PDFs in the input folder and save their text to the output folder.

    For every regular file in ``INPUT_FOLDER`` whose name ends in ``.pdf``
    (case-insensitive), the text returned by ``extract_text_from_pdf`` is
    written as UTF-8 to ``OUTPUT_FOLDER`` under the same base name with a
    ``.txt`` extension. Files are handled in sorted name order.

    Prints a message and returns without writing anything when the input
    folder does not exist or contains no PDF files.
    """
    try:
        entries = os.listdir(INPUT_FOLDER)
    except FileNotFoundError:
        # A missing input folder is an expected first-run situation; report it
        # instead of crashing with a traceback.
        print(f"Input folder '{INPUT_FOLDER}/' does not exist.")
        return

    # Keep only regular files so a directory that happens to be named
    # "something.pdf" is not treated as a document. Sorting makes the
    # processing order independent of the order os.listdir returns.
    pdf_files = sorted(
        name
        for name in entries
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(INPUT_FOLDER, name))
    )

    if not pdf_files:
        print(f"No PDF files found in '{INPUT_FOLDER}/' folder.")
        return

    # Make sure the destination exists even if it was removed after import.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    for pdf_file in pdf_files:
        pdf_path = os.path.join(INPUT_FOLDER, pdf_file)
        txt_filename = os.path.splitext(pdf_file)[0] + ".txt"
        txt_path = os.path.join(OUTPUT_FOLDER, txt_filename)

        print(f"Processing {pdf_file} ...")
        text = extract_text_from_pdf(pdf_path)

        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Saved extracted text to {txt_path} ")
44+
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_all_pdfs()
47+
0 commit comments