Write the image-stripped PDF into a folder named after the input file
instead of a sibling '*_remove_image.pdf', and drop the '_remove_image'
suffix from the generated .pdf/.mmd names. test.py now reports the
TensorFlow version and the visible GPUs instead of probing nougat.

diff --git a/rag/scraper/Scrape_pdf/Scrape_pdf.py b/rag/scraper/Scrape_pdf/Scrape_pdf.py
index 205ea95..c5d0afa 100644
--- a/rag/scraper/Scrape_pdf/Scrape_pdf.py
+++ b/rag/scraper/Scrape_pdf/Scrape_pdf.py
@@ -4,6 +4,33 @@ from rag.scraper.Scrape_pdf.pdf_helper import missing_page_fill
 import os
 
 
+def create_folder_from_filename(filename: str) -> str:
+    """Create a folder named after the filename without its extension.
+
+    Args:
+        filename (str): The filename to be used for creating the folder.
+
+    Returns:
+        str: The path to the created or existing folder.
+
+    Raises:
+        FileExistsError: If the path exists but is not a folder.
+    """
+    # Extract the folder name from the filename (remove extension if any)
+    folder_name = os.path.splitext(filename)[0]
+
+    # Test with isdir rather than exists so that a regular file at this
+    # path is never reported as an existing folder.
+    if os.path.isdir(folder_name):
+        print(f"Folder already exists: {folder_name}")
+    else:
+        # exist_ok tolerates the folder appearing after the check above
+        os.makedirs(folder_name, exist_ok=True)
+        print(f"Created folder: {folder_name}")
+
+    return folder_name
+
+
 def add_remove_image_suffix(pdf_path: str) -> str:
-    """Add '_remove_image' suffix to a PDF file path while keeping the .pdf extension."""
+    """Return the PDF path unchanged; the '_remove_image' suffix is no longer added."""
 
@@ -12,7 +39,7 @@ def add_remove_image_suffix(pdf_path: str) -> str:
         raise ValueError("The input file must be a PDF.")
 
-    # Add the suffix before the file extension
-    new_path = input_path.with_stem(input_path.stem + "_remove_image")
+    # No suffix is appended any more; the stem is kept as is
+    new_path = input_path.with_stem(input_path.stem)
     return str(new_path)
 
 
@@ -36,24 +63,39 @@ def remove_images(input_pdf, output_pdf):
     doc.save(output_pdf)
     doc.close()
 
-def process_pdf(input_pdf: str) -> str:
-    """Process the PDF by removing images and saving to a new file with '_remove_image' suffix."""
-    output_pdf = add_remove_image_suffix(input_pdf)
+def process_pdf(input_pdf: str, output_folder: str) -> str:
+    """Process the PDF by removing images and saving to a new file in the specified folder.
+
+    Args:
+        input_pdf (str): Path of the PDF to strip images from.
+        output_folder (str): Folder that receives the processed copy; created if missing.
+
+    Returns:
+        str: Path of the processed PDF inside output_folder.
+    """
+    # exist_ok makes this a no-op when the folder is already there
+    os.makedirs(output_folder, exist_ok=True)
+
+    output_pdf_name = add_remove_image_suffix(os.path.basename(input_pdf))
+    output_pdf = os.path.join(output_folder, output_pdf_name)
     remove_images(input_pdf, output_pdf)
     return output_pdf
 
 
 def generate_mmd_file_path(folder_path):
     folder_name = os.path.basename(folder_path)
-    mmd_file_path = os.path.join(folder_path, f"{folder_name}_remove_image.mmd")
+    mmd_file_path = os.path.join(folder_path, f"{folder_name}.mmd")
     return mmd_file_path
 
 
-input_pdf = "rag\scraper\Scrape_pdf\example_pdf.pdf"
-folder_name = "rag\scraper\Scrape_pdf\example_pdf"
+# Build the path with os.path.join: the backslash literal relied on invalid
+# escape sequences (\s, \S, \e) and only worked as a path on Windows.
+input_pdf = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf.pdf")
+
 
 if __name__ == '__main__':
-    output_pdf = process_pdf(input_pdf)
+    folder_name = create_folder_from_filename(input_pdf)
+    output_pdf = process_pdf(input_pdf, folder_name)
     pdf_to_md(output_pdf, folder_name)
     md_file_path = generate_mmd_file_path(folder_name)
     missing_page_fill(output_pdf, folder_name, md_file_path)
diff --git a/test.py b/test.py
index 69f68ca..43d84de 100644
--- a/test.py
+++ b/test.py
@@ -1,3 +1,12 @@
-import nougat
-print(dir(nougat))
+import tensorflow as tf
+print("TensorFlow version:", tf.__version__)
+# Query the GPU list once and reuse it below
+gpus = tf.config.experimental.list_physical_devices('GPU')
+print("Num GPUs Available: ", len(gpus))
+
+if not gpus:
+    print("No GPU found. Please check your installation.")
+else:
+    for device in gpus:
+        print("GPU Device:", device)
 