Skip to content

Commit

Permalink
try new function
Browse files Browse the repository at this point in the history
  • Loading branch information
Catrunaround committed Jul 9, 2024
1 parent 439417f commit 23ffdd4
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 9 deletions.
40 changes: 33 additions & 7 deletions rag/scraper/Scrape_pdf/Scrape_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@
from rag.scraper.Scrape_pdf.pdf_helper import missing_page_fill
import os

def create_folder_from_filename(filename: str) -> str:
    """Create a folder named after ``filename`` with its extension removed.

    Args:
        filename (str): The filename to be used for creating the folder.

    Returns:
        str: The path to the created or existing folder.

    Raises:
        FileExistsError: If the derived path exists but is not a directory.
    """
    # Extract the folder name from the filename (remove extension if any)
    folder_name = os.path.splitext(filename)[0]

    if os.path.isdir(folder_name):
        print(f"Folder already exists: {folder_name}")
    else:
        # exist_ok=True tolerates another process creating the folder between
        # the check above and this call; a regular file sitting at this path
        # still raises FileExistsError instead of being reported as a folder.
        os.makedirs(folder_name, exist_ok=True)
        print(f"Created folder: {folder_name}")

    return folder_name

def add_remove_image_suffix(pdf_path: str) -> str:
"""Add '_remove_image' suffix to a PDF file path while keeping the .pdf extension."""
Expand All @@ -12,7 +32,7 @@ def add_remove_image_suffix(pdf_path: str) -> str:
raise ValueError("The input file must be a PDF.")

# Add the suffix before the file extension
new_path = input_path.with_stem(input_path.stem + "_remove_image")
new_path = input_path.with_stem(input_path.stem)

return str(new_path)

Expand All @@ -36,24 +56,30 @@ def remove_images(input_pdf, output_pdf):
doc.save(output_pdf)
doc.close()

def process_pdf(input_pdf: str, output_folder: str) -> str:
    """Process the PDF by removing images and saving the result in a folder.

    The output file name is whatever ``add_remove_image_suffix`` returns for
    the base name of ``input_pdf``; it is placed inside ``output_folder``.

    Args:
        input_pdf (str): Path to the source PDF.
        output_folder (str): Directory to write the processed PDF into. It is
            created (including parents) when it does not exist yet.

    Returns:
        str: Path of the processed PDF inside ``output_folder``.

    Raises:
        FileExistsError: If ``output_folder`` exists but is not a directory.
    """
    # exist_ok=True replaces a separate os.path.exists() check, which could
    # race with another process creating the same folder.
    os.makedirs(output_folder, exist_ok=True)

    output_pdf_name = add_remove_image_suffix(os.path.basename(input_pdf))
    output_pdf = os.path.join(output_folder, output_pdf_name)
    remove_images(input_pdf, output_pdf)
    return output_pdf

def generate_mmd_file_path(folder_path: str) -> str:
    """Return the path of the ``.mmd`` file named after ``folder_path``.

    Args:
        folder_path (str): Folder that holds the ``.mmd`` file. Its last path
            component is used as the file's base name.

    Returns:
        str: ``<folder_path>/<folder name>.mmd``.
    """
    # Drop trailing separators first: os.path.basename("a/b/") is "", which
    # would otherwise produce a file called just ".mmd".
    trimmed = folder_path.rstrip(os.sep + (os.altsep or ""))
    folder_name = os.path.basename(trimmed)
    return os.path.join(folder_path, f"{folder_name}.mmd")


# Example input PDF and its working folder. Built with os.path.join so the
# separators are right on every OS; backslash-separated literals in normal
# (non-raw) strings are Windows-only and contain invalid escape sequences.
input_pdf = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf.pdf")
folder_name = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf")


if __name__ == '__main__':
    # Pipeline: make the per-document folder, write the processed PDF into
    # it, convert that PDF, then hand the PDF, folder and .mmd path to
    # missing_page_fill.
    folder_name = create_folder_from_filename(input_pdf)
    output_pdf = process_pdf(input_pdf, folder_name)
    pdf_to_md(output_pdf, folder_name)
    md_file_path = generate_mmd_file_path(folder_name)
    missing_page_fill(output_pdf, folder_name, md_file_path)
11 changes: 9 additions & 2 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import nougat
print(dir(nougat))
import tensorflow as tf

# Report the TensorFlow version and the GPUs it can see.
print("TensorFlow version:", tf.__version__)

# Query the device list once and reuse it for every line printed below.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

if gpu_devices:
    for gpu in gpu_devices:
        print("GPU Device:", gpu)
else:
    print("No GPU found. Please check your installation.")

0 comments on commit 23ffdd4

Please sign in to comment.