Merge pull request #10 from gabriel-batistuta/dir_to_html

gabriel-batistuta · Aug 10, 2024 · 11f0900 · 11f0900
2 parents e3cc693 + 1592700
commit 11f0900
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 9 deletions.
diff --git a/pypdf2htmlEX/core.py b/pypdf2htmlEX/core.py
@@ -1,6 +1,7 @@
 import subprocess
 import os
 from os.path import basename
+from typing import Union
 
 class PDF():
     def __init__(self, pdf_file_path:str, drm=None):
@@ -26,40 +27,55 @@ def add_extra_options(args, options):
         if self.drm is True:
             options = ["pdf2htmlEX", f"{self.file_path}", "--no-drm", "1"]
             options = add_extra_options(args, options)
+            print(options)
+            for i in options:
+                print(i, end=" ")
             subprocess.call(options)
         else:
             options = ["pdf2htmlEX", f"{self.file_path}"]
             options = add_extra_options(args, options)
             subprocess.call(options)
 
-    def to_html(self, file_path=None):
+    def to_html(self, dest_dir=None, pdf_filename=None, new_file_name=None):
         '''
         Converts the PDF file to HTML.
 
         Parameters:
-        file_path (str, optional): The path to save the converted HTML file. Defaults to None will save in the same directory that pdf_file_path of PDF class.
+        dest_dir (str, optional): The directory in which to save the converted HTML file. Defaults to None, which saves it in the same directory as the pdf_file_path given to the PDF class.
         '''
 
-        if file_path:
-            args = ["--dest-dir", file_path]
+        if dest_dir:
+            args = ["--dest-dir", dest_dir]
             self.__make_html(args)
         else:
             self.__make_html()
+        if pdf_filename and new_file_name:
+            try:
+                pdf_filename = pdf_filename.replace(".pdf", ".html")
+                os.rename(f"{dest_dir}/{pdf_filename}", f"{dest_dir}/{new_file_name}")
+            except Exception as e:
+                print(f"Error renaming {dest_dir}/{pdf_filename}: {e}")
 
-def dir_to_html(dir_path, dest_dir=None):
+def dir_to_html(dir_path, dest_dir=None, new_file_name=None):
     '''
     Converts all PDF files in a directory to HTML.
 
     Parameters:
-    dir_path (str): The path to the directory containing PDF files.
     dest_dir (str, optional): The path to the destination directory in which to save the HTML files. Defaults to None, which saves them in the same directory.
+    new_file_name (str, optional): The base name for the generated HTML files. A sequential suffix is appended for each PDF, e.g. name_1.html, name_2.html. Defaults to None, which mainly keeps each file's original name, replacing .pdf with .html.
     '''
 
     pdf_files = [f"{dir_path}/{file}" for file in os.listdir(dir_path) if file.endswith(".pdf")]
 
-    for pdf_file in pdf_files:
+    for i, pdf_file in enumerate(pdf_files):
         pdf = PDF(pdf_file, drm=True)
         if dest_dir:
-            pdf.to_html(file_path=f'{dest_dir}/{basename(pdf_file).replace(".pdf", ".html")}')
+            if new_file_name:
+                if '.html' not in new_file_name:
+                    pdf.to_html(dest_dir=dest_dir, pdf_filename=basename(pdf_file), new_file_name=f"{new_file_name}_{i+1}.html")
+                else:
+                    pdf.to_html(dest_dir=dest_dir, pdf_filename=basename(pdf_file), new_file_name=f'{new_file_name.replace(".html","")}_{i+1}.html')
+            else:
+                pdf.to_html(dest_dir=dest_dir)
         else:
             pdf.to_html()
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
     readme = file.read()
 
 setup(name='pypdf2htmlex',
-    version='1.6',
+    version='1.8',
     license='MIT License',
     author='Gabriel Batistuta',
     long_description=readme,