Skip to content

Commit

Permalink
Merge pull request #6 from gabriel-batistuta/dir_to_html
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-batistuta authored Aug 9, 2024
2 parents edc684d + 558aa25 commit 2a189f9
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 2 deletions.
3 changes: 2 additions & 1 deletion pypdf2htmlEX/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .core import PDF
from .core import PDF
from .core import dir_to_html
35 changes: 34 additions & 1 deletion pypdf2htmlEX/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import subprocess
import os
from os.path import basename

class PDF():
def __init__(self, pdf_file_path:str, drm=None):
Expand All @@ -7,6 +9,13 @@ def __init__(self, pdf_file_path:str, drm=None):

def __make_html(self, args=None):

'''
Private main function to make the HTML file from the PDF passing the parameters.
Parameters:
args (list, optional): Additional arguments to pass to pdf2htmlEX that is not in the library. Defaults to None.
'''

def add_extra_options(args, options):
if args:
options.extend(args)
Expand All @@ -24,9 +33,33 @@ def add_extra_options(args, options):
subprocess.call(options)

def to_html(self, file_path=None):
    '''
    Run the PDF-to-HTML conversion for this instance.
    Parameters:
    file_path (str, optional): Value forwarded to pdf2htmlEX as the "--dest-dir" option. When it is None (or any other falsy value) the conversion is started without extra arguments.
    '''

    # Guard clause: no destination requested, so run with the default options only.
    if not file_path:
        self.__make_html()
        return

    # The destination is handed over as an extra command-line option pair.
    destination_options = ["--dest-dir", file_path]
    self.__make_html(destination_options)


def dir_to_html(dir_path, dest_dir=None):
    '''
    Converts all PDF files located directly inside a directory to HTML.
    Parameters:
    dir_path (str): The path to the directory containing PDF files. Only the top level is scanned; sub-directories are not searched.
    dest_dir (str, optional): The destination directory for the HTML files. It is passed unchanged to PDF.to_html(), which forwards it as the "--dest-dir" option. When None (or empty), to_html() is called without a destination.
    '''

    # Build the full paths with os.path.join, accept any letter case of the
    # ".pdf" extension, skip non-files (e.g. a directory named "x.pdf"), and
    # sort so the conversion order does not depend on os.listdir ordering.
    pdf_files = sorted(
        path
        for path in (os.path.join(dir_path, name) for name in os.listdir(dir_path))
        if path.lower().endswith(".pdf") and os.path.isfile(path)
    )

    for pdf_file in pdf_files:
        # drm=True is kept from the original call; its effect is defined by
        # the PDF class.
        pdf = PDF(pdf_file, drm=True)
        if dest_dir:
            # to_html() uses its argument as a destination *directory*
            # ("--dest-dir"), so hand over dest_dir itself rather than a
            # per-file ".html" path, which would create one directory per PDF.
            pdf.to_html(file_path=dest_dir)
        else:
            pdf.to_html()

0 comments on commit 2a189f9

Please sign in to comment.