Skip to content

Commit

Permalink
Merge pull request #10 from gabriel-batistuta/dir_to_html
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-batistuta authored Aug 10, 2024
2 parents e3cc693 + 1592700 commit 11f0900
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
32 changes: 24 additions & 8 deletions pypdf2htmlEX/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import subprocess
import os
from os.path import basename
from typing import Union

class PDF():
def __init__(self, pdf_file_path:str, drm=None):
Expand All @@ -26,40 +27,55 @@ def add_extra_options(args, options):
if self.drm is True:
options = ["pdf2htmlEX", f"{self.file_path}", "--no-drm", "1"]
options = add_extra_options(args, options)
print(options)
for i in options:
print(i, end=" ")
subprocess.call(options)
else:
options = ["pdf2htmlEX", f"{self.file_path}"]
options = add_extra_options(args, options)
subprocess.call(options)

def to_html(self, file_path=None):
def to_html(self, dest_dir=None, pdf_filename=None, new_file_name=None):
'''
Converts the PDF file to HTML.
Parameters:
file_path (str, optional): The path to save the converted HTML file. Defaults to None will save in the same directory that pdf_file_path of PDF class.
dest_dir (str, optional): The directory in which to save the converted HTML file. Defaults to None, which saves it in the same directory as the pdf_file_path of the PDF class.
'''

if file_path:
args = ["--dest-dir", file_path]
if dest_dir:
args = ["--dest-dir", dest_dir]
self.__make_html(args)
else:
self.__make_html()
if pdf_filename and new_file_name:
try:
pdf_filename = pdf_filename.replace(".pdf", ".html")
os.rename(f"{dest_dir}/{pdf_filename}", f"{dest_dir}/{new_file_name}")
except Exception as e:
print(f"Error renaming {dest_dir}/{pdf_filename}: {e}")

def dir_to_html(dir_path, dest_dir=None):
def dir_to_html(dir_path, dest_dir=None, new_file_name=None):
'''
Converts all PDF files in a directory to HTML.
Parameters:
dir_path (str): The path to the directory containing PDF files.
dest_dir (str, optional): The path to the destination directory in which to save the HTML files. Defaults to None, which saves them in the same directory.
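new_file_name (str, optional): The base name for the generated HTML files. Files are numbered sequentially starting at 1, e.g. name_1.html, name_2.html, name_3.html; a ".html" suffix in the given name is dropped before numbering. Only applied when dest_dir is given. Defaults to None, which keeps each original file name with .pdf replaced by .html.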
'''

pdf_files = [f"{dir_path}/{file}" for file in os.listdir(dir_path) if file.endswith(".pdf")]

for pdf_file in pdf_files:
for i, pdf_file in enumerate(pdf_files):
pdf = PDF(pdf_file, drm=True)
if dest_dir:
pdf.to_html(file_path=f'{dest_dir}/{basename(pdf_file).replace(".pdf", ".html")}')
if new_file_name:
if '.html' not in new_file_name:
pdf.to_html(dest_dir=dest_dir, pdf_filename=basename(pdf_file), new_file_name=f"{new_file_name}_{i+1}.html")
else:
pdf.to_html(dest_dir=dest_dir, pdf_filename=basename(pdf_file), new_file_name=f'{new_file_name.replace(".html","")}_{i+1}.html')
else:
pdf.to_html(dest_dir=dest_dir)
else:
pdf.to_html()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
readme = file.read()

setup(name='pypdf2htmlex',
version='1.6',
version='1.8',
license='MIT License',
author='Gabriel Batistuta',
long_description=readme,
Expand Down

0 comments on commit 11f0900

Please sign in to comment.