diff --git a/README.md b/README.md index 02a63a5..2c95ff0 100644 --- a/README.md +++ b/README.md @@ -144,9 +144,10 @@ convert( ) ``` -The `ConversionConfig` class accepts the same parameters as the command line arguments: +The `ConversionConfig` class accepts the following parameters: -- `pptx_path`: Path to the input PPTX file (required) +- `pptx`: A file-like object containing the PPTX data (required if `pptx_path` is not provided; not available via command line) +- `pptx_path`: Path to the input PPTX file (required if `pptx` is not provided or if using the command line) - `output_path`: Path for the output markdown file (required) - `image_dir`: Directory for extracted images (required) - `title_path`: Path to custom titles file @@ -165,7 +166,7 @@ The `ConversionConfig` class accepts the same parameters as the command line arg - `page`: Convert only specified page number - `keep_similar_titles`: Keep similar titles with "(cont.)" suffix - +Note: Provide either `pptx_path` or `pptx`, not both. If both are provided, `pptx_path` takes precedence.
## Detailed Parse Rules diff --git a/pptx2md/entry.py b/pptx2md/entry.py index 6ad1bb8..7cb03b2 100644 --- a/pptx2md/entry.py +++ b/pptx2md/entry.py @@ -26,7 +26,7 @@ def convert(config: ConversionConfig): if config.title_path: config.custom_titles = prepare_titles(config.title_path) - prs = load_pptx(config.pptx_path) + prs = load_pptx(config) logger.info("conversion started") diff --git a/pptx2md/parser.py b/pptx2md/parser.py index edcc10a..6ddecdc 100644 --- a/pptx2md/parser.py +++ b/pptx2md/parser.py @@ -150,8 +150,11 @@ def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageEl global picture_count - file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1]) - pic_name = file_prefix + f'_{picture_count}' + if config.pptx_path is None: + pic_name = f'img_{picture_count}' + else: + file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1]) + pic_name = file_prefix + f'_{picture_count}' pic_ext = shape.image.ext if not os.path.exists(config.image_dir): os.makedirs(config.image_dir) diff --git a/pptx2md/types.py b/pptx2md/types.py index 0de9e4f..e19c68e 100644 --- a/pptx2md/types.py +++ b/pptx2md/types.py @@ -14,17 +14,27 @@ from __future__ import annotations +import logging from enum import Enum from pathlib import Path +from io import BytesIO, BufferedReader from typing import List, Optional, Union -from pydantic import BaseModel +from pydantic import BaseModel, model_validator, ConfigDict + +logger = logging.getLogger(__name__) + + +FileLikeType = Union[BytesIO, BufferedReader] class ConversionConfig(BaseModel): """Configuration for PowerPoint to Markdown conversion.""" - pptx_path: Path + pptx: Optional[FileLikeType] = None + """File-like object of the pptx file to be converted""" + + pptx_path: Optional[Path] = None """Path to the pptx file to be converted""" output_path: Path @@ -81,6 +91,19 @@ class ConversionConfig(BaseModel): keep_similar_titles: bool = False """Keep similar titles (allow for repeated 
slide titles - One or more - Add (cont.) to the title)""" + @model_validator(mode="after") + def check_pptx_input(self): + if self.pptx is None and self.pptx_path is None: + raise ValueError("One of 'pptx' or 'pptx_path' must be supplied.") + elif self.pptx is not None and self.pptx_path is not None: + logger.warning( + "Both 'pptx' and 'pptx_path' are supplied. Using 'pptx_path' as the input file." + ) + self.pptx = None + return self + + model_config = ConfigDict(arbitrary_types_allowed=True) + class ElementType(str, Enum): Title = "Title" @@ -145,7 +168,9 @@ class TableElement(BaseElement): content: List[List[List[TextRun]]] # rows -> cols -> rich text -SlideElement = Union[TitleElement, ListItemElement, ParagraphElement, ImageElement, TableElement] +SlideElement = Union[ + TitleElement, ListItemElement, ParagraphElement, ImageElement, TableElement +] class SlideType(str, Enum): diff --git a/pptx2md/utils.py b/pptx2md/utils.py index c3daa1c..202292c 100644 --- a/pptx2md/utils.py +++ b/pptx2md/utils.py @@ -22,68 +22,96 @@ from pptx import Presentation +from pptx2md.types import ConversionConfig, FileLikeType + logger = logging.getLogger(__name__) def fix_null_rels(file_path): temp_dir_name = tempfile.mkdtemp() - shutil.unpack_archive(file_path, temp_dir_name, 'zip') + shutil.unpack_archive(file_path, temp_dir_name, "zip") rels = [ os.path.join(dp, f) for dp, dn, filenames in os.walk(temp_dir_name) for f in filenames - if os.path.splitext(f)[1] == '.rels' + if os.path.splitext(f)[1] == ".rels" ] pat = re.compile(r'<\S*Relationship[^>]+Target\S*=\S*"NULL"[^>]*/>', re.I) for fn in rels: - f = open(fn, 'r+') + f = open(fn, "r+") content = f.read() res = pat.search(content) if res is not None: - content = pat.sub('', content) + content = pat.sub("", content) f.seek(0) f.truncate() f.write(content) f.close() tfn = uuid.uuid4().hex - shutil.make_archive(tfn, 'zip', temp_dir_name) + shutil.make_archive(tfn, "zip", temp_dir_name) shutil.rmtree(temp_dir_name) - tgt = 
f'{file_path[:-5]}_purged.pptx' - shutil.move(f'{tfn}.zip', tgt) + tgt = f"{file_path[:-5]}_purged.pptx" + shutil.move(f"{tfn}.zip", tgt) return tgt -def load_pptx(file_path: str) -> Presentation: +def load_pptx_from_io(file_like: FileLikeType) -> Presentation: + """Load a PowerPoint presentation from a file-like object.""" + try: + file_like.seek(0) + prs = Presentation(file_like) + except Exception as err: + raise ValueError( + "Invalid file-like object. Please provide a valid PPTX file." + ) from err + return prs + + +def load_pptx_from_path(file_path: Path) -> Presentation: + """Load a PowerPoint presentation from a file path.""" if not os.path.exists(file_path): - logger.error(f'source file {file_path} not exist!') - logger.error(f'absolute path: {os.path.abspath(file_path)}') + logger.error(f"source file {file_path} not exist!") + logger.error(f"absolute path: {os.path.abspath(file_path)}") raise FileNotFoundError(file_path) try: - prs = Presentation(file_path) + prs = Presentation(str(file_path)) except KeyError as err: - if len(err.args) > 0 and re.match(r'There is no item named .*NULL.* in the archive', str(err.args[0])): - logger.info('corrupted links found, trying to purge...') + if len(err.args) > 0 and re.match( + r"There is no item named .*NULL.* in the archive", str(err.args[0]) + ): + logger.info("corrupted links found, trying to purge...") try: res_path = fix_null_rels(file_path) - logger.info(f'purged file saved to {res_path}.') + logger.info(f"purged file saved to {res_path}.") prs = Presentation(res_path) except: logger.error( - 'failed to purge corrupted links, you can report this at https://github.com/ssine/pptx2md/issues') + "failed to purge corrupted links, you can report this at https://github.com/ssine/pptx2md/issues" + ) raise err else: - logger.error('unknown error, you can report this at https://github.com/ssine/pptx2md/issues') + logger.error( + "unknown error, you can report this at https://github.com/ssine/pptx2md/issues" + ) raise 
err return prs +def load_pptx(config: ConversionConfig) -> Presentation: + """Load a PowerPoint presentation from a file-like object or a file path.""" + if config.pptx: + return load_pptx_from_io(config.pptx) + elif config.pptx_path: + return load_pptx_from_path(config.pptx_path) + + def prepare_titles(title_path: Path) -> dict[str, int]: titles: dict[str, int] = {} - with open(title_path, 'r', encoding='utf8') as f: + with open(title_path, "r", encoding="utf8") as f: indent = -1 for line in f.readlines(): cnt = 0 - while line[cnt] == ' ': + while line[cnt] == " ": cnt += 1 if cnt == 0: titles[line.strip()] = 1 @@ -98,4 +126,4 @@ def prepare_titles(title_path: Path) -> dict[str, int]: def rgb_to_hex(rgb): r, g, b = rgb - return f'#{r:02x}{g:02x}{b:02x}' + return f"#{r:02x}{g:02x}{b:02x}"