From c70345b4ef1e43152e7f9ffb1881e5384e9094e9 Mon Sep 17 00:00:00 2001 From: HotTotem Date: Tue, 18 Feb 2025 13:32:36 +0000 Subject: [PATCH 1/3] added wikijs support, especially for image links in this first commit --- README.md | 4 +++- pptx2md/__main__.py | 2 ++ pptx2md/entry.py | 2 ++ pptx2md/outputter.py | 8 ++++++++ pptx2md/parser.py | 2 ++ pptx2md/types.py | 3 +++ 6 files changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 02a63a5..b0ba1ce 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ A tool to convert Powerpoint pptx file into markdown. * [Tiddlywiki](https://tiddlywiki.com/)'s wikitext * [Madoko](https://www.madoko.net/) * [Quarto](https://quarto.org/) +* [Wiki.js](https://js.wiki/) Markdown, with image links corrected for Wiki.js (forward slashes and lowercase names) _Please star this repo if you like it!_ @@ -87,7 +88,7 @@ Use it with `pptx2md [filename] -t titles.txt`. * `--enable-slides` deliniate slides `\n---\n`, this can help if you want to convert pptx slides to markdown slides * `--try-multi-column` try to detect multi-column slides (very slow) * `--min-block-size [size]` the minimum number of characters for a text block to be outputted -* `--wiki` / `--mdk` if you happen to be using tiddlywiki or madoko, this argument outputs the corresponding markup language +* `--wiki` / `--mdk` / `--wikijs` if you happen to be using TiddlyWiki, Madoko, or Wiki.js, this argument outputs the corresponding markup language * `--qmd` outputs to the qmd markup language used for [quarto](https://quarto.org/docs/presentations/revealjs/) powered presentations * `--page [number]` only convert the specified page * `--keep-similar-titles` keep similar titles and add "(cont.)" to repeated slide titles @@ -160,6 +161,7 @@ The `ConversionConfig` class accepts the same parameters as the command line arg - `try_multi_column`: Attempt to detect multi-column slides - `min_block_size`: Minimum text block size - `wiki`: Output in TiddlyWiki 
class WikijsFormatter(MarkdownFormatter):
    """Markdown formatter that writes image links rooted at ``/`` for Wiki.js.

    Only ``put_image`` is overridden; every other output method is inherited
    from ``MarkdownFormatter``.
    """

    def put_image(self, path, max_width=None):
        """Write an image reference for ``path`` to the output file.

        Args:
            path: Location of the image. It is URL-quoted and prefixed with
                ``/`` before being written.
            max_width: Optional maximum display width in pixels. When omitted
                a plain markdown image link is written; when given, an HTML
                ``<img>`` tag carrying a ``max-width`` style is written
                instead, because markdown image syntax cannot express a width.
        """
        # Quote once so both output forms reference exactly the same URL.
        image_url = f'/{urllib.parse.quote(path)}'
        if max_width is None:
            self.ofile.write(f'![]({image_url})\n\n')
        else:
            # The previous implementation wrote only blank lines here, which
            # silently dropped every width-constrained image from the output.
            self.ofile.write(f'<img src="{image_url}" style="max-width:{max_width}px;" />\n\n')
os.path.relpath(output_path, common_path) + img_outputter_path = Path(os.path.relpath(output_path, common_path)).as_posix() with open(output_path, 'wb') as f: f.write(shape.image.blob) picture_count += 1 diff --git a/pyproject.toml b/pyproject.toml index 9d21af0..410d78f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pptx2md" -version = "2.0.6" +version = "2.0.10" description = "This package converts pptx to markdown" repository = "https://github.com/ssine/pptx2md" authors = ["Liu Siyao "] From c0612ac0c62d5b8fafe498fbc5837e549e0ac3ad Mon Sep 17 00:00:00 2001 From: HotTotem Date: Tue, 18 Mar 2025 16:01:03 +0000 Subject: [PATCH 3/3] trying to remove master --- pptx2md/__main__.py | 1 + pptx2md/parser.py | 8 +++++++- pptx2md/types.py | 3 +++ pyproject.toml | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pptx2md/__main__.py b/pptx2md/__main__.py index ba348af..d6713a9 100644 --- a/pptx2md/__main__.py +++ b/pptx2md/__main__.py @@ -74,6 +74,7 @@ def parse_args() -> ConversionConfig: disable_color=args.disable_color, disable_escaping=args.disable_escaping, disable_notes=args.disable_notes, + disable_master=args.disable_master, enable_slides=args.enable_slides, try_multi_column=args.try_multi_column, is_wiki=args.wiki, diff --git a/pptx2md/parser.py b/pptx2md/parser.py index 1cfa16f..3c3e6aa 100644 --- a/pptx2md/parser.py +++ b/pptx2md/parser.py @@ -244,7 +244,13 @@ def process_shapes(config: ConversionConfig, current_shapes, slide_id: int) -> L def parse(config: ConversionConfig, prs: Presentation) -> ParsedPresentation: result = ParsedPresentation(slides=[]) - + if(config.disable_master): + for slide in prs.slides: + for shape in slide.shapes: + # You can selectively remove shapes (like text, pictures, etc.) 
+ if shape in prs.slide_master.slide_layouts[0]: + sp = shape + slide.shapes._spTree.remove(sp._element) for idx, slide in enumerate(tqdm(prs.slides, desc='Converting slides')): if config.page is not None and idx + 1 != config.page: continue diff --git a/pptx2md/types.py b/pptx2md/types.py index 7f0bd39..d02e82b 100644 --- a/pptx2md/types.py +++ b/pptx2md/types.py @@ -54,6 +54,9 @@ class ConversionConfig(BaseModel): disable_notes: bool = False """Do not add presenter notes""" + disable_master: bool = False + """Do not add master layout""" + enable_slides: bool = False """Deliniate slides with `\n---\n`""" diff --git a/pyproject.toml b/pyproject.toml index 410d78f..d502de4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pptx2md" -version = "2.0.10" +version = "2.0.18" description = "This package converts pptx to markdown" repository = "https://github.com/ssine/pptx2md" authors = ["Liu Siyao "]