diff --git a/README.md b/README.md index 02a63a5..509afb3 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,11 @@ You need to have _[Python](https://www.python.org/)_ with version later than __3 pip install pptx2md ``` +### Optional Dependencies + +- `inkscape` (for converting WMF images to SVG) + + ### Usage Once you have installed it, use the command `pptx2md [pptx filename]` to convert _pptx file_ into markdown. diff --git a/pptx2md/__main__.py b/pptx2md/__main__.py index 2f5d592..0e9c5f5 100644 --- a/pptx2md/__main__.py +++ b/pptx2md/__main__.py @@ -5,17 +5,13 @@ # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import argparse import logging +import subprocess +import sys +import re from pathlib import Path - from pptx2md.entry import convert from pptx2md.log import setup_logging from pptx2md.types import ConversionConfig @@ -34,11 +30,9 @@ def parse_args() -> ConversionConfig: arg_parser.add_argument('--disable-image', action="store_true", help='disable image extraction') arg_parser.add_argument('--disable-wmf', action="store_true", - help='keep wmf formatted image untouched(avoid exceptions under linux)') + help='keep wmf formatted image untouched (avoid exceptions under linux)') arg_parser.add_argument('--disable-color', action="store_true", help='do not add color HTML tags') - arg_parser.add_argument('--disable-escaping', - action="store_true", - help='do not attempt to escape special characters') + arg_parser.add_argument('--disable-escaping', action="store_true", help='do not escape special characters') arg_parser.add_argument('--disable-notes', action="store_true", help='do not add presenter notes') arg_parser.add_argument('--enable-slides', action="store_true", help='deliniate slides `\n---\n`') arg_parser.add_argument('--try-multi-column', action="store_true", help='try to detect multi-column slides') @@ -48,18 +42,15 @@ def parse_args() -> ConversionConfig: arg_parser.add_argument('--min-block-size', type=int, default=15, - help='the minimum character number of a text block to be converted') + help='minimum characters per text block') arg_parser.add_argument("--page", type=int, default=None, help="only convert the specified page") - arg_parser.add_argument( - "--keep-similar-titles", - action="store_true", - help="keep similar titles (allow for repeated slide titles - One or more - Add (cont.) to the title)") + arg_parser.add_argument("--keep-similar-titles", action="store_true", + help="allow repeated slide titles (append '(cont.)')") args = arg_parser.parse_args() - # Determine output path if not specified + extension = '.tid' if args.wiki else '.qmd' if args.qmd else '.md' if args.output is None: - extension = '.tid' if args.wiki else '.qmd' if args.qmd else '.md' args.output = Path(f'out{extension}') return ConversionConfig( @@ -84,10 +75,52 @@ def parse_args() -> ConversionConfig: ) +def convert_wmf_to_svg(imgpath: Path): + for wmf in sorted(imgpath.glob("*.wmf")): + svg = wmf.with_suffix(".svg") + try: + subprocess.run([ + "inkscape", str(wmf), + "--export-type=svg", + "--export-filename", str(svg), + "--export-plain-svg" + ], check=True) + print(f"[INFO] Converted {wmf.name} → {svg.name}") + except subprocess.CalledProcessError as e: + print(f"[WARN] Failed to convert {wmf.name} to SVG: {e}") + + +def inject_svg_includes(md_file: Path, imgpath: Path, base_name: str): + lines = md_file.read_text(encoding="utf-8").splitlines() + new_lines = [] + slide_counter = 0 + + for line in lines: + if line.startswith("# "): + slide_counter += 1 + new_lines.append(line) + + svg_file = imgpath / f"{base_name}_{slide_counter}.svg" + if svg_file.exists(): + include_line = fr"\includesvg[width=0.9\linewidth]{{./{imgpath.name}/{svg_file.stem}}}" + if include_line not in new_lines: + new_lines.append("") + new_lines.append(f"") + new_lines.append(include_line) + new_lines.append("") + + md_file.write_text("\n".join(new_lines), encoding="utf-8") + + def main(): config = parse_args() convert(config) + if not config.disable_wmf: + convert_wmf_to_svg(config.image_dir) + inject_svg_includes(config.output_path, config.image_dir, config.pptx_path.stem) + if __name__ == '__main__': main() + diff --git a/pptx2md/parser.py b/pptx2md/parser.py index edcc10a..bf2c5e3 100644 --- a/pptx2md/parser.py +++ b/pptx2md/parser.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import logging import os +import subprocess from functools import partial from operator import attrgetter from typing import List, Union @@ -43,15 +42,16 @@ ) logger = logging.getLogger(__name__) - picture_count = 0 def is_title(shape): - if shape.is_placeholder and (shape.placeholder_format.type == PP_PLACEHOLDER.TITLE or - shape.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE or - shape.placeholder_format.type == PP_PLACEHOLDER.VERTICAL_TITLE or - shape.placeholder_format.type == PP_PLACEHOLDER.CENTER_TITLE): + if shape.is_placeholder and shape.placeholder_format.type in { + PP_PLACEHOLDER.TITLE, + PP_PLACEHOLDER.SUBTITLE, + PP_PLACEHOLDER.VERTICAL_TITLE, + PP_PLACEHOLDER.CENTER_TITLE, + }: return True return False @@ -77,17 +77,25 @@ def is_list_block(shape) -> bool: def is_accent(font): if font.underline or font.italic or ( - font.color.type == MSO_COLOR_TYPE.SCHEME and - (font.color.theme_color == MSO_THEME_COLOR.ACCENT_1 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_2 or - font.color.theme_color == MSO_THEME_COLOR.ACCENT_3 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_4 or - font.color.theme_color == MSO_THEME_COLOR.ACCENT_5 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_6)): + font.color.type == MSO_COLOR_TYPE.SCHEME + and font.color.theme_color in { + MSO_THEME_COLOR.ACCENT_1, + MSO_THEME_COLOR.ACCENT_2, + MSO_THEME_COLOR.ACCENT_3, + MSO_THEME_COLOR.ACCENT_4, + MSO_THEME_COLOR.ACCENT_5, + MSO_THEME_COLOR.ACCENT_6, + } + ): return True return False def is_strong(font): - if font.bold or (font.color.type == MSO_COLOR_TYPE.SCHEME and (font.color.theme_color == MSO_THEME_COLOR.DARK_1 or - font.color.theme_color == MSO_THEME_COLOR.DARK_2)): + if font.bold or ( + font.color.type == MSO_COLOR_TYPE.SCHEME + and font.color.theme_color in {MSO_THEME_COLOR.DARK_1, MSO_THEME_COLOR.DARK_2} + ): return True return False @@ -135,7 +143,6 @@ def process_text_blocks(config: ConversionConfig, shape, slide_idx) -> List[Unio text = get_text_runs(para) results.append(ListItemElement(content=text, level=para.level)) else: - # paragraph block for para in shape.text_frame.paragraphs: if para.text.strip() == '': continue @@ -144,12 +151,24 @@ def process_text_blocks(config: ConversionConfig, shape, slide_idx) -> List[Unio return results +def convert_wmf_to_svg(wmf_path: str, svg_path: str) -> bool: + try: + result = subprocess.run( + ['inkscape', wmf_path, '--export-type=svg', '--export-filename=' + svg_path], + check=True, + capture_output=True + ) + return True + except Exception as e: + logger.warning(f"Failed to convert {wmf_path} to SVG: {e}") + return False + + def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageElement, None]: if config.disable_image: return None global picture_count - file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1]) pic_name = file_prefix + f'_{picture_count}' pic_ext = shape.image.ext @@ -159,37 +178,30 @@ def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageEl output_path = config.image_dir / f'{pic_name}.{pic_ext}' common_path = os.path.commonpath([config.output_path, config.image_dir]) img_outputter_path = os.path.relpath(output_path, common_path) + with open(output_path, 'wb') as f: f.write(shape.image.blob) picture_count += 1 - # normal images - if pic_ext != 'wmf': + if pic_ext == 'wmf': + svg_path = config.image_dir / f'{pic_name}.svg' + if convert_wmf_to_svg(str(output_path), str(svg_path)): + logger.info(f"Converted WMF {output_path} to SVG {svg_path}") + img_outputter_path = os.path.relpath(svg_path, common_path) + return ImageElement(path=img_outputter_path, width=config.image_width) + else: + logger.warning(f"Failed to convert WMF {output_path}, skipped.") + return None + else: return ImageElement(path=img_outputter_path, width=config.image_width) - # wmf images, try to convert, if failed, output as original - try: - try: - Image.open(output_path).save(os.path.splitext(output_path)[0] + '.png') - return ImageElement(path=os.path.splitext(img_outputter_path)[0] + '.png', width=config.image_width) - except Exception: # Image failed, try another - from wand.image import Image - with Image(filename=output_path) as img: - img.format = 'png' - img.save(filename=os.path.splitext(output_path)[0] + '.png') - logger.info(f'Image {output_path} in slide {slide_idx} converted to png.') - return ImageElement(path=os.path.splitext(img_outputter_path)[0] + '.png', width=config.image_width) - except Exception: - logger.warning(f'Cannot convert wmf image {output_path} in slide {slide_idx} to png, skipped.') - return None - def process_table(config: ConversionConfig, shape, slide_idx) -> Union[TableElement, None]: - table = [[sum([get_text_runs(p) - for p in cell.text_frame.paragraphs], []) - for cell in row.cells] - for row in shape.table.rows] - if len(table) > 0: + table = [ + [sum([get_text_runs(p) for p in cell.text_frame.paragraphs], []) for cell in row.cells] + for row in shape.table.rows + ] + if table: return TableElement(content=table) return None @@ -203,38 +215,37 @@ def ungroup_shapes(shapes) -> List[SlideElement]: else: res.append(shape) except Exception as e: - logger.warning(f'failed to load shape {shape}, skipped. error: {e}') + logger.warning(f'Failed to load shape {shape}, skipped: {e}') return res -def process_shapes(config: ConversionConfig, current_shapes, slide_id: int) -> List[SlideElement]: +def process_shapes(config: ConversionConfig, shapes, slide_idx: int) -> List[SlideElement]: results = [] - for shape in current_shapes: + for shape in shapes: if is_title(shape): - results.append(process_title(config, shape, slide_id)) + results.append(process_title(config, shape, slide_idx)) elif is_text_block(config, shape): - results.extend(process_text_blocks(config, shape, slide_id)) + results.extend(process_text_blocks(config, shape, slide_idx)) elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE: try: - pic = process_picture(config, shape, slide_id) + pic = process_picture(config, shape, slide_idx) if pic: results.append(pic) except AttributeError as e: - logger.warning(f'Failed to process picture, skipped: {e}') + logger.warning(f"Failed to process picture: {e}") elif shape.shape_type == MSO_SHAPE_TYPE.TABLE: - table = process_table(config, shape, slide_id) + table = process_table(config, shape, slide_idx) if table: results.append(table) else: try: ph = shape.placeholder_format - if ph.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image") and getattr(shape, "image"): - pic = process_picture(config, shape, slide_id) + if ph.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image"): + pic = process_picture(config, shape, slide_idx) if pic: results.append(pic) except: pass - return results @@ -244,28 +255,23 @@ def parse(config: ConversionConfig, prs: Presentation) -> ParsedPresentation: for idx, slide in enumerate(tqdm(prs.slides, desc='Converting slides')): if config.page is not None and idx + 1 != config.page: continue - shapes = [] + try: - shapes = sorted(ungroup_shapes(slide.shapes), key=attrgetter('top', 'left')) - except: - logger.warning('Bad shapes encountered in this slide. Please check or remove them and try again.') - logger.warning('shapes:') - try: - for sp in slide.shapes: - logger.warning(sp.shape_type) - logger.warning(sp.top, sp.left, sp.width, sp.height) - except: - logger.warning('failed to print all bad shapes.') + shapes = [ + sp for sp in ungroup_shapes(slide.shapes) + if getattr(sp, "top", None) is not None and getattr(sp, "left", None) is not None + ] + shapes.sort(key=attrgetter('top', 'left')) + except Exception as e: + logger.warning(f"Failed to sort shapes on slide {idx + 1}: {e}") + shapes = [] - if not config.try_multi_column: - result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1)) - else: + if config.try_multi_column: multi_column_slide = get_multi_column_slide_if_present( - prs, slide, partial(process_shapes, config=config, slide_id=idx + 1)) - if multi_column_slide: - result_slide = multi_column_slide - else: - result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1)) + prs, slide, partial(process_shapes, config=config, slide_idx=idx + 1)) + result_slide = multi_column_slide if multi_column_slide else GeneralSlide(elements=process_shapes(config, shapes, idx + 1)) + else: + result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1)) if not config.disable_notes and slide.has_notes_slide: text = slide.notes_slide.notes_text_frame.text @@ -275,3 +281,4 @@ def parse(config: ConversionConfig, prs: Presentation) -> ParsedPresentation: result.slides.append(result_slide) return result +