Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ You need to have _[Python](https://www.python.org/)_ with version later than __3
pip install pptx2md
```

### Optional Dependencies

- `inkscape` (for converting WMF images to SVG; the `inkscape` executable must be available on your `PATH`)


### Usage

Once you have installed it, use the command `pptx2md [pptx filename]` to convert _pptx file_ into markdown.
Expand Down
69 changes: 51 additions & 18 deletions pptx2md/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,13 @@
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import subprocess
import sys
import re
from pathlib import Path

from pptx2md.entry import convert
from pptx2md.log import setup_logging
from pptx2md.types import ConversionConfig
Expand All @@ -34,11 +30,9 @@ def parse_args() -> ConversionConfig:
arg_parser.add_argument('--disable-image', action="store_true", help='disable image extraction')
arg_parser.add_argument('--disable-wmf',
action="store_true",
help='keep wmf formatted image untouched(avoid exceptions under linux)')
help='keep wmf formatted image untouched (avoid exceptions under linux)')
arg_parser.add_argument('--disable-color', action="store_true", help='do not add color HTML tags')
arg_parser.add_argument('--disable-escaping',
action="store_true",
help='do not attempt to escape special characters')
arg_parser.add_argument('--disable-escaping', action="store_true", help='do not escape special characters')
arg_parser.add_argument('--disable-notes', action="store_true", help='do not add presenter notes')
arg_parser.add_argument('--enable-slides', action="store_true", help='deliniate slides `\n---\n`')
arg_parser.add_argument('--try-multi-column', action="store_true", help='try to detect multi-column slides')
Expand All @@ -48,18 +42,15 @@ def parse_args() -> ConversionConfig:
arg_parser.add_argument('--min-block-size',
type=int,
default=15,
help='the minimum character number of a text block to be converted')
help='minimum characters per text block')
arg_parser.add_argument("--page", type=int, default=None, help="only convert the specified page")
arg_parser.add_argument(
"--keep-similar-titles",
action="store_true",
help="keep similar titles (allow for repeated slide titles - One or more - Add (cont.) to the title)")
arg_parser.add_argument("--keep-similar-titles", action="store_true",
help="allow repeated slide titles (append '(cont.)')")

args = arg_parser.parse_args()

# Determine output path if not specified
extension = '.tid' if args.wiki else '.qmd' if args.qmd else '.md'
if args.output is None:
extension = '.tid' if args.wiki else '.qmd' if args.qmd else '.md'
args.output = Path(f'out{extension}')

return ConversionConfig(
Expand All @@ -84,10 +75,52 @@ def parse_args() -> ConversionConfig:
)


def convert_wmf_to_svg(imgpath: Path):
    """Convert every ``*.wmf`` file directly inside *imgpath* to SVG with Inkscape.

    Each ``name.wmf`` is exported next to itself as ``name.svg`` by running the
    ``inkscape`` command-line tool. Progress and failures are reported on
    stdout; the function never raises because of a failed conversion.

    Args:
        imgpath: Directory to scan (non-recursively) for WMF files. ``None``
            is tolerated and treated as "nothing to convert".

    Returns:
        None.
    """
    if imgpath is None:
        return

    for wmf in sorted(Path(imgpath).glob("*.wmf")):
        svg = wmf.with_suffix(".svg")
        command = [
            "inkscape", str(wmf),
            "--export-type=svg",
            "--export-filename", str(svg),
            "--export-plain-svg",
        ]
        try:
            subprocess.run(command, check=True)
        except FileNotFoundError:
            # Inkscape is an optional dependency: when the executable is not
            # installed there is no point retrying for the remaining files.
            print("[WARN] inkscape executable not found; skipping WMF to SVG conversion")
            return
        except (subprocess.CalledProcessError, OSError) as e:
            # A single bad image (or a launch error) must not abort the rest.
            print(f"[WARN] Failed to convert {wmf.name} to SVG: {e}")
        else:
            print(f"[INFO] Converted {wmf.name} → {svg.name}")


def inject_svg_includes(md_file: Path, imgpath: Path, base_name: str):
    """Insert ``\\includesvg`` lines into *md_file* after matching slide headings.

    The file is scanned line by line; every line starting with ``"# "`` bumps
    a 1-based counter. If ``<imgpath>/<base_name>_<counter>.svg`` exists, a
    blank line, an HTML comment, the ``\\includesvg`` directive and another
    blank line are inserted right after that heading.

    The function is idempotent: an include line already present anywhere in
    the file is not inserted again, and a trailing newline in the original
    file is preserved. The file is only rewritten when its content changes.

    Args:
        md_file: Text file to update in place (read and written as UTF-8).
        imgpath: Directory that holds the SVG files; only its last path
            component is used inside the emitted include path.
        base_name: Filename prefix of the SVG files.

    Returns:
        None.
    """
    text = md_file.read_text(encoding="utf-8")
    lines = text.splitlines()

    # Everything already in the file counts as "seen", so re-running the
    # function does not stack a second copy of the same include block.
    seen = set(lines)
    new_lines = []
    slide_counter = 0

    for line in lines:
        new_lines.append(line)
        if not line.startswith("# "):
            continue

        slide_counter += 1
        svg_file = imgpath / f"{base_name}_{slide_counter}.svg"
        if not svg_file.exists():
            continue

        include_line = fr"\includesvg[width=0.9\linewidth]{{./{imgpath.name}/{svg_file.stem}}}"
        if include_line in seen:
            continue

        seen.add(include_line)
        new_lines.extend([
            "",
            f"<!-- Converted WMF image from Slide {slide_counter} -->",
            include_line,
            "",
        ])

    result = "\n".join(new_lines)
    # splitlines() discards the final line terminator; put it back so the
    # file does not silently lose its trailing newline.
    if text.endswith("\n"):
        result += "\n"

    if result != text:
        md_file.write_text(result, encoding="utf-8")


def main():
    """Command-line entry point: parse arguments, convert, then post-process WMF.

    After the conversion has run, and unless ``disable_wmf`` is set on the
    parsed configuration, WMF files in the image directory are converted to
    SVG and include lines for them are injected into the output file.
    """
    config = parse_args()
    convert(config)

    # WMF post-processing was explicitly switched off: nothing more to do.
    if config.disable_wmf:
        return

    image_dir = config.image_dir
    convert_wmf_to_svg(image_dir)
    inject_svg_includes(config.output_path, image_dir, config.pptx_path.stem)


if __name__ == '__main__':
main()

143 changes: 75 additions & 68 deletions pptx2md/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import logging
import os
import subprocess
from functools import partial
from operator import attrgetter
from typing import List, Union
Expand Down Expand Up @@ -43,15 +42,16 @@
)

logger = logging.getLogger(__name__)

picture_count = 0


def is_title(shape):
if shape.is_placeholder and (shape.placeholder_format.type == PP_PLACEHOLDER.TITLE or
shape.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE or
shape.placeholder_format.type == PP_PLACEHOLDER.VERTICAL_TITLE or
shape.placeholder_format.type == PP_PLACEHOLDER.CENTER_TITLE):
if shape.is_placeholder and shape.placeholder_format.type in {
PP_PLACEHOLDER.TITLE,
PP_PLACEHOLDER.SUBTITLE,
PP_PLACEHOLDER.VERTICAL_TITLE,
PP_PLACEHOLDER.CENTER_TITLE,
}:
return True
return False

Expand All @@ -77,17 +77,25 @@ def is_list_block(shape) -> bool:

def is_accent(font):
if font.underline or font.italic or (
font.color.type == MSO_COLOR_TYPE.SCHEME and
(font.color.theme_color == MSO_THEME_COLOR.ACCENT_1 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_2 or
font.color.theme_color == MSO_THEME_COLOR.ACCENT_3 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_4 or
font.color.theme_color == MSO_THEME_COLOR.ACCENT_5 or font.color.theme_color == MSO_THEME_COLOR.ACCENT_6)):
font.color.type == MSO_COLOR_TYPE.SCHEME
and font.color.theme_color in {
MSO_THEME_COLOR.ACCENT_1,
MSO_THEME_COLOR.ACCENT_2,
MSO_THEME_COLOR.ACCENT_3,
MSO_THEME_COLOR.ACCENT_4,
MSO_THEME_COLOR.ACCENT_5,
MSO_THEME_COLOR.ACCENT_6,
}
):
return True
return False


def is_strong(font):
if font.bold or (font.color.type == MSO_COLOR_TYPE.SCHEME and (font.color.theme_color == MSO_THEME_COLOR.DARK_1 or
font.color.theme_color == MSO_THEME_COLOR.DARK_2)):
if font.bold or (
font.color.type == MSO_COLOR_TYPE.SCHEME
and font.color.theme_color in {MSO_THEME_COLOR.DARK_1, MSO_THEME_COLOR.DARK_2}
):
return True
return False

Expand Down Expand Up @@ -135,7 +143,6 @@ def process_text_blocks(config: ConversionConfig, shape, slide_idx) -> List[Unio
text = get_text_runs(para)
results.append(ListItemElement(content=text, level=para.level))
else:
# paragraph block
for para in shape.text_frame.paragraphs:
if para.text.strip() == '':
continue
Expand All @@ -144,12 +151,24 @@ def process_text_blocks(config: ConversionConfig, shape, slide_idx) -> List[Unio
return results


def convert_wmf_to_svg(wmf_path: str, svg_path: str) -> bool:
    """Convert a single WMF file to SVG by invoking the ``inkscape`` CLI.

    Args:
        wmf_path: Path of the WMF file to read.
        svg_path: Path the SVG file should be written to.

    Returns:
        True when Inkscape exited successfully and *svg_path* exists
        afterwards; False otherwise. Failures are logged as warnings and
        never raised to the caller.
    """
    command = ['inkscape', str(wmf_path), '--export-type=svg', f'--export-filename={svg_path}']
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Output is captured, so surface Inkscape's own error text; without
        # it the warning only carries the exit status.
        stderr = e.stderr.decode(errors='replace').strip() if e.stderr else ''
        detail = f"{e}: {stderr}" if stderr else f"{e}"
        logger.warning(f"Failed to convert {wmf_path} to SVG: {detail}")
        return False
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError covers a missing or non-executable inkscape binary;
        # ValueError covers malformed arguments such as embedded NUL bytes.
        logger.warning(f"Failed to convert {wmf_path} to SVG: {e}")
        return False

    # A zero exit status alone does not prove an output file was written.
    if not os.path.isfile(svg_path):
        logger.warning(f"Failed to convert {wmf_path} to SVG: no output file was produced")
        return False
    return True


def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageElement, None]:
if config.disable_image:
return None

global picture_count

file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1])
pic_name = file_prefix + f'_{picture_count}'
pic_ext = shape.image.ext
Expand All @@ -159,37 +178,30 @@ def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageEl
output_path = config.image_dir / f'{pic_name}.{pic_ext}'
common_path = os.path.commonpath([config.output_path, config.image_dir])
img_outputter_path = os.path.relpath(output_path, common_path)

with open(output_path, 'wb') as f:
f.write(shape.image.blob)
picture_count += 1

# normal images
if pic_ext != 'wmf':
if pic_ext == 'wmf':
svg_path = config.image_dir / f'{pic_name}.svg'
if convert_wmf_to_svg(str(output_path), str(svg_path)):
logger.info(f"Converted WMF {output_path} to SVG {svg_path}")
img_outputter_path = os.path.relpath(svg_path, common_path)
return ImageElement(path=img_outputter_path, width=config.image_width)
else:
logger.warning(f"Failed to convert WMF {output_path}, skipped.")
return None
else:
return ImageElement(path=img_outputter_path, width=config.image_width)

# wmf images, try to convert, if failed, output as original
try:
try:
Image.open(output_path).save(os.path.splitext(output_path)[0] + '.png')
return ImageElement(path=os.path.splitext(img_outputter_path)[0] + '.png', width=config.image_width)
except Exception: # Image failed, try another
from wand.image import Image
with Image(filename=output_path) as img:
img.format = 'png'
img.save(filename=os.path.splitext(output_path)[0] + '.png')
logger.info(f'Image {output_path} in slide {slide_idx} converted to png.')
return ImageElement(path=os.path.splitext(img_outputter_path)[0] + '.png', width=config.image_width)
except Exception:
logger.warning(f'Cannot convert wmf image {output_path} in slide {slide_idx} to png, skipped.')
return None


def process_table(config: ConversionConfig, shape, slide_idx) -> Union[TableElement, None]:
table = [[sum([get_text_runs(p)
for p in cell.text_frame.paragraphs], [])
for cell in row.cells]
for row in shape.table.rows]
if len(table) > 0:
table = [
[sum([get_text_runs(p) for p in cell.text_frame.paragraphs], []) for cell in row.cells]
for row in shape.table.rows
]
if table:
return TableElement(content=table)
return None

Expand All @@ -203,38 +215,37 @@ def ungroup_shapes(shapes) -> List[SlideElement]:
else:
res.append(shape)
except Exception as e:
logger.warning(f'failed to load shape {shape}, skipped. error: {e}')
logger.warning(f'Failed to load shape {shape}, skipped: {e}')
return res


def process_shapes(config: ConversionConfig, current_shapes, slide_id: int) -> List[SlideElement]:
def process_shapes(config: ConversionConfig, shapes, slide_idx: int) -> List[SlideElement]:
results = []
for shape in current_shapes:
for shape in shapes:
if is_title(shape):
results.append(process_title(config, shape, slide_id))
results.append(process_title(config, shape, slide_idx))
elif is_text_block(config, shape):
results.extend(process_text_blocks(config, shape, slide_id))
results.extend(process_text_blocks(config, shape, slide_idx))
elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
try:
pic = process_picture(config, shape, slide_id)
pic = process_picture(config, shape, slide_idx)
if pic:
results.append(pic)
except AttributeError as e:
logger.warning(f'Failed to process picture, skipped: {e}')
logger.warning(f"Failed to process picture: {e}")
elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
table = process_table(config, shape, slide_id)
table = process_table(config, shape, slide_idx)
if table:
results.append(table)
else:
try:
ph = shape.placeholder_format
if ph.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image") and getattr(shape, "image"):
pic = process_picture(config, shape, slide_id)
if ph.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image"):
pic = process_picture(config, shape, slide_idx)
if pic:
results.append(pic)
except:
pass

return results


Expand All @@ -244,28 +255,23 @@ def parse(config: ConversionConfig, prs: Presentation) -> ParsedPresentation:
for idx, slide in enumerate(tqdm(prs.slides, desc='Converting slides')):
if config.page is not None and idx + 1 != config.page:
continue
shapes = []

try:
shapes = sorted(ungroup_shapes(slide.shapes), key=attrgetter('top', 'left'))
except:
logger.warning('Bad shapes encountered in this slide. Please check or remove them and try again.')
logger.warning('shapes:')
try:
for sp in slide.shapes:
logger.warning(sp.shape_type)
logger.warning(sp.top, sp.left, sp.width, sp.height)
except:
logger.warning('failed to print all bad shapes.')
shapes = [
sp for sp in ungroup_shapes(slide.shapes)
if getattr(sp, "top", None) is not None and getattr(sp, "left", None) is not None
]
shapes.sort(key=attrgetter('top', 'left'))
except Exception as e:
logger.warning(f"Failed to sort shapes on slide {idx + 1}: {e}")
shapes = []

if not config.try_multi_column:
result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1))
else:
if config.try_multi_column:
multi_column_slide = get_multi_column_slide_if_present(
prs, slide, partial(process_shapes, config=config, slide_id=idx + 1))
if multi_column_slide:
result_slide = multi_column_slide
else:
result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1))
prs, slide, partial(process_shapes, config=config, slide_idx=idx + 1))
result_slide = multi_column_slide if multi_column_slide else GeneralSlide(elements=process_shapes(config, shapes, idx + 1))
else:
result_slide = GeneralSlide(elements=process_shapes(config, shapes, idx + 1))

if not config.disable_notes and slide.has_notes_slide:
text = slide.notes_slide.notes_text_frame.text
Expand All @@ -275,3 +281,4 @@ def parse(config: ConversionConfig, prs: Presentation) -> ParsedPresentation:
result.slides.append(result_slide)

return result