From 77dc4d17680ff48c803ca2165000448a8ce80204 Mon Sep 17 00:00:00 2001 From: Ethan Bishop Date: Fri, 10 Jan 2025 17:42:28 +0000 Subject: [PATCH] Patch issue with non-breaking spaces in `pdf2htmlEX`. Add command line argument to convert complex SVG images to bitmaps. --- CHANGELOG.md | 2 ++ src/Pdf2Html/Controllers/RootController.cs | 2 +- src/Pdf2Html/Dockerfile | 1 + .../pdf2htmlEX/patches/unicode.h.patch | 19 +++++++++++++++++++ tests/E2E.Tests/Resources/CS_cheat_sheet.html | 4 ++-- 5 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c83d9d..6e7236d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ * Switch base images to Ubuntu Noble (24.04 LTS). * Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support. * All patches are in this source tree, and are applied to directly to the source of the upstream tag during build. +* Patch issue with non-breaking spaces in `pdf2htmlEX`. +* Convert complex SVG images to bitmaps. 
## 0.1.0 diff --git a/src/Pdf2Html/Controllers/RootController.cs b/src/Pdf2Html/Controllers/RootController.cs index 2af0958..9c0ec6a 100644 --- a/src/Pdf2Html/Controllers/RootController.cs +++ b/src/Pdf2Html/Controllers/RootController.cs @@ -63,7 +63,7 @@ public async Task Post() private async Task<(bool Success, ICollection logs)> ConvertAsync(string inputFile, string outputFile) { using var p = new Process(); - const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --decompose-ligature 1 --tounicode 1"; + const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1"; p.StartInfo = new ProcessStartInfo { FileName = "pdf2htmlEX", diff --git a/src/Pdf2Html/Dockerfile b/src/Pdf2Html/Dockerfile index 144fb09..496c5af 100644 --- a/src/Pdf2Html/Dockerfile +++ b/src/Pdf2Html/Dockerfile @@ -19,6 +19,7 @@ RUN patch ./buildScripts/versionEnvs ./patches/versionEnvs.patch RUN patch ./buildScripts/buildPoppler ./patches/buildPoppler.patch RUN patch ./buildScripts/getBuildToolsApt ./patches/getBuildToolsApt.patch RUN patch ./buildScripts/getDevLibrariesApt ./patches/getDevLibrariesApt.patch +RUN patch ./pdf2htmlEX/src/util/unicode.h ./patches/unicode.h.patch RUN patch ./pdf2htmlEX/CMakeLists.txt ./patches/CMakeLists.patch RUN ./buildScripts/versionEnvs diff --git a/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch b/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch new file mode 100644 index 0000000..4601815 --- /dev/null +++ b/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch @@ -0,0 +1,19 @@ +@@ -39,9 +39,6 @@ namespace pdf2htmlEX { + * moz: + * p2h: [------------------] [-] [-] [-----------------] + * +- * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified, +- * \n and \r can break line, \t can shift text, so they are considered illegal. 
+- * + * Resources (retrieved at 2015-03-16) + * * webkit + * * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 ) +@@ -58,7 +55,7 @@ namespace pdf2htmlEX { + */ + inline bool is_illegal_unicode(Unicode c) + { +- return (c < 0x20) || (c >= 0x7F && c <= 0xA0) || (c == 0xAD) ++ return (c < 0x20) || (c >= 0x7F && c < 0xA0) || (c == 0xAD) + || (c >= 0x300 && c <= 0x36f) // DCRH Combining diacriticals + || (c >= 0x1ab0 && c <= 0x1aff) // DCRH Combining diacriticals + || (c >= 0x1dc0 && c <= 0x1dff) // DCRH Combining diacriticals diff --git a/tests/E2E.Tests/Resources/CS_cheat_sheet.html b/tests/E2E.Tests/Resources/CS_cheat_sheet.html index 82dd2c8..b35cd44 100644 --- a/tests/E2E.Tests/Resources/CS_cheat_sheet.html +++ b/tests/E2E.Tests/Resources/CS_cheat_sheet.html @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8937481da9ecc248172cd308abec6283e0b2820ea24bf159b602c9d99cdcf9e2 -size 1203138 +oid sha256:ff65d9e1cc4864dc0db647594c33c01333faa20e0e104379b42ae2b8e9694c0a +size 1086803