Skip to content

Commit 77dc4d1

Browse files
author
Ethan Bishop
committed
Patch issue with non-breaking spaces in pdf2htmlEX.
Add command line argument to convert complex SVG images to bitmaps.
1 parent df353d6 commit 77dc4d1

5 files changed

Lines changed: 25 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* Switch base images to Ubuntu Noble (24.04 LTS).
77
* Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support.
88
* All patches are in this source tree, and are applied directly to the source of the upstream tag during build.
9+
* Patch issue with non-breaking spaces in `pdf2htmlEX`.
10+
* Convert complex SVG images to bitmaps (via `--svg-node-count-limit=100`).
911

1012
## 0.1.0
1113

src/Pdf2Html/Controllers/RootController.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public async Task<ActionResult> Post()
6363
private async Task<(bool Success, ICollection<string> logs)> ConvertAsync(string inputFile, string outputFile)
6464
{
6565
using var p = new Process();
66-
const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --decompose-ligature 1 --tounicode 1";
66+
const string conversionOptions = "--embed-javascript=0 --process-outline=0 --printing=0 --bg-format=svg --svg-node-count-limit=100 --decompose-ligature 1 --tounicode 1";
6767
p.StartInfo = new ProcessStartInfo
6868
{
6969
FileName = "pdf2htmlEX",

src/Pdf2Html/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ RUN patch ./buildScripts/versionEnvs ./patches/versionEnvs.patch
1919
RUN patch ./buildScripts/buildPoppler ./patches/buildPoppler.patch
2020
RUN patch ./buildScripts/getBuildToolsApt ./patches/getBuildToolsApt.patch
2121
RUN patch ./buildScripts/getDevLibrariesApt ./patches/getDevLibrariesApt.patch
22+
RUN patch ./pdf2htmlEX/src/util/unicode.h ./patches/unicode.h.patch
2223
RUN patch ./pdf2htmlEX/CMakeLists.txt ./patches/CMakeLists.patch
2324

2425
RUN ./buildScripts/versionEnvs
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
@@ -39,9 +39,6 @@ namespace pdf2htmlEX {
2+
* moz:
3+
* p2h: [------------------] [-] [-] [-----------------]
4+
*
5+
- * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified,
6+
- * \n and \r can break line, \t can shift text, so they are considered illegal.
7+
- *
8+
* Resources (retrieved at 2015-03-16)
9+
* * webkit
10+
* * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 )
11+
@@ -58,7 +55,7 @@ namespace pdf2htmlEX {
12+
*/
13+
inline bool is_illegal_unicode(Unicode c)
14+
{
15+
- return (c < 0x20) || (c >= 0x7F && c <= 0xA0) || (c == 0xAD)
16+
+ return (c < 0x20) || (c >= 0x7F && c < 0xA0) || (c == 0xAD)
17+
|| (c >= 0x300 && c <= 0x36f) // DCRH Combining diacriticals
18+
|| (c >= 0x1ab0 && c <= 0x1aff) // DCRH Combining diacriticals
19+
|| (c >= 0x1dc0 && c <= 0x1dff) // DCRH Combining diacriticals
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:8937481da9ecc248172cd308abec6283e0b2820ea24bf159b602c9d99cdcf9e2
3-
size 1203138
2+
oid sha256:ff65d9e1cc4864dc0db647594c33c01333faa20e0e104379b42ae2b8e9694c0a
3+
size 1086803

0 commit comments

Comments
 (0)