Skip to content

Commit ac3f10e

Browse files
committed
feat: Refactor text justification logic and enhance glyph width measurement for improved PDF rendering
1 parent 58e938d commit ac3f10e

2 files changed

Lines changed: 145 additions & 89 deletions

File tree

src/MiniPdf/DocxToPdfConverter.cs

Lines changed: 15 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1381,48 +1381,10 @@ private static void RenderMultiFormatRuns(RenderState state, DocxParagraph parag
13811381
var needsPerLineAlignment = false;
13821382
var isJustified = paragraph.Alignment == "both";
13831383
if (isJustified)
1384-
{
1385-
// Keep justify width calculation consistent with wrapping logic so
1386-
// run boundaries do not accumulate gaps before bold/non-bold segments.
1387-
static float JustifyEntryWidth((string Text, float X, float Y, float FontSize, PdfColor? Color, bool Bold, bool Italic, bool Underline, float CharSpacing, string? FontName, float? MaxWidth, float? UlWidth) e, bool useCalibri)
1388-
{
1389-
var w = EstimateWrapTextWidth(e.Text, e.FontSize, e.Bold, e.CharSpacing, useCalibri);
1390-
if (useCalibri && e.FontName != null
1391-
&& !e.FontName.Contains("Calibri", StringComparison.OrdinalIgnoreCase))
1392-
{
1393-
w *= e.Bold ? 1.12f : 1.04f;
1394-
}
1395-
return w;
1396-
}
1397-
1398-
var useCalibriJustify = state.Options.UseCalibriWidths;
1399-
float totalTextWidth = 0;
1400-
int totalSpaces = 0;
1401-
foreach (var e in lineEntries)
1402-
{
1403-
totalTextWidth += JustifyEntryWidth(e, useCalibriJustify);
1404-
totalSpaces += e.Text.Count(c => c == ' ');
1405-
}
1406-
float justifyWordSpacing = 0;
1407-
if (!isLastLine && totalSpaces > 0)
1408-
{
1409-
var extraSpace = lw - totalTextWidth;
1410-
if (extraSpace > 0)
1411-
justifyWordSpacing = extraSpace / totalSpaces;
1412-
}
1413-
1414-
float entryX = lineEntries[0].X;
1415-
foreach (var e in lineEntries)
1416-
{
1417-
state.CurrentPage!.AddText(e.Text, entryX, e.Y, e.FontSize, e.Color,
1418-
bold: e.Bold, italic: e.Italic, underline: e.Underline, charSpacing: e.CharSpacing,
1419-
wordSpacing: justifyWordSpacing,
1420-
preferredFontName: e.FontName, maxWidth: null, underlineWidth: e.UlWidth);
1421-
entryX += JustifyEntryWidth(e, useCalibriJustify)
1422-
+ justifyWordSpacing * e.Text.Count(c => c == ' ');
1423-
}
1424-
}
1425-
else if (paragraph.Alignment is "center" or "right")
1384+
{
1385+
needsPerLineAlignment = true;
1386+
}
1387+
else if (paragraph.Alignment is "center" or "right")
14261388
{
14271389
var totalWidth = 0f;
14281390
string? prevRunTextPre = null;
@@ -1484,15 +1446,17 @@ void FlushLineEntries(bool isLastLine = false)
14841446

14851447
if (isJustified)
14861448
{
1487-
// Keep justify width calculation consistent with wrapping logic so
1488-
// run boundaries do not accumulate gaps before bold/non-bold segments.
1489-
static float JustifyEntryWidth((string Text, float X, float Y, float FontSize, PdfColor? Color, bool Bold, bool Italic, bool Underline, float CharSpacing, string? FontName, float? MaxWidth, float? UlWidth) e, bool useCalibri)
1449+
// Compute word spacing from Calibri-estimated widths (same as wrapping).
1450+
// Each entry gets maxWidth so PdfWriter can Tz-compress any run whose
1451+
// embedded-font width exceeds its allocated width (Tz is clamped to 100%,
1452+
// so narrower runs are not stretched), e.g. at regular→bold transitions.
1453+
static float WrapEntryWidth((string Text, float X, float Y, float FontSize, PdfColor? Color, bool Bold, bool Italic, bool Underline, float CharSpacing, string? FontName, float? MaxWidth, float? UlWidth) e, bool useCalibri)
14901454
{
14911455
var w = EstimateWrapTextWidth(e.Text, e.FontSize, e.Bold, e.CharSpacing, useCalibri);
14921456
if (useCalibri && e.FontName != null
14931457
&& !e.FontName.Contains("Calibri", StringComparison.OrdinalIgnoreCase))
14941458
{
1495-
w *= e.Bold ? 1.12f : 1.04f;
1459+
w *= e.Bold ? 1.06f : 1.00f;
14961460
}
14971461
return w;
14981462
}
@@ -1502,7 +1466,7 @@ static float JustifyEntryWidth((string Text, float X, float Y, float FontSize, P
15021466
int totalSpaces = 0;
15031467
foreach (var e in lineEntries)
15041468
{
1505-
totalTextWidth += JustifyEntryWidth(e, useCalibriJustify);
1469+
totalTextWidth += WrapEntryWidth(e, useCalibriJustify);
15061470
totalSpaces += e.Text.Count(c => c == ' ');
15071471
}
15081472
float justifyWordSpacing = 0;
@@ -1516,11 +1480,12 @@ static float JustifyEntryWidth((string Text, float X, float Y, float FontSize, P
15161480
float entryX = lineEntries[0].X;
15171481
foreach (var e in lineEntries)
15181482
{
1483+
var estW = WrapEntryWidth(e, useCalibriJustify);
15191484
state.CurrentPage!.AddText(e.Text, entryX, e.Y, e.FontSize, e.Color,
15201485
bold: e.Bold, italic: e.Italic, underline: e.Underline, charSpacing: e.CharSpacing,
15211486
wordSpacing: justifyWordSpacing,
1522-
preferredFontName: e.FontName, maxWidth: null, underlineWidth: e.UlWidth);
1523-
entryX += JustifyEntryWidth(e, useCalibriJustify)
1487+
preferredFontName: e.FontName, maxWidth: estW, underlineWidth: e.UlWidth);
1488+
entryX += estW
15241489
+ justifyWordSpacing * e.Text.Count(c => c == ' ');
15251490
}
15261491
}
@@ -1679,7 +1644,7 @@ static float JustifyEntryWidth((string Text, float X, float Y, float FontSize, P
16791644
var nonCalibriWidthFactor = 1f;
16801645
if (useCalibri && run.FontName != null
16811646
&& !run.FontName.Contains("Calibri", StringComparison.OrdinalIgnoreCase))
1682-
nonCalibriWidthFactor = run.Bold ? 1.12f : 1.04f;
1647+
nonCalibriWidthFactor = run.Bold ? 1.06f : 1.00f;
16831648

16841649
for (var wi = 0; wi < words.Length; wi++)
16851650
{

src/MiniPdf/PdfWriter.cs

Lines changed: 130 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ private sealed class EmbeddedFontInfo
3434
public int[] Bbox = [-166, -225, 1000, 931];
3535
/// <summary>Maps Unicode code point → CID. BMP chars use identity; non-BMP use PUA slots.</summary>
3636
public Dictionary<int, int> CpToCid = new();
37+
/// <summary>Glyph advance widths indexed by glyph ID (from hmtx table).</summary>
38+
public ushort[] Advances = [];
39+
/// <summary>Maps Unicode code point → glyph ID (from cmap table).</summary>
40+
public Dictionary<int, ushort> Cmap = new();
41+
/// <summary>Font units per em (from head table).</summary>
42+
public int UnitsPerEm = 1000;
3743
// PDF object numbers (assigned during Write)
3844
public int ToUnicodeObj, DescriptorObj, CidFontObj, Type0Obj, FontFileObj, CidToGidObj;
3945
}
@@ -420,6 +426,9 @@ void LoadFontFile(string path)
420426
CapHeight = (int)(capH * scale),
421427
Bbox = [.. bbox.Select(v => (int)(v * scale))],
422428
CpToCid = cpToCid,
429+
Advances = advances,
430+
Cmap = cmap,
431+
UnitsPerEm = upm,
423432
});
424433
}
425434

@@ -992,25 +1001,74 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
9921001
sb.Append("\n");
9931002
sb.Append("2 Tr\n"); // rendering mode: fill + stroke
9941003
}
995-
// Apply word spacing (Tw) for justified text
996-
if (block.WordSpacing != 0)
997-
sb.Append($"{block.WordSpacing.ToString("F2", CultureInfo.InvariantCulture)} Tw\n");
998-
// Always set character spacing to prevent Tc from previous
999-
// text blocks leaking through the graphics state.
1004+
// Determine the block's preferred font slot for font-aware
1005+
// width computation and Tz scaling.
1006+
var blockPrefSlot = -1;
1007+
if (fontNameToSlot != null && !string.IsNullOrWhiteSpace(block.PreferredFontName))
1008+
{
1009+
if (hasBoldItalicFontVariant)
1010+
fontNameToSlot.TryGetValue(boldItalicFontKey!, out blockPrefSlot);
1011+
if (blockPrefSlot < 0 && hasBoldFontVariant)
1012+
fontNameToSlot.TryGetValue(boldFontKey!, out blockPrefSlot);
1013+
if (blockPrefSlot < 0 && hasItalicFontVariant)
1014+
fontNameToSlot.TryGetValue(italicFontKey!, out blockPrefSlot);
1015+
if (blockPrefSlot < 0)
1016+
fontNameToSlot.TryGetValue(block.PreferredFontName!, out blockPrefSlot);
1017+
}
1018+
1019+
// For CID/Identity-H fonts, Tw (word spacing) does NOT work —
1020+
// the PDF spec applies Tw only to single-byte 0x20.
1021+
// Instead: use Tz to correct glyph width (actual vs layout estimate),
1022+
// and TJ displacement values to add word spacing at space boundaries.
1023+
// TJ displacements are scaled by Tz/100, so we compensate for that.
1024+
var wordSpacingTJ = 0; // for CID path only
1025+
var cidTzPercent = 100.0;
1026+
{
1027+
EmbeddedFontInfo? efForCid = null;
1028+
if (blockPrefSlot >= 0 && embeddedFonts != null && blockPrefSlot < embeddedFonts.Count)
1029+
efForCid = embeddedFonts[blockPrefSlot];
1030+
if (efForCid != null && block.MaxWidth.HasValue)
1031+
{
1032+
var actualGlyphWidth = MeasureEmbeddedFontWidth(block.Text, block.FontSize, efForCid);
1033+
if (actualGlyphWidth > 0)
1034+
cidTzPercent = (double)block.MaxWidth.Value / actualGlyphWidth * 100.0;
1035+
// Clamp: only compress, never expand beyond 100%
1036+
if (cidTzPercent > 100.0) cidTzPercent = 100.0;
1037+
}
1038+
if (block.WordSpacing > 0)
1039+
{
1040+
// TJ displacement = -(ws / fontSize / (Tz/100)) * 1000
1041+
// because PDF applies × Tz/100 to TJ values in text space
1042+
var tzFactor = cidTzPercent / 100.0;
1043+
wordSpacingTJ = -(int)Math.Round((double)block.WordSpacing / block.FontSize / tzFactor * 1000.0);
1044+
}
1045+
}
1046+
// Don't emit Tw for CID path (handled by TJ). Tc is still needed.
10001047
sb.Append($"{block.CharSpacing.ToString("F2", CultureInfo.InvariantCulture)} Tc\n");
1001-
// Apply horizontal scaling if text overflows MaxWidth;
1002-
// always reset Tz to prevent scaling from previous blocks leaking.
1048+
// Tz: for CID path, use computed cidTzPercent.
1049+
// For WinAnsi fallback with no embedded font, keep compress-only Tz.
10031050
if (block.MaxWidth.HasValue)
10041051
{
1005-
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize, block.CharSpacing, bold: block.Bold);
1006-
if (naturalWidth > block.MaxWidth.Value && naturalWidth > 0)
1052+
EmbeddedFontInfo? efForTz = null;
1053+
if (blockPrefSlot >= 0 && embeddedFonts != null && blockPrefSlot < embeddedFonts.Count)
1054+
efForTz = embeddedFonts[blockPrefSlot];
1055+
if (efForTz != null)
10071056
{
1008-
var tzPercent = (block.MaxWidth.Value / naturalWidth) * 100.0;
1009-
sb.Append($"{tzPercent.ToString("F1", CultureInfo.InvariantCulture)} Tz\n");
1057+
sb.Append($"{cidTzPercent.ToString("F1", CultureInfo.InvariantCulture)} Tz\n");
10101058
}
10111059
else
10121060
{
1013-
sb.Append("100.0 Tz\n");
1061+
// Fallback: Helvetica metrics, compress-only Tz
1062+
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize, block.CharSpacing, bold: block.Bold);
1063+
if (naturalWidth > block.MaxWidth.Value && naturalWidth > 0)
1064+
{
1065+
var tzPercent = (block.MaxWidth.Value / naturalWidth) * 100.0;
1066+
sb.Append($"{tzPercent.ToString("F1", CultureInfo.InvariantCulture)} Tz\n");
1067+
}
1068+
else
1069+
{
1070+
sb.Append("100.0 Tz\n");
1071+
}
10141072
}
10151073
}
10161074
else
@@ -1021,23 +1079,8 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
10211079

10221080
// Split text into runs by font slot. Default all chars to slot 0 (F2).
10231081
var codePoints = ShapeArabicCodePoints(EnumerateCodePoints(block.Text).ToList());
1024-
// Per-block font preference: if the block specifies a preferred font,
1025-
// try to use that font's slot for each codepoint (if the font includes it).
1026-
var blockPrefSlot = -1;
1027-
if (fontNameToSlot != null && !string.IsNullOrWhiteSpace(block.PreferredFontName))
1028-
{
1029-
// Use the bold italic font variant slot if available (highest priority).
1030-
if (hasBoldItalicFontVariant)
1031-
fontNameToSlot.TryGetValue(boldItalicFontKey!, out blockPrefSlot);
1032-
// Use the bold font variant slot if available; otherwise fall back to regular.
1033-
if (blockPrefSlot < 0 && hasBoldFontVariant)
1034-
fontNameToSlot.TryGetValue(boldFontKey!, out blockPrefSlot);
1035-
// Use the italic font variant slot if available.
1036-
if (blockPrefSlot < 0 && hasItalicFontVariant)
1037-
fontNameToSlot.TryGetValue(italicFontKey!, out blockPrefSlot);
1038-
if (blockPrefSlot < 0)
1039-
fontNameToSlot.TryGetValue(block.PreferredFontName!, out blockPrefSlot);
1040-
}
1082+
// Per-block font preference: blockPrefSlot was already determined
1083+
// above for Tz computation. Re-use it for run assignment.
10411084
var runs = new List<(int fontSlot, List<int> cps)>();
10421085
foreach (var cp in codePoints)
10431086
{
@@ -1062,20 +1105,50 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
10621105
{
10631106
var fontName = $"F{run.fontSlot + 2}";
10641107
sb.Append($"/{fontName} {fontSize} Tf\n");
1065-
sb.Append('<');
1066-
foreach (var cp in run.cps)
1108+
// Use TJ (array form) to insert word spacing at space boundaries.
1109+
// Tw doesn't work for CID/Identity-H fonts, so we use TJ
1110+
// displacement values to add spacing after each space character.
1111+
if (wordSpacingTJ != 0)
1112+
{
1113+
sb.Append('[');
1114+
sb.Append('<');
1115+
foreach (var cp in run.cps)
1116+
{
1117+
var cid = cp;
1118+
if (embeddedFonts != null && run.fontSlot < embeddedFonts.Count)
1119+
{
1120+
var ef = embeddedFonts[run.fontSlot];
1121+
if (ef.CpToCid.TryGetValue(cp, out var mapped))
1122+
cid = mapped;
1123+
}
1124+
sb.Append(cid.ToString("X4"));
1125+
// Insert TJ displacement after space characters
1126+
if (cp == ' ')
1127+
{
1128+
sb.Append('>');
1129+
sb.Append(wordSpacingTJ.ToString(CultureInfo.InvariantCulture));
1130+
sb.Append('<');
1131+
}
1132+
}
1133+
sb.Append(">] TJ\n");
1134+
}
1135+
else
10671136
{
1068-
// Map code point to CID via the font's CpToCid table
1069-
var cid = cp;
1070-
if (embeddedFonts != null && run.fontSlot < embeddedFonts.Count)
1137+
// No word spacing — use simple Tj
1138+
sb.Append('<');
1139+
foreach (var cp in run.cps)
10711140
{
1072-
var ef = embeddedFonts[run.fontSlot];
1073-
if (ef.CpToCid.TryGetValue(cp, out var mapped))
1074-
cid = mapped;
1141+
var cid = cp;
1142+
if (embeddedFonts != null && run.fontSlot < embeddedFonts.Count)
1143+
{
1144+
var ef = embeddedFonts[run.fontSlot];
1145+
if (ef.CpToCid.TryGetValue(cp, out var mapped))
1146+
cid = mapped;
1147+
}
1148+
sb.Append(cid.ToString("X4"));
10751149
}
1076-
sb.Append(cid.ToString("X4"));
1150+
sb.Append("> Tj\n");
10771151
}
1078-
sb.Append("> Tj\n");
10791152
}
10801153

10811154
if (block.Bold)
@@ -1110,6 +1183,24 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
11101183
return sb.ToString();
11111184
}
11121185

1186+
/// <summary>
/// Measures text width using an embedded font's actual glyph advance widths.
/// Returns the width in points for the given font size, excluding Tc/Tw contributions.
/// </summary>
/// <remarks>
/// The text is walked by Unicode code point rather than by UTF-16 code unit,
/// because <c>ef.Cmap</c> is keyed by code point: a valid surrogate pair is
/// looked up and counted once. A code point with no cmap entry, or whose glyph
/// ID lies outside <c>ef.Advances</c>, is counted as half an em.
/// </remarks>
/// <param name="text">Text to measure; <c>null</c> or empty text measures as 0.</param>
/// <param name="fontSize">Font size in points.</param>
/// <param name="ef">Embedded font supplying the cmap, the advance widths and the units-per-em value.</param>
/// <returns>
/// The width in points, or 0 when the text is empty or the font reports a
/// non-positive units-per-em value.
/// </returns>
private static double MeasureEmbeddedFontWidth(string text, float fontSize, EmbeddedFontInfo ef)
{
    // Returning 0 (instead of NaN/Infinity from a division by zero) lets a
    // caller that guards on "width > 0" skip its scaling computation.
    if (string.IsNullOrEmpty(text) || ef.UnitsPerEm <= 0)
        return 0;

    // Floating-point half em so an odd units-per-em value is not truncated.
    var halfEm = ef.UnitsPerEm / 2.0;
    double total = 0;
    for (var i = 0; i < text.Length; i++)
    {
        int cp = text[i];
        // Combine a well-formed surrogate pair into one code point; a lone
        // surrogate is kept as-is and ends up on the half-em fallback.
        if (char.IsHighSurrogate(text[i]) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
        {
            cp = char.ConvertToUtf32(text[i], text[i + 1]);
            i++;
        }

        if (ef.Cmap.TryGetValue(cp, out var gid) && gid < ef.Advances.Length)
            total += ef.Advances[gid];
        else
            total += halfEm; // fallback: half em
    }

    // Convert font design units to points at the requested size.
    return total * fontSize / ef.UnitsPerEm;
}
1203+
11131204
/// <summary>
11141205
/// Measures the natural rendering width of text in Helvetica at the given font size.
11151206
/// Uses the standard Helvetica character width table.

0 commit comments

Comments
 (0)