Skip to content

Commit 8813f83

Browse files
committed
preserve & unescaped in a/@href: metanorma/metanorma-pdfa#21
1 parent 924c5c4 commit 8813f83

2 files changed

Lines changed: 48 additions & 1 deletion

File tree

lib/html2doc/xml.rb

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,49 @@ def to_xhtml(xml)
1515
end
1616
xml = xml.gsub(/<!--\s*\[([^\<\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
1717
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
18+
# Escape & to &amp; in href attributes before XML parsing to prevent stripping
19+
xml = escape_amp_in_hrefs(xml)
1820
Nokogiri::XML.parse(xml)
1921
end
2022

23+
# Escape plain & to &amp; in href attributes
24+
# This prevents Nokogiri from stripping invalid HTML entities during XML parsing
25+
# @param html [String] markup whose href values may contain raw ampersands
# @return [String] copy of +html+ in which every & inside a quoted href value
#   is written as &amp; -- an & that already begins an entity is escaped too,
#   so "&amp;" becomes "&amp;amp;"
def escape_amp_in_hrefs(html)
  html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
    found = Regexp.last_match
    # Groups 1-2 are filled by the double-quoted alternative,
    # groups 3-4 by the single-quoted one.
    opener, value, closer =
      found[1] ? [found[1], found[2], '"'] : [found[3], found[4], "'"]
    "#{opener}#{value.gsub('&', '&amp;')}#{closer}"
  end
end
35+
2136
DOCTYPE = <<~DOCTYPE.freeze
2237
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2338
DOCTYPE
2439

2540
# Serialises a parsed document back to HTML text.
#
# @param xml [#to_xml] parsed document; only +to_xml+ is called on it
# @return [String] the serialised markup with the XHTML namespace declaration
#   and DOCTYPE removed, self-closing tags tightened, MSWORD-COMMENT
#   placeholders turned back into conditional-comment markers, and &amp;
#   inside href values turned back into &
def from_xhtml(xml)
  markup = xml.to_xml
  # Only the first namespace declaration and the first DOCTYPE are dropped.
  markup = markup.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
  markup = markup.sub(DOCTYPE, "")
  markup = markup.gsub(%{ />}, "/>")
  # Turn the placeholder comments back into conditional-comment syntax.
  markup = markup.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
  markup = markup.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
  # A comment terminator that was serialised as text on its own line.
  markup = markup.gsub("\n--&gt;\n", "\n-->\n")
  # Unescape &amp; to & in href attributes for proper URL handling
  unescape_amp_in_hrefs(markup)
end
49+
50+
# Unescape &amp; to & in href attributes only
51+
# This ensures URLs work correctly in Word while preserving &amp; in text
52+
# @param html [String] serialised markup whose href values carry &amp;
# @return [String] copy of +html+ in which each &amp; inside a quoted href
#   value is written as a bare &; text outside href values is left alone
def unescape_amp_in_hrefs(html)
  html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
    hit = Regexp.last_match
    # Double-quoted hrefs populate groups 1-2; single-quoted ones groups 3-4.
    lead, target, quote =
      hit[1] ? [hit[1], hit[2], '"'] : [hit[3], hit[4], "'"]
    "#{lead}#{target.gsub('&amp;', '&')}#{quote}"
  end
end
3262

3363
def msword_fix(doc)

spec/html2doc_spec.rb

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,23 @@ def image_clean(xml)
864864
# expect{ Html2Doc.process(html_input(simple_body), filename: "test") }
865865
# .to output("https://example.com/19160-6.svg: SVG not supported\n").to_stderr
866866
# end
867+
868+
it "processes ampersands in href" do
# Regression example for metanorma/metanorma-pdfa#21: the input carries a bare
# & inside an href value and &amp; in text content; the expected output below
# keeps the href's & bare and the text's &amp; escaped.
869+
simple_body = '<div><p class="MsoNormal">a &amp; b</p>
870+
<p class="MsoNormal"><a name="_" id="_"></a> <a href="https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&display=swap">https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&amp;display=swap</a></p>
871+
</div>'
872+
# The assertion reads test.doc, presumably written by this process call
# (filename: "test") -- confirm against Html2Doc#process.
Html2Doc.new(filename: "test").process(html_input(simple_body))
873+
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
874+
.to match_fuzzy(<<~OUTPUT)
875+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
876+
#{word_body('<div><p class="MsoNormal">a &amp; b</p>
877+
<p class="MsoNormal"><a name="_" id="_"></a> <a href="https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&display=swap">https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&amp;display=swap</a></p>
878+
</div>',
879+
'<div style="mso-element:footnote-list"/>')}
880+
#{WORD_FTR1}
881+
OUTPUT
882+
end
883+
867884

868885
it "processes epub:type footnotes" do
869886
simple_body = '<div>This is a very simple

0 commit comments

Comments
 (0)