preserve & unescaped in a/@href: metanorma/metanorma-pdfa#21

opoudjis · opoudjis · commit 8813f83ef593 · 2026-03-05T01:41:40.000+11:00
diff --git a/lib/html2doc/xml.rb b/lib/html2doc/xml.rb
@@ -15,19 +15,49 @@ def to_xhtml(xml)
     end
     xml = xml.gsub(/<!--\s*\[([^\<\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
       .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
+    # Escape & as &amp; inside href values before parsing, so Nokogiri does not strip them as invalid entities
+    xml = escape_amp_in_hrefs(xml)
     Nokogiri::XML.parse(xml)
   end
 
+  # Returns html with every & inside a quoted href="..." or href='...' value rewritten as &amp;.
+  # Intended to stop Nokogiri from stripping such ampersands as invalid entities during XML parsing.
+  def escape_amp_in_hrefs(html)
+    # Quote styles are matched separately; every & is escaped, even one that already starts an entity
+    html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
+      if Regexp.last_match(1)
+        "#{Regexp.last_match(1)}#{Regexp.last_match(2).gsub('&', '&amp;')}\""
+      else
+        "#{Regexp.last_match(3)}#{Regexp.last_match(4).gsub('&', '&amp;')}'"
+      end
+    end
+  end
+
   DOCTYPE = <<~DOCTYPE.freeze
     <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
   DOCTYPE
 
   def from_xhtml(xml)
-    xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
+    result = xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
       .sub(DOCTYPE, "").gsub(%{ />}, "/>")
       .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
       .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
       .gsub("\n--&gt;\n", "\n-->\n")
+    # Turn &amp; back into & inside href values only, so URLs are emitted with literal ampersands
+    unescape_amp_in_hrefs(result)
+  end
+
+  # Returns html with every &amp; inside a quoted href="..." or href='...' value rewritten as &.
+  # Everything outside such matches is left untouched; the aim is URLs that work correctly in Word.
+  def unescape_amp_in_hrefs(html)
+    # Quote styles are matched separately; only the literal sequence &amp; is replaced
+    html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
+      if Regexp.last_match(1)
+        "#{Regexp.last_match(1)}#{Regexp.last_match(2).gsub('&amp;', '&')}\""
+      else
+        "#{Regexp.last_match(3)}#{Regexp.last_match(4).gsub('&amp;', '&')}'"
+      end
+    end
   end
 
   def msword_fix(doc)
diff --git a/spec/html2doc_spec.rb b/spec/html2doc_spec.rb
@@ -864,6 +864,23 @@ def image_clean(xml)
   #     expect{ Html2Doc.process(html_input(simple_body), filename: "test") }
   #     .to output("https://example.com/19160-6.svg: SVG not supported\n").to_stderr
   #   end
+  
+  it "processes ampersands in href" do # href keeps its raw &; &amp; in element text stays escaped
+    simple_body = '<div><p class="MsoNormal">a &amp; b</p>
+    <p class="MsoNormal"><a name="_" id="_"></a>    <a href="https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&display=swap">https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&amp;display=swap</a></p>
+     </div>'
+    Html2Doc.new(filename: "test").process(html_input(simple_body))
+    expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
+      .to match_fuzzy(<<~OUTPUT)
+        #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
+        #{word_body('<div><p class="MsoNormal">a &amp; b</p>
+    <p class="MsoNormal"><a name="_" id="_"></a>    <a href="https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&display=swap">https://fonts.googleapis.com/css?family=Space+Mono:400,400i,700,700i&amp;display=swap</a></p>
+     </div>',
+                    '<div style="mso-element:footnote-list"/>')}
+        #{WORD_FTR1}
+      OUTPUT
+  end
+
 
   it "processes epub:type footnotes" do
     simple_body = '<div>This is a very simple