@@ -15,19 +15,49 @@ def to_xhtml(xml)
1515 end
1616 xml = xml . gsub ( /<!--\s *\[ ([^\< \] ]+)\] >/ , "<!-- MSWORD-COMMENT \\ 1 -->" )
1717 . gsub ( /<!\s *\[ endif\] \s *-->/ , "<!-- MSWORD-COMMENT-END -->" )
18+ # Escape & to &amp; in href attributes before XML parsing to prevent stripping
19+ xml = escape_amp_in_hrefs ( xml )
1820 Nokogiri ::XML . parse ( xml )
1921 end
2022
# Escape every `&` inside href attribute values to `&amp;`.
#
# Intended to run before the markup is handed to the XML parser, so that
# query-string ampersands in links (e.g. `?a=1&b=2`) are well-formed XML
# rather than being treated as broken entity references.
# Text outside href attributes is left untouched.
#
# Every `&` is escaped, including one that already starts an entity
# (`&amp;` becomes `&amp;amp;`). That is deliberate: replacing `&amp;`
# with `&` in the same attribute afterwards restores the original value.
#
# @param html [String] markup to process
# @return [String] a new string with `&` escaped inside href values
def escape_amp_in_hrefs(html)
  # Two alternatives so a double-quoted value may contain `'` and vice versa.
  html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
    # Capture the match data once: the inner gsub overwrites Regexp.last_match.
    m = Regexp.last_match
    prefix, value, quote = m[1] ? [m[1], m[2], '"'] : [m[3], m[4], "'"]
    "#{prefix}#{value.gsub('&', '&amp;')}#{quote}"
  end
end
35+
# XHTML 1.0 Strict doctype declaration followed by a newline, kept as a
# frozen constant so it can be matched against serialised output.
DOCTYPE =
  %(<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n).freeze
2439
# Serialise a parsed document back to markup and undo the temporary
# rewrites applied before parsing.
#
# Steps, in order:
# 1. serialise with `to_xml` and drop the first XHTML namespace declaration;
# 2. drop the DOCTYPE line and tighten ` />` to `/>`;
# 3. turn the `<!-- MSWORD-COMMENT ... -->` / `<!-- MSWORD-COMMENT-END -->`
#    placeholders back into Word conditional-comment delimiters;
# 4. restore a comment terminator that sits alone on its own line;
# 5. turn `&amp;` back into `&` inside href attribute values.
#
# @param xml [#to_xml] parsed document (presumably a Nokogiri document -
#   confirm against callers)
# @return [String] the serialised markup
def from_xhtml(xml)
  result = xml.to_xml
    .sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
    .sub(DOCTYPE, "")
    .gsub(%{ />}, "/>")
    .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
    .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
    # NOTE(review): this substitution previously replaced a string with
    # itself. The serialiser presumably emits the terminator as `--&gt;`;
    # confirm the pattern against the upstream source.
    .gsub("\n--&gt;\n", "\n-->\n")
  # Unescape only inside href values so that `&amp;` in body text is preserved.
  unescape_amp_in_hrefs(result)
end
49+
# Replace every `&amp;` inside href attribute values with a plain `&`.
#
# Intended to run on serialised output so that link URLs carry literal
# ampersands in their query strings; `&amp;` anywhere outside an href
# attribute (body text, other attributes) is left untouched.
#
# @param html [String] serialised markup
# @return [String] a new string with `&amp;` unescaped inside href values
def unescape_amp_in_hrefs(html)
  # Two alternatives so a double-quoted value may contain `'` and vice versa.
  html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
    # Capture the match data once: the inner gsub overwrites Regexp.last_match.
    m = Regexp.last_match
    prefix, value, quote = m[1] ? [m[1], m[2], '"'] : [m[3], m[4], "'"]
    "#{prefix}#{value.gsub('&amp;', '&')}#{quote}"
  end
end
3262
3363 def msword_fix ( doc )
0 commit comments