33from itertools import chain
44
55from lxml import etree
6- from lxml .etree import _Comment , tostring
6+ from lxml .etree import Comment , tostring
77from lxml .html import fragment_fromstring
88
99
def lxml_dump_node(node):
    """
    Dump a single LXML node to a string.

    Delegates to lxml_stringify_node with root_node=False, so the node is
    not treated as the first node of the stringified content and no extra
    xmlns attribute is appended to it.

    :param node: the LXML node to stringify.
    :return: the value produced by lxml_stringify_node for this node.
    """
    return lxml_stringify_node(node, root_node=False)
1212
1313
1414# This code is taken from Python 3.7. The addition is escaping of the tab
@@ -25,6 +25,8 @@ def lxml_escape_for_html(string: str) -> str:
2525 string = string .replace (">" , ">" )
2626 string = string .replace ('"' , """ )
2727 string = string .replace ("'" , "'" )
28+ # Non-breaking space character.
29+ string = string .replace ("\xa0 " , " " )
2830 # Invisible tab character
2931 string = string .replace ("\t " , "	" )
3032 return string
@@ -120,16 +122,53 @@ def _lxml_stringify_reqif_ns_node(node):
120122 return string
121123
122124
123- def lxml_stringify_node (node ):
125+ def lxml_stringify_node (node , root_node = True ):
126+ """
127+ Stringify a given LXML node.
128+
129+ :param node:
130+ :param root_node: Needed to track whether a node is the first among the
131+ nodes being stringified. Some ReqIF producers do not use a
132+ global xmlns="http://www.w3.org/1999/xhtml" namespace.
133+ Instead, they assign this namespace only to the very first
134+ node inside the ATTRIBUTE-VALUE-XHTML/THE-VALUE tag, for
135+ example: <div xmlns="http://www.w3.org/1999/xhtml">...
136+ Tracking this root node ensures that the xmlns attribute
137+ is assigned only to the first node and not to all
138+ subsequent nodes.
139+ :return:
140+ """
141+ output = ""
142+
143+ # Some ReqIF producers add <!-- --> comments but these comment nodes
144+ # cannot be handled using etree.QName(node).localname like used further
145+ # below. Handling them separately with this dedicated branch.
146+ # A user report that helped to discover this case:
147+ # https://github.com/strictdoc-project/reqif/issues/205
148+ if lxml_is_comment_node (node ):
149+ assert node .text is not None
150+ output = f"<!--{ node .text } -->"
151+ if node .tail is not None :
152+ output += lxml_escape_for_html (node .tail )
153+ return output
154+
124155 nskey = None
156+ nsvalue = None
125157 if len (node .nsmap ) > 0 :
126- nskey = next (iter (node .nsmap .keys ()))
127- output = ""
158+ nskey , nsvalue = next (iter (node .nsmap .items ()))
159+
128160 node_no_ns_tag = etree .QName (node ).localname
129- tag = f"{ nskey } :{ node_no_ns_tag } " if node .tag [0 ] == "{" else node .tag
161+ tag = (
162+ f"{ nskey } :{ node_no_ns_tag } "
163+ if node .tag [0 ] == "{" and nskey is not None
164+ else node_no_ns_tag
165+ )
130166 output += f"<{ tag } "
131167 for attribute , attribute_value in node .attrib .items ():
132168 output += f' { attribute } ="{ lxml_escape_for_html (attribute_value )} "'
169+ if nsvalue is not None and root_node :
170+ output += f' xmlns="{ nsvalue } "'
171+
133172 # <object> is surprisingly a tag that must have a closing tag even if it
134173 # is empty. If self-closed, it breaks all the following markup.
135174 if (
@@ -141,7 +180,7 @@ def lxml_stringify_node(node):
141180 if node .text is not None :
142181 output += lxml_escape_for_html (node .text )
143182 for child in node .getchildren ():
144- output += lxml_stringify_node (child )
183+ output += lxml_stringify_node (child , root_node = False )
145184 output += f"</{ tag } >"
146185 else :
147186 output += "/>"
@@ -221,5 +260,4 @@ def lxml_strip_namespace_from_xml(root_xml, full=False):
221260
222261
def lxml_is_comment_node(xml_node):
    """
    Tell whether a given LXML node is a comment node (<!-- ... -->).

    The check compares the node's tag with the public Comment name imported
    from lxml.etree, which avoids depending on the "_"-prefixed _Comment
    class of lxml.

    :param xml_node: the LXML node to inspect.
    :return: True if the node's tag is the Comment object, False otherwise.
    """
    return xml_node.tag is Comment
0 commit comments