Merge pull request #380 from rursache/feature/auto-detect-xml-parse

aehlke · web-flow · commit dba183c96b2d · 2026-03-06T16:26:52.000-05:00
Auto-detect XML in parse() and add parseHTML() APIs
diff --git a/README.md b/README.md
@@ -79,11 +79,11 @@ print(try document.title()) // Output: Example
 ```
 
 ---
-### Parse an XML Document
+### Automatic Format Detection
 
-Use the XML parser when working with feeds, OPML, or other non-HTML documents. The default `SwiftSoup.parse(...)`
-entry points apply HTML5 parsing rules, so tags like `<link>` and `<img>` will be treated as HTML tags instead of
-generic XML elements.
+`SwiftSoup.parse(...)` automatically detects XML input by looking for an `<?xml` declaration at the start of the
+content (leading whitespace is ignored). When detected, the XML parser is used; otherwise the HTML parser is applied.
+This means feeds, OPML, and other XML documents with a standard XML declaration "just work":
 
 ```swift
 import SwiftSoup
@@ -98,11 +98,26 @@ let xml = """
 </opml>
 """
 
-let document = try SwiftSoup.parseXML(xml)
+let document = try SwiftSoup.parse(xml) // auto-detects XML
 print(try document.select("link").first()?.text()) // Output: I'm link
 print(try document.select("body > img").first()?.text()) // Output: I'm img
 ```
 
+### Explicit Parse Modes
+
+Use `parseXML(...)` or `parseHTML(...)` when you want to force a specific parser regardless of the content:
+
+```swift
+// Force XML parsing (no HTML5 tag normalization)
+let xmlDoc = try SwiftSoup.parseXML(xmlString)
+
+// Force HTML parsing (always applies HTML5 rules, even if the input starts with <?xml)
+let htmlDoc = try SwiftSoup.parseHTML(htmlString)
+
+// Explicit parser argument (unchanged from before)
+let doc = try SwiftSoup.parse(input, baseUri, Parser.xmlParser())
+```
+
 ---
 ### Parse HTML from a URL
 
diff --git a/Sources/SwiftSoup.swift b/Sources/SwiftSoup.swift
@@ -8,28 +8,37 @@
 import Foundation
 
 	/**
-	 Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
-     For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``.
-	 
-	 - parameter html:    HTML to parse
-	 - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
-	   before the HTML declares a `<base href>` tag.
-	 - returns: sane HTML
+	 Parse markup into a Document with automatic format detection. If the input starts with an XML declaration
+	 (`<?xml`), the XML parser is used; otherwise the HTML parser is applied. Use ``parseHTML(_:_:)`` or
+	 ``parseXML(_:_:)`` to force a specific parser.
+
+	 - parameter html:    markup to parse
+	 - parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
+	   before the markup declares a `<base href>` tag.
+	 - returns: parsed Document
 	*/
 	public func parse(_ html: String, _ baseUri: String) throws -> Document {
+		if looksLikeXml(html) { // true when the input opens with an `<?xml` declaration
+			return try Parser.xmlParser().parseInput(html, baseUri) // XML input: use the XML parser instead of the HTML one
+		}
 		return try Parser.parse(html, baseUri)
 	}
 
 	/**
-	 Parse Data into a Document. The parser will make a sensible, balanced document tree out of any HTML.
-     For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``.
-	 
-	 - parameter data: Data to parse
-	 - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
-	   before the HTML declares a `<base href>` tag.
-	 - returns: sane HTML
+	 Parse Data into a Document with automatic format detection. If the input starts with an XML declaration
+	 (`<?xml`), the XML parser is used; otherwise the HTML parser is applied. Use ``parseHTML(_:_:)`` or
+	 ``parseXML(_:_:)`` to force a specific parser.
+
+	 - parameter data: data to parse
+	 - parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
+	   before the markup declares a `<base href>` tag.
+	 - returns: parsed Document
 	*/
     public func parse(_ data: Data, _ baseUri: String) throws -> Document {
+        let bytes = [UInt8](data) // one byte array serves both the detection check and the XML parse below
+        if looksLikeXml(bytes) { // true when the bytes open with `<?xml`
+            return try Parser.xmlParser().parseInput(bytes, baseUri) // XML input: use the XML parser
+        }
         return try Parser.parse(data, baseUri)
     }
 
@@ -84,29 +93,27 @@ import Foundation
     }
 
 	/**
-	 Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
-	 `<base href>` tag.
-     For XML input, use ``parseXML(_:)``.
-	 
-	 - parameter html: HTML to parse
-	 - returns: sane HTML
+	 Parse markup into a Document with automatic format detection. As no base URI is specified, absolute URL
+	 detection relies on the markup including a `<base href>` tag.
+
+	 - parameter html: markup to parse
+	 - returns: parsed Document
 	 - seealso: ``parse(_:_:)-(String,String)``
 	*/
 	public func parse(_ html: String) throws -> Document {
-		return try Parser.parse(html, "")
+		return try parse(html, "") // delegate to the auto-detecting two-argument overload with an empty base URI
 	}
 
     /**
-	 Parse Data into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
-	 `<base href>` tag.
-     For XML input, use ``parseXML(_:)``.
-	 
-	 - parameter data: Data to parse
-	 - returns: sane HTML
+	 Parse Data into a Document with automatic format detection. As no base URI is specified, absolute URL
+	 detection relies on the markup including a `<base href>` tag.
+
+	 - parameter data: data to parse
+	 - returns: parsed Document
 	 - seealso: ``parse(_:_:)-(String,String)``
     */
 	public func parse(_ data: Data) throws -> Document {
-		return try Parser.parse(data, "")
+		return try parse(data, "") // delegate to the auto-detecting two-argument overload with an empty base URI
 	}
 
     /**
@@ -176,8 +183,75 @@ import Foundation
         return try parser.parseInput([UInt8](Data(contentsOf: url)), url.absoluteString)
     }
 
+    // MARK: - Explicit HTML parsing
+
+    /**
+     Parse HTML into a Document using the HTML parser. Unlike ``parse(_:_:)``, this always uses the HTML parser
+     regardless of the input content (no auto-detection).
 
+     - parameter html: HTML to parse
+     - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs.
+     - returns: parsed HTML document
+     */
+    public func parseHTML(_ html: String, _ baseUri: String) throws -> Document {
+        return try Parser.parse(html, baseUri) // no looksLikeXml check here: input is never routed to the XML parser
+    }
+
+    /**
+     Parse HTML into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies on
+     the HTML including a `<base href>` tag.
 
+     - parameter html: HTML to parse
+     - returns: parsed HTML document
+     */
+    public func parseHTML(_ html: String) throws -> Document {
+        return try parseHTML(html, "") // forwards to the two-argument String overload with an empty base URI
+    }
+
+    /**
+     Parse HTML data into a Document using the HTML parser.
+
+     - parameter data: HTML data to parse
+     - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs.
+     - returns: parsed HTML document
+     */
+    public func parseHTML(_ data: Data, _ baseUri: String) throws -> Document {
+        return try Parser.parse(data, baseUri) // no looksLikeXml check here: data is never routed to the XML parser
+    }
+
+    /**
+     Parse HTML data into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies
+     on the HTML including a `<base href>` tag.
+
+     - parameter data: HTML data to parse
+     - returns: parsed HTML document
+     */
+    public func parseHTML(_ data: Data) throws -> Document {
+        return try parseHTML(data, "") // forwards to the two-argument Data overload with an empty base URI
+    }
+
+    // MARK: - Format detection
+
+    /// Returns `true` when `string` opens with `<?xml`, ignoring an optional BOM (U+FEFF) and leading whitespace.
+    private func looksLikeXml(_ string: String) -> Bool {
+        // A BOM is not whitespace, so without this step BOM-prefixed XML would fall back to the HTML parser.
+        let afterBOM = string.first == "\u{FEFF}" ? string.dropFirst() : string[...]
+        let trimmed = afterBOM.drop(while: { $0.isWhitespace })
+        return trimmed.hasPrefix("<?xml")
+    }
+
+    /// Returns `true` when `bytes` open with the ASCII sequence `<?xml`.
+    /// An optional UTF-8 byte order mark and any leading space, tab, LF or CR bytes are skipped first.
+    private func looksLikeXml(_ bytes: [UInt8]) -> Bool {
+        let xmlDecl: [UInt8] = [0x3C, 0x3F, 0x78, 0x6D, 0x6C] // <?xml
+        let utf8BOM: [UInt8] = [0xEF, 0xBB, 0xBF]
+        // Without the BOM skip, BOM-prefixed XML data is never detected and falls back to the HTML parser.
+        let afterBOM = bytes.starts(with: utf8BOM) ? bytes.dropFirst(utf8BOM.count) : bytes[...]
+        let trimmed = afterBOM.drop(while: { byte in
+            byte == 0x20 || byte == 0x09 || byte == 0x0A || byte == 0x0D
+        })
+        return trimmed.starts(with: xmlDecl)
+    }
 
 	//todo:
 //	/**
diff --git a/Tests/SwiftSoupTests/XmlTreeBuilderTest.swift b/Tests/SwiftSoupTests/XmlTreeBuilderTest.swift
@@ -83,6 +83,75 @@ class XmlTreeBuilderTest: XCTestCase {
         XCTAssertEqual(try explicitDoc.outerHtml(), try convenienceDoc.outerHtml())
     }
 
+    // MARK: - Auto-detection tests
+
+    func testParseAutoDetectsXmlDeclaration() throws {
+        // Plain parse() is expected to pick the XML parser for issue309Xml, keeping text inside link/img/image.
+        let parsed = try SwiftSoup.parse(issue309Xml)
+        for (tag, expected) in [("link", "I'm link"), ("img", "I'm img"), ("image", "I'm image")] {
+            XCTAssertEqual(expected, try parsed.select(tag).first()?.text())
+        }
+        XCTAssertEqual(7, try parsed.select("body outline").count)
+        XCTAssertEqual(OutputSettings.Syntax.xml, parsed.outputSettings().syntax())
+    }
+
+    func testParseAutoDetectsXmlWithLeadingWhitespace() throws {
+        // Newline, spaces and a tab precede the declaration; detection is expected to look past them.
+        let paddedXml = "\n  \t <?xml version=\"1.0\"?><root><item>Hello</item></root>"
+        let parsed = try SwiftSoup.parse(paddedXml)
+        XCTAssertEqual("Hello", try parsed.select("item").first()?.text())
+        XCTAssertEqual(OutputSettings.Syntax.xml, parsed.outputSettings().syntax())
+    }
+
+    func testParseAutoDetectsHtmlWithoutXmlDeclaration() throws {
+        // No `<?xml` prefix here, so parse() should stay on the HTML path.
+        let markup = "<html><head><title>Test</title></head><body><p>Hello</p></body></html>"
+        let parsed = try SwiftSoup.parse(markup)
+        XCTAssertEqual("Test", try parsed.title())
+        XCTAssertEqual("Hello", try parsed.select("p").first()?.text())
+    }
+
+    func testParseAutoDetectsHtmlDoctype() throws {
+        // A leading DOCTYPE is not an XML declaration; the paragraph text must still be reachable.
+        let markup = "<!DOCTYPE html><html><body><link rel=\"stylesheet\"><p>Hello</p></body></html>"
+        let parsed = try SwiftSoup.parse(markup)
+        XCTAssertEqual("Hello", try parsed.select("p").first()?.text())
+    }
+
+    func testParseAutoDetectionMatchesExplicitXmlParser() throws {
+        // Serialized output must be identical whether the XML parser is auto-selected or passed explicitly.
+        let viaExplicitParser = try SwiftSoup.parse(issue309Xml, "", Parser.xmlParser())
+        let viaAutoDetection = try SwiftSoup.parse(issue309Xml)
+        XCTAssertEqual(try viaExplicitParser.outerHtml(), try viaAutoDetection.outerHtml())
+    }
+
+    func testParseAutoDetectionDataOverload() throws {
+        // Data(_:) over the UTF-8 view cannot fail, so no force-unwrap is needed to build the fixture.
+        let data = Data(issue309Xml.utf8)
+        let doc = try SwiftSoup.parse(data)
+        XCTAssertEqual("I'm link", try doc.select("link").first()?.text())
+        XCTAssertEqual(OutputSettings.Syntax.xml, doc.outputSettings().syntax())
+    }
+
+    // MARK: - Explicit parseHTML tests
+
+    func testParseHTMLForcesHtmlParserEvenForXmlInput() throws {
+        // Under HTML rules <link> is a void element and cannot hold text, so the XML-mode
+        // result "I'm link" must not appear here.
+        let htmlDocument = try SwiftSoup.parseHTML(issue309Xml)
+        XCTAssertNotEqual("I'm link", try htmlDocument.select("link").first()?.text())
+    }
+
+    func testParseHTMLNormalizesDocument() throws {
+        // An unclosed fragment goes in; the HTML parser is expected to supply the head/body skeleton.
+        let fragment = "<p>Hello"
+        let normalized = try SwiftSoup.parseHTML(fragment)
+        for wrapper in ["head", "body"] {
+            XCTAssertEqual(1, try normalized.select(wrapper).count)
+        }
+        XCTAssertEqual("Hello", try normalized.select("p").first()?.text())
+    }
+
 	//TODO: nabil
 	//	public void testSupplyParserToConnection() throws IOException {
 	//	String xmlUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml";