Skip to content

Commit dba183c

Browse files
authored
Merge pull request #380 from rursache/feature/auto-detect-xml-parse
Auto-detect XML in parse() and add parseHTML() APIs
2 parents 3840e70 + 9f089a0 commit dba183c

3 files changed

Lines changed: 191 additions & 33 deletions

File tree

README.md

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,11 @@ print(try document.title()) // Output: Example
7979
```
8080

8181
---
82-
### Parse an XML Document
82+
### Automatic Format Detection
8383

84-
Use the XML parser when working with feeds, OPML, or other non-HTML documents. The default `SwiftSoup.parse(...)`
85-
entry points apply HTML5 parsing rules, so tags like `<link>` and `<img>` will be treated as HTML tags instead of
86-
generic XML elements.
84+
`SwiftSoup.parse(...)` automatically detects XML input by looking for an `<?xml` declaration (optionally preceded by whitespace) at the start of the
85+
content. When detected, the XML parser is used; otherwise the HTML parser is applied. This means feeds, OPML, and
86+
other XML documents with a standard XML declaration "just work":
8787

8888
```swift
8989
import SwiftSoup
@@ -98,11 +98,26 @@ let xml = """
9898
</opml>
9999
"""
100100

101-
let document = try SwiftSoup.parseXML(xml)
101+
let document = try SwiftSoup.parse(xml) // auto-detects XML
102102
print(try document.select("link").first()?.text()) // Output: I'm link
103103
print(try document.select("body > img").first()?.text()) // Output: I'm img
104104
```
105105

106+
### Explicit Parse Modes
107+
108+
Use `parseXML(...)` or `parseHTML(...)` when you want to force a specific parser regardless of the content:
109+
110+
```swift
111+
// Force XML parsing (no HTML5 tag normalization)
112+
let xmlDoc = try SwiftSoup.parseXML(xmlString)
113+
114+
// Force HTML parsing (always applies HTML5 rules, even if input has <?xml>)
115+
let htmlDoc = try SwiftSoup.parseHTML(htmlString)
116+
117+
// Explicit parser argument (unchanged from before)
118+
let doc = try SwiftSoup.parse(input, baseUri, Parser.xmlParser())
119+
```
120+
106121
---
107122
### Parse HTML from a URL
108123

Sources/SwiftSoup.swift

Lines changed: 102 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,37 @@
88
import Foundation
99

1010
/**
11-
Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
12-
For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``.
13-
14-
- parameter html: HTML to parse
15-
- parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
16-
before the HTML declares a `<base href>` tag.
17-
- returns: sane HTML
11+
Parse markup into a Document with automatic format detection. If the input starts with an XML declaration
12+
(`<?xml`), the XML parser is used; otherwise the HTML parser is applied. Use ``parseHTML(_:_:)`` or
13+
``parseXML(_:_:)`` to force a specific parser.
14+
15+
- parameter html: markup to parse
16+
- parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
17+
before the markup declares a `<base href>` tag.
18+
- returns: parsed Document
1819
*/
1920
public func parse(_ html: String, _ baseUri: String) throws -> Document {
    // Input that does not open with an XML declaration takes the regular HTML path.
    guard looksLikeXml(html) else {
        return try Parser.parse(html, baseUri)
    }
    // Declaration found: hand the same input to a fresh XML parser instead.
    let xmlParser = Parser.xmlParser()
    return try xmlParser.parseInput(html, baseUri)
}
2226

2327
/**
24-
Parse Data into a Document. The parser will make a sensible, balanced document tree out of any HTML.
25-
For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``.
26-
27-
- parameter data: Data to parse
28-
- parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
29-
before the HTML declares a `<base href>` tag.
30-
- returns: sane HTML
28+
Parse Data into a Document with automatic format detection. If the input starts with an XML declaration
29+
(`<?xml`), the XML parser is used; otherwise the HTML parser is applied. Use ``parseHTML(_:_:)`` or
30+
``parseXML(_:_:)`` to force a specific parser.
31+
32+
- parameter data: data to parse
33+
- parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
34+
before the markup declares a `<base href>` tag.
35+
- returns: parsed Document
3136
*/
3237
public func parse(_ data: Data, _ baseUri: String) throws -> Document {
    // Both the detector and the XML branch take a byte array, so convert once up front.
    let rawBytes = [UInt8](data)
    guard looksLikeXml(rawBytes) else {
        // No XML declaration: parse the untouched Data through the default path.
        return try Parser.parse(data, baseUri)
    }
    return try Parser.xmlParser().parseInput(rawBytes, baseUri)
}
3544

@@ -84,29 +93,27 @@ import Foundation
8493
}
8594

8695
/**
87-
Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
88-
`<base href>` tag.
89-
For XML input, use ``parseXML(_:)``.
90-
91-
- parameter html: HTML to parse
92-
- returns: sane HTML
96+
Parse markup into a Document with automatic format detection. As no base URI is specified, absolute URL
97+
detection relies on the markup including a `<base href>` tag.
98+
99+
- parameter html: markup to parse
100+
- returns: parsed Document
93101
- seealso: ``parse(_:_:)-(String,String)``
94102
*/
95103
public func parse(_ html: String) throws -> Document {
    // Convenience form: forwards to the two-argument overload with an empty base URI,
    // so format auto-detection applies here as well.
    let noBaseUri = ""
    return try parse(html, noBaseUri)
}
98106

99107
/**
100-
Parse Data into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
101-
`<base href>` tag.
102-
For XML input, use ``parseXML(_:)``.
103-
104-
- parameter data: Data to parse
105-
- returns: sane HTML
108+
Parse Data into a Document with automatic format detection. As no base URI is specified, absolute URL
109+
detection relies on the markup including a `<base href>` tag.
110+
111+
- parameter data: data to parse
112+
- returns: parsed Document
106113
- seealso: ``parse(_:_:)-(Data,String)``
107114
*/
108115
public func parse(_ data: Data) throws -> Document {
    // Convenience form: forwards to the two-argument Data overload with an empty base URI,
    // so format auto-detection applies here as well.
    let noBaseUri = ""
    return try parse(data, noBaseUri)
}
111118

112119
/**
@@ -176,8 +183,75 @@ import Foundation
176183
return try parser.parseInput([UInt8](Data(contentsOf: url)), url.absoluteString)
177184
}
178185

186+
// MARK: - Explicit HTML parsing
187+
188+
/**
189+
Parse HTML into a Document using the HTML parser. Unlike ``parse(_:_:)``, this always uses the HTML parser
190+
regardless of the input content (no auto-detection).
179191

192+
- parameter html: HTML to parse
193+
- parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs.
194+
- returns: parsed HTML document
195+
*/
196+
public func parseHTML(_ html: String, _ baseUri: String) throws -> Document {
    // No format sniffing here: the input is passed straight to Parser.parse.
    let document = try Parser.parse(html, baseUri)
    return document
}
199+
200+
/**
201+
Parse HTML into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies on
202+
the HTML including a `<base href>` tag.
180203

204+
- parameter html: HTML to parse
205+
- returns: parsed HTML document
206+
*/
207+
public func parseHTML(_ html: String) throws -> Document {
    // Convenience form: same as the two-argument overload with an empty base URI.
    let emptyBaseUri = ""
    return try parseHTML(html, emptyBaseUri)
}
210+
211+
/**
212+
Parse HTML data into a Document using the HTML parser.
213+
214+
- parameter data: HTML data to parse
215+
- parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs.
216+
- returns: parsed HTML document
217+
*/
218+
public func parseHTML(_ data: Data, _ baseUri: String) throws -> Document {
    // No format sniffing here: the Data is passed straight to Parser.parse.
    let document = try Parser.parse(data, baseUri)
    return document
}
221+
222+
/**
223+
Parse HTML data into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies
224+
on the HTML including a `<base href>` tag.
225+
226+
- parameter data: HTML data to parse
227+
- returns: parsed HTML document
228+
*/
229+
public func parseHTML(_ data: Data) throws -> Document {
    // Convenience form: same as the two-argument Data overload with an empty base URI.
    let emptyBaseUri = ""
    return try parseHTML(data, emptyBaseUri)
}
232+
233+
// MARK: - Format detection
234+
235+
/// Reports whether `string` opens with an XML declaration (`<?xml`).
///
/// A single leading byte-order mark (U+FEFF) and any leading whitespace are ignored before the
/// check. The string itself is not modified; only the detection skips these characters.
///
/// - parameter string: markup to inspect
/// - returns: `true` when the first non-BOM, non-whitespace content starts with `<?xml`
private func looksLikeXml(_ string: String) -> Bool {
    var remainder = Substring(string)
    // U+FEFF is not classified as whitespace, so it has to be skipped explicitly; otherwise
    // BOM-prefixed XML would be misdetected as HTML.
    if remainder.first == "\u{FEFF}" {
        remainder = remainder.dropFirst()
    }
    let content = remainder.drop(while: { $0.isWhitespace })
    return content.hasPrefix("<?xml")
}
242+
243+
/// Reports whether `bytes` opens with an XML declaration (`<?xml`, compared byte-for-byte).
///
/// A leading UTF-8 byte-order mark (EF BB BF) and any leading ASCII space, tab, line feed or
/// carriage return bytes are ignored before the check. The input is not modified; only the
/// detection skips these bytes.
///
/// - parameter bytes: raw markup bytes to inspect
/// - returns: `true` when the first non-BOM, non-whitespace bytes are `<?xml`
private func looksLikeXml(_ bytes: [UInt8]) -> Bool {
    let utf8ByteOrderMark: [UInt8] = [0xEF, 0xBB, 0xBF]
    let xmlDecl: [UInt8] = [0x3C, 0x3F, 0x78, 0x6D, 0x6C] // <?xml

    var remainder = bytes[...]
    // Without this, XML data that begins with a UTF-8 BOM would be misdetected as HTML.
    if remainder.starts(with: utf8ByteOrderMark) {
        remainder = remainder.dropFirst(utf8ByteOrderMark.count)
    }
    let content = remainder.drop(while: { byte in
        byte == 0x20 || byte == 0x09 || byte == 0x0A || byte == 0x0D
    })
    // starts(with:) returns false when fewer bytes remain than the declaration needs.
    return content.starts(with: xmlDecl)
}
181255

182256
//todo:
183257
// /**

Tests/SwiftSoupTests/XmlTreeBuilderTest.swift

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,75 @@ class XmlTreeBuilderTest: XCTestCase {
8383
XCTAssertEqual(try explicitDoc.outerHtml(), try convenienceDoc.outerHtml())
8484
}
8585

86+
// MARK: - Auto-detection tests
87+
88+
func testParseAutoDetectsXmlDeclaration() throws {
    // No parser is passed explicitly; the assertions below only hold if XML mode was selected.
    let parsed = try SwiftSoup.parse(issue309Xml)

    // In XML mode these tags keep their text content.
    XCTAssertEqual("I'm link", try parsed.select("link").first()?.text())
    XCTAssertEqual("I'm img", try parsed.select("img").first()?.text())
    XCTAssertEqual("I'm image", try parsed.select("image").first()?.text())
    XCTAssertEqual(7, try parsed.select("body outline").count)
    XCTAssertEqual(OutputSettings.Syntax.xml, parsed.outputSettings().syntax())
}
97+
98+
func testParseAutoDetectsXmlWithLeadingWhitespace() throws {
    // Newline, spaces and a tab precede the declaration; detection should skip over them.
    let paddedXml = "\n \t <?xml version=\"1.0\"?><root><item>Hello</item></root>"
    let parsed = try SwiftSoup.parse(paddedXml)

    XCTAssertEqual("Hello", try parsed.select("item").first()?.text())
    XCTAssertEqual(OutputSettings.Syntax.xml, parsed.outputSettings().syntax())
}
105+
106+
func testParseAutoDetectsHtmlWithoutXmlDeclaration() throws {
    // Plain HTML without an XML declaration should still parse as a normal HTML document.
    let markup = "<html><head><title>Test</title></head><body><p>Hello</p></body></html>"
    let parsed = try SwiftSoup.parse(markup)

    XCTAssertEqual("Test", try parsed.title())
    XCTAssertEqual("Hello", try parsed.select("p").first()?.text())
}
113+
114+
func testParseAutoDetectsHtmlDoctype() throws {
    // The input starts with `<!`, not `<?xml`, so it must not be mistaken for XML.
    let markup = "<!DOCTYPE html><html><body><link rel=\"stylesheet\"><p>Hello</p></body></html>"
    let parsed = try SwiftSoup.parse(markup)

    XCTAssertEqual("Hello", try parsed.select("p").first()?.text())
}
120+
121+
func testParseAutoDetectionMatchesExplicitXmlParser() throws {
    // Parse the same fixture twice: once via auto-detection, once with the XML parser supplied.
    let detected = try SwiftSoup.parse(issue309Xml)
    let forced = try SwiftSoup.parse(issue309Xml, "", Parser.xmlParser())

    // Both routes should serialize to the same markup.
    XCTAssertEqual(try forced.outerHtml(), try detected.outerHtml())
}
127+
128+
func testParseAutoDetectionDataOverload() throws {
    // Same fixture, but fed through the Data overload as UTF-8 bytes.
    let payload = issue309Xml.data(using: .utf8)!
    let parsed = try SwiftSoup.parse(payload)

    XCTAssertEqual("I'm link", try parsed.select("link").first()?.text())
    XCTAssertEqual(OutputSettings.Syntax.xml, parsed.outputSettings().syntax())
}
135+
136+
// MARK: - Explicit parseHTML tests
137+
138+
func testParseHTMLForcesHtmlParserEvenForXmlInput() throws {
    // parseHTML is expected to bypass auto-detection for this XML fixture.
    let parsed = try SwiftSoup.parseHTML(issue309Xml)

    // HTML parser treats <link> as a void element, so it won't contain text
    XCTAssertNotEqual("I'm link", try parsed.select("link").first()?.text())
}
144+
145+
func testParseHTMLNormalizesDocument() throws {
    // A bare, unclosed paragraph is the whole input.
    let fragment = "<p>Hello"
    let parsed = try SwiftSoup.parseHTML(fragment)

    // HTML parser adds html/head/body structure
    XCTAssertEqual(1, try parsed.select("head").count)
    XCTAssertEqual(1, try parsed.select("body").count)
    XCTAssertEqual("Hello", try parsed.select("p").first()?.text())
}
154+
86155
//TODO: nabil
87156
// public void testSupplyParserToConnection() throws IOException {
88157
// String xmlUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml";

0 commit comments

Comments
 (0)