|
8 | 8 | import Foundation |
9 | 9 |
|
10 | 10 | /** |
11 | | - Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. |
12 | | - For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``. |
13 | | - |
14 | | - - parameter html: HTML to parse |
15 | | - - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur |
16 | | - before the HTML declares a `<base href>` tag. |
17 | | - - returns: sane HTML |
| 11 | + Parse markup into a Document with automatic format detection. If the input, ignoring leading whitespace, |
| 12 | + starts with `<?xml`, the XML parser is used; otherwise the HTML parser is applied. Use ``parseHTML(_:_:)`` or |
| 13 | + ``parseXML(_:_:)`` to force a specific parser. |
| 14 | + |
| 15 | + - parameter html: markup to parse |
| 16 | + - parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs that occur |
| 17 | + before the markup declares a `<base href>` tag to absolute URLs. |
| 18 | + - returns: parsed Document |
18 | 19 | */ |
19 | 20 | public func parse(_ html: String, _ baseUri: String) throws -> Document { |
| 21 | + if looksLikeXml(html) { |
| 22 | + return try Parser.xmlParser().parseInput(html, baseUri) |
| 23 | + } |
20 | 24 | return try Parser.parse(html, baseUri) |
21 | 25 | } |
22 | 26 |
|
23 | 27 | /** |
24 | | - Parse Data into a Document. The parser will make a sensible, balanced document tree out of any HTML. |
25 | | - For XML input, use ``parseXML(_:_:)`` or ``parse(_:_:_:)`` with ``Parser/xmlParser()``. |
26 | | - |
27 | | - - parameter data: Data to parse |
28 | | - - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur |
29 | | - before the HTML declares a `<base href>` tag. |
30 | | - - returns: sane HTML |
| 28 | + Parse Data into a Document with automatic format detection. If the bytes, ignoring leading ASCII whitespace |
| 29 | + (space, tab, CR, LF), start with `<?xml`, the XML parser is used; otherwise the HTML parser is applied. Use |
| 30 | + ``parseHTML(_:_:)`` or ``parseXML(_:_:)`` to force a specific parser. |
| 31 | + |
| 32 | + - parameter data: data to parse |
| 33 | + - parameter baseUri: The URL where the markup was retrieved from. Used to resolve relative URLs that occur |
| 34 | + before the markup declares a `<base href>` tag to absolute URLs. |
| 35 | + - returns: parsed Document |
31 | 36 | */ |
32 | 37 | public func parse(_ data: Data, _ baseUri: String) throws -> Document { |
| 38 | + let bytes = [UInt8](data) |
| 39 | + if looksLikeXml(bytes) { |
| 40 | + return try Parser.xmlParser().parseInput(bytes, baseUri) |
| 41 | + } |
33 | 42 | return try Parser.parse(data, baseUri) |
34 | 43 | } |
35 | 44 |
|
@@ -84,29 +93,27 @@ import Foundation |
84 | 93 | } |
85 | 94 |
|
86 | 95 | /** |
87 | | - Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a |
88 | | - `<base href>` tag. |
89 | | - For XML input, use ``parseXML(_:)``. |
90 | | - |
91 | | - - parameter html: HTML to parse |
92 | | - - returns: sane HTML |
| 96 | + Parse markup into a Document with automatic format detection. As no base URI is specified, absolute URL |
| 97 | + detection relies on the markup including a `<base href>` tag. |
| 98 | + |
| 99 | + - parameter html: markup to parse |
| 100 | + - returns: parsed Document |
93 | 101 | - seealso: ``parse(_:_:)-(String,String)`` |
94 | 102 | */ |
95 | 103 | public func parse(_ html: String) throws -> Document { |
96 | | - return try Parser.parse(html, "") |
| 104 | + return try parse(html, "") |
97 | 105 | } |
98 | 106 |
|
99 | 107 | /** |
100 | | - Parse Data into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a |
101 | | - `<base href>` tag. |
102 | | - For XML input, use ``parseXML(_:)``. |
103 | | - |
104 | | - - parameter data: Data to parse |
105 | | - - returns: sane HTML |
| 108 | + Parse Data into a Document with automatic format detection. As no base URI is specified, absolute URL |
| 109 | + detection relies on the markup including a `<base href>` tag. |
| 110 | + |
| 111 | + - parameter data: data to parse |
| 112 | + - returns: parsed Document |
106 | 113 | - seealso: ``parse(_:_:)-(Data,String)`` |
107 | 114 | */ |
108 | 115 | public func parse(_ data: Data) throws -> Document { |
109 | | - return try Parser.parse(data, "") |
| 116 | + return try parse(data, "") |
110 | 117 | } |
111 | 118 |
|
112 | 119 | /** |
@@ -176,8 +183,75 @@ import Foundation |
176 | 183 | return try parser.parseInput([UInt8](Data(contentsOf: url)), url.absoluteString) |
177 | 184 | } |
178 | 185 |
|
| 186 | + // MARK: - Explicit HTML parsing |
| 187 | + |
| 188 | + /** |
| 189 | + Parse HTML into a Document using the HTML parser. Unlike ``parse(_:_:)``, this always uses the HTML parser |
| 190 | + regardless of the input content (no auto-detection). |
179 | 191 |
|
| 192 | + - parameter html: HTML to parse |
| 193 | + - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs. |
| 194 | + - returns: parsed HTML document |
| 195 | + */ |
| 196 | + public func parseHTML(_ html: String, _ baseUri: String) throws -> Document { |
| 197 | + return try Parser.parse(html, baseUri) |
| 198 | + } |
| 199 | + |
| 200 | + /** |
| 201 | + Parse HTML into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies on |
| 202 | + the HTML including a `<base href>` tag. |
180 | 203 |
|
| 204 | + - parameter html: HTML to parse |
| 205 | + - returns: parsed HTML document |
| 206 | + */ |
| 207 | + public func parseHTML(_ html: String) throws -> Document { |
| 208 | + return try parseHTML(html, "") |
| 209 | + } |
| 210 | + |
| 211 | + /** |
| 212 | + Parse HTML data into a Document using the HTML parser. |
| 213 | + |
| 214 | + - parameter data: HTML data to parse |
| 215 | + - parameter baseUri: The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs. |
| 216 | + - returns: parsed HTML document |
| 217 | + */ |
| 218 | + public func parseHTML(_ data: Data, _ baseUri: String) throws -> Document { |
| 219 | + return try Parser.parse(data, baseUri) |
| 220 | + } |
| 221 | + |
| 222 | + /** |
| 223 | + Parse HTML data into a Document using the HTML parser. As no base URI is specified, absolute URL detection relies |
| 224 | + on the HTML including a `<base href>` tag. |
| 225 | + |
| 226 | + - parameter data: HTML data to parse |
| 227 | + - returns: parsed HTML document |
| 228 | + */ |
| 229 | + public func parseHTML(_ data: Data) throws -> Document { |
| 230 | + return try parseHTML(data, "") |
| 231 | + } |
| 232 | + |
| 233 | + // MARK: - Format detection |
| 234 | + |
| 235 | + private func looksLikeXml(_ string: String) -> Bool { |
| 236 | + var i = string.startIndex |
| 237 | + while i < string.endIndex && string[i].isWhitespace { |
| 238 | + i = string.index(after: i) |
| 239 | + } |
| 240 | + return string[i...].hasPrefix("<?xml") |
| 241 | + } |
| 242 | + |
| 243 | + private func looksLikeXml(_ bytes: [UInt8]) -> Bool { |
| 244 | + let xmlDecl: [UInt8] = [0x3C, 0x3F, 0x78, 0x6D, 0x6C] // <?xml |
| 245 | + var i = 0 |
| 246 | + while i < bytes.count && (bytes[i] == 0x20 || bytes[i] == 0x09 || bytes[i] == 0x0A || bytes[i] == 0x0D) { |
| 247 | + i += 1 |
| 248 | + } |
| 249 | + guard i + xmlDecl.count <= bytes.count else { return false } |
| 250 | + for j in 0..<xmlDecl.count { |
| 251 | + if bytes[i + j] != xmlDecl[j] { return false } |
| 252 | + } |
| 253 | + return true |
| 254 | + } |
181 | 255 |
|
182 | 256 | //todo: |
183 | 257 | // /** |
|
0 commit comments