forked from php-embed/Embed
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDocument.php
More file actions
177 lines (153 loc) · 5.53 KB
/
Document.php
File metadata and controls
177 lines (153 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
<?php
declare(strict_types = 1);
namespace Embed;
use DOMDocument;
use DOMNode;
use DOMXPath;
use HtmlParser\Parser;
use Psr\Http\Message\UriInterface;
use RuntimeException;
use Symfony\Component\CssSelector\CssSelectorConverter;
/**
 * DOM view of the HTML body returned for an extractor's response.
 *
 * Parses the body into a DOMDocument (detecting the character encoding from the
 * Content-Type header or from the markup itself) and offers XPath / CSS-selector
 * helpers to query nodes and to remove them.
 */
class Document
{
    /**
     * Matches a `charset=...` declaration, optionally quoted, and captures the
     * charset name in group 1. Applied to both the Content-Type header and the markup.
     */
    private const CHARSET_PATTERN = '/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i';

    /** Lazily created converter shared by all instances (see cssToXpath()). */
    private static CssSelectorConverter $cssConverter;

    private Extractor $extractor;

    private DOMDocument $document;

    private DOMXPath $xpath;

    /**
     * Builds the document from the body of the extractor's response.
     *
     * The encoding is taken from the Content-Type header when it names a valid
     * encoding; otherwise the first `charset=` found in the body is tried.
     */
    public function __construct(Extractor $extractor)
    {
        $this->extractor = $extractor;

        $response = $extractor->getResponse();
        $html = (string) $response->getBody();

        // Put a line break before every <br> so the text on both sides stays on separate lines.
        $html = str_replace(['<br>', '<br '], ["\n<br>", "\n<br "], $html);

        // The header wins; the markup is only inspected when the header gives no valid encoding.
        $encoding = $this->detectEncoding($response->getHeaderLine('content-type'))
            ?? $this->detectEncoding($html);

        // empty() is kept on purpose: it preserves the original behaviour of treating
        // both '' and the literal body '0' as "no document".
        $this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
        $this->initXPath();
    }

    /**
     * Extract a charset declaration from $subject and validate it.
     *
     * @return string|null The declared encoding, or null when none is declared or it is not valid
     */
    private function detectEncoding(string $subject): ?string
    {
        if (preg_match(self::CHARSET_PATTERN, $subject, $match) !== 1 || $match[1] === '') {
            return null;
        }

        return $this->getValidEncoding(trim($match[1], ','));
    }

    /**
     * Return $encoding when mbstring knows it, otherwise null.
     *
     * mb_encoding_aliases() is used purely as a validity probe:
     * - PHP 7.4 returns false (and emits a warning) for an unknown encoding;
     * - PHP 8.0+ throws a ValueError for an unknown or empty encoding.
     *
     * A valid encoding may legitimately have NO aliases (e.g. "UTF-16LE"), in which
     * case an empty array is returned. The alias list must therefore not be used to
     * decide validity; only "false" or a ValueError means the encoding is invalid.
     *
     * @see https://www.php.net/manual/en/function.mb-encoding-aliases.php
     */
    private function getValidEncoding(?string $encoding): ?string
    {
        // Short-circuit null/empty so no warning (7.4) or exception (8.0+) is triggered for them.
        if ($encoding === null || $encoding === '') {
            return null;
        }

        try {
            $aliases = mb_encoding_aliases($encoding);
        } catch (\ValueError $exception) {
            return null;
        }

        return $aliases === false ? null : $encoding;
    }

    /**
     * (Re)create the XPath evaluator for the current document and allow PHP
     * functions to be called from queries through the "php" namespace.
     */
    private function initXPath(): void
    {
        $this->xpath = new DOMXPath($this->document);
        $this->xpath->registerNamespace('php', 'http://php.net/xpath');
        $this->xpath->registerPhpFunctions();
    }

    /**
     * Clone the underlying DOMDocument too and rebind the XPath evaluator to the copy.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
        $this->initXPath();
    }

    /**
     * Remove every node matched by the XPath query from the document.
     */
    public function remove(string $query): void
    {
        // Snapshot the matches first: the node list must not be mutated while it is iterated.
        $nodes = iterator_to_array($this->xpath->query($query), false);

        foreach ($nodes as $node) {
            $parent = $node->parentNode;

            // Nodes without a parent (e.g. the document node matched by "/") cannot be detached.
            if ($parent !== null) {
                $parent->removeChild($node);
            }
        }
    }

    /**
     * Remove every node matched by the CSS selector from the document.
     *
     * @throws RuntimeException When "symfony/css-selector" is not installed
     */
    public function removeCss(string $query): void
    {
        $this->remove(self::cssToXpath($query));
    }

    /**
     * Return the underlying DOMDocument.
     */
    public function getDocument(): DOMDocument
    {
        return $this->document;
    }

    /**
     * Helper to build xpath queries easily and case insensitive.
     *
     * Appends one predicate per attribute to $startQuery. Both the attribute value
     * in the document and the expected value are lower-cased with mb_strtolower so
     * the comparison is also case-insensitive for non-ASCII letters.
     *
     * @param array<string, string> $attributes Attribute name => expected value
     */
    private static function buildQuery(string $startQuery, array $attributes): string
    {
        $selector = [$startQuery];

        foreach ($attributes as $name => $value) {
            $selector[] = sprintf(
                '[php:functionString("mb_strtolower", @%s)=%s]',
                $name,
                self::xpathLiteral(mb_strtolower($value))
            );
        }

        return implode('', $selector);
    }

    /**
     * Turn an arbitrary string into a valid XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape sequence for quotes, so the quote style is chosen to
     * avoid the quotes present in $value; when both kinds are present the value is
     * split on '"' and reassembled with concat().
     */
    private static function xpathLiteral(string $value): string
    {
        // strpos() rather than str_contains() to stay compatible with PHP 7.4.
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        return 'concat("' . implode('", \'"\', "', explode('"', $value)) . '")';
    }

    /**
     * Select a element in the dom
     *
     * @param string                     $query      XPath expression
     * @param array<string, string>|null $attributes Optional attribute name => value filters (case-insensitive)
     * @param DOMNode|null               $context    Optional node the query is evaluated relative to
     */
    public function select(string $query, ?array $attributes = null, ?DOMNode $context = null): QueryResult
    {
        if (!empty($attributes)) {
            $query = self::buildQuery($query, $attributes);
        }

        return new QueryResult($this->xpath->query($query, $context), $this->extractor);
    }

    /**
     * Select a element in the dom using a css selector
     *
     * @throws RuntimeException When "symfony/css-selector" is not installed
     */
    public function selectCss(string $query, ?DOMNode $context = null): QueryResult
    {
        return $this->select(self::cssToXpath($query), null, $context);
    }

    /**
     * Shortcut to select a <link> element and return the href
     *
     * @param string                $rel   Expected value of the "rel" attribute
     * @param array<string, string> $extra Additional attribute filters; a "rel" key here is ignored
     */
    public function link(string $rel, array $extra = []): ?UriInterface
    {
        // Array union: keys on the left win, so $rel always takes precedence over $extra['rel'].
        return $this->select('.//link', ['rel' => $rel] + $extra)->url('href');
    }

    /**
     * Serialize the current document back to a string.
     */
    public function __toString(): string
    {
        return Parser::stringify($this->getDocument());
    }

    /**
     * Convert a CSS selector to XPath, creating the shared converter on first use.
     *
     * @throws RuntimeException When "symfony/css-selector" is not installed
     */
    private static function cssToXpath(string $selector): string
    {
        if (!isset(self::$cssConverter)) {
            if (!class_exists(CssSelectorConverter::class)) {
                throw new RuntimeException('You need to install "symfony/css-selector" to use css selectors');
            }

            self::$cssConverter = new CssSelectorConverter();
        }

        return self::$cssConverter->toXpath($selector);
    }
}