Skip to content

Commit 25684d4

Browse files
jnioche and claude authored
Replace XSoup with JSoup built-in XPath support (#1856)
* Replace XSoup with JSoup built-in XPath support

  Removes the us.codecraft:xsoup dependency and replaces it with JSoup's native selectXpath() API. Non-standard XSoup functions (tidyText, allText, html) are handled by stripping the suffix and mapping to the equivalent JSoup Element methods. Element names in XPath expressions are lowercased to match JSoup's normalized DOM.

  Fixes #1407

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix formatting

  Signed-off-by: Julien Nioche <julien@digitalpebble.com>

---------

Signed-off-by: Julien Nioche <julien@digitalpebble.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a08a456 commit 25684d4

2 files changed

Lines changed: 113 additions & 23 deletions

File tree

core/pom.xml

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ under the License.
4949
<cli.version>1.11.0</cli.version>
5050
<okhttp.version>5.3.2</okhttp.version>
5151
<caffeine.version>3.2.3</caffeine.version>
52-
<xsoup.version>0.3.7</xsoup.version>
52+
5353
<guava.version>33.5.0-jre</guava.version>
5454
<jacoco.haltOnFailure>true</jacoco.haltOnFailure>
5555
<jacoco.classRatio>0.72</jacoco.classRatio>
@@ -218,18 +218,6 @@ under the License.
218218
<version>${commons.lang.version}</version>
219219
</dependency>
220220

221-
<dependency>
222-
<groupId>us.codecraft</groupId>
223-
<artifactId>xsoup</artifactId>
224-
<version>${xsoup.version}</version>
225-
<exclusions>
226-
<!-- We declare a newer version of jsoup -->
227-
<exclusion>
228-
<groupId>org.jsoup</groupId>
229-
<artifactId>jsoup</artifactId>
230-
</exclusion>
231-
</exclusions>
232-
</dependency>
233221

234222
<dependency>
235223
<groupId>com.squareup.okhttp3</groupId>

core/src/main/java/org/apache/stormcrawler/jsoup/XPathFilter.java

Lines changed: 112 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,24 @@
1818
package org.apache.stormcrawler.jsoup;
1919

2020
import com.fasterxml.jackson.databind.JsonNode;
21-
import java.io.IOException;
2221
import java.util.ArrayList;
2322
import java.util.HashMap;
2423
import java.util.List;
2524
import java.util.Map;
2625
import java.util.Map.Entry;
26+
import java.util.regex.Matcher;
27+
import java.util.regex.Pattern;
2728
import org.apache.stormcrawler.Metadata;
2829
import org.apache.stormcrawler.parse.JSoupFilter;
2930
import org.apache.stormcrawler.parse.ParseData;
3031
import org.apache.stormcrawler.parse.ParseResult;
3132
import org.apache.stormcrawler.util.AbstractConfigurable;
3233
import org.jetbrains.annotations.NotNull;
34+
import org.jsoup.nodes.Document;
35+
import org.jsoup.nodes.Element;
36+
import org.jsoup.select.Elements;
3337
import org.slf4j.Logger;
3438
import org.slf4j.LoggerFactory;
35-
import us.codecraft.xsoup.XPathEvaluator;
36-
import us.codecraft.xsoup.Xsoup;
3739

3840
/** Reads XPath patterns and stores the values found in the web page as metadata. */
3941
public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
@@ -42,21 +44,121 @@ public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
4244

4345
protected final Map<String, List<LabelledExpression>> expressions = new HashMap<>();
4446

47+
/**
48+
* Supported extraction functions that can be appended to XPath expressions. These provide
49+
* backward compatibility with the non-standard XSoup functions.
50+
*/
51+
enum EvalFunction {
52+
/** Extracts cleaned, whitespace-normalized text from the element. */
53+
TIDY_TEXT,
54+
/** Extracts all text content from the element (same as TIDY_TEXT for JSoup). */
55+
ALL_TEXT,
56+
/** Extracts the inner HTML of the element. */
57+
HTML,
58+
/** Extracts an attribute value from the element. */
59+
ATTR,
60+
/** Returns the element's own text representation. */
61+
NONE;
62+
63+
String evaluate(Element element, String attrName) {
64+
switch (this) {
65+
case TIDY_TEXT:
66+
case ALL_TEXT:
67+
return element.text();
68+
case HTML:
69+
return element.html();
70+
case ATTR:
71+
return element.attr(attrName);
72+
default:
73+
return element.text();
74+
}
75+
}
76+
}
77+
78+
/** Pattern to match trailing function calls like /tidyText(), /html(), /allText(). */
79+
private static final Pattern FUNCTION_SUFFIX =
80+
Pattern.compile("/(tidyText|allText|html)\\(\\)$");
81+
82+
/** Pattern to match trailing attribute selectors like /@content. */
83+
private static final Pattern ATTR_SUFFIX = Pattern.compile("/@([\\w-]+)$");
84+
85+
/**
86+
* Pattern matching XPath element names (e.g. SPAN in //SPAN[@class="x"]). Matches sequences of
87+
* word characters that follow / or // and are not attribute references (/@).
88+
*/
89+
private static final Pattern ELEMENT_NAME = Pattern.compile("(?<=/)(?!@)([A-Z][A-Za-z0-9]*)");
90+
91+
/**
92+
* Lowercases element names in an XPath expression to match JSoup's normalized tag names. For
93+
* example, {@code //SPAN[@class="concept"]} becomes {@code //span[@class="concept"]}.
94+
*/
95+
static String lowercaseElementNames(String xpath) {
96+
return ELEMENT_NAME
97+
.matcher(xpath)
98+
.replaceAll(m -> m.group().toLowerCase(java.util.Locale.ROOT));
99+
}
100+
45101
static class LabelledExpression {
46102

47103
String key;
48-
49-
private XPathEvaluator expression;
50104
private String xpath;
105+
private EvalFunction evalFunction;
106+
private String attrName;
51107

52108
private LabelledExpression(String key, String xpath) {
53109
this.key = key;
54-
this.xpath = xpath;
55-
this.expression = Xsoup.compile(xpath);
110+
parseExpression(xpath);
56111
}
57112

58-
List<String> evaluate(org.jsoup.nodes.Document doc) throws IOException {
59-
return expression.evaluate(doc).list();
113+
private void parseExpression(String rawXpath) {
114+
// Lowercase element names so that XPath expressions with uppercase
115+
// tag names (e.g. //SPAN) work with JSoup's lowercase-normalized DOM
116+
rawXpath = lowercaseElementNames(rawXpath);
117+
118+
// Check for custom function suffixes: /tidyText(), /allText(), /html()
119+
Matcher funcMatcher = FUNCTION_SUFFIX.matcher(rawXpath);
120+
if (funcMatcher.find()) {
121+
this.xpath = rawXpath.substring(0, funcMatcher.start());
122+
switch (funcMatcher.group(1)) {
123+
case "tidyText":
124+
this.evalFunction = EvalFunction.TIDY_TEXT;
125+
break;
126+
case "allText":
127+
this.evalFunction = EvalFunction.ALL_TEXT;
128+
break;
129+
case "html":
130+
this.evalFunction = EvalFunction.HTML;
131+
break;
132+
default:
133+
this.evalFunction = EvalFunction.NONE;
134+
}
135+
return;
136+
}
137+
138+
// Check for attribute selectors: /@attrName
139+
Matcher attrMatcher = ATTR_SUFFIX.matcher(rawXpath);
140+
if (attrMatcher.find()) {
141+
this.xpath = rawXpath.substring(0, attrMatcher.start());
142+
this.evalFunction = EvalFunction.ATTR;
143+
this.attrName = attrMatcher.group(1);
144+
return;
145+
}
146+
147+
// No special suffix — use as-is
148+
this.xpath = rawXpath;
149+
this.evalFunction = EvalFunction.NONE;
150+
}
151+
152+
List<String> evaluate(Document doc) {
153+
Elements elements = doc.selectXpath(xpath);
154+
List<String> results = new ArrayList<>();
155+
for (Element element : elements) {
156+
String value = evalFunction.evaluate(element, attrName);
157+
if (value != null) {
158+
results.add(value);
159+
}
160+
}
161+
return results;
60162
}
61163

62164
public String toString() {
@@ -109,7 +211,7 @@ public void filter(
109211
metadata.addValues(le.key, values);
110212
break;
111213
}
112-
} catch (IOException e) {
214+
} catch (Exception e) {
113215
LOG.error("Error evaluating {}: {}", le.key, e);
114216
}
115217
}

0 commit comments

Comments
 (0)