Replace XSoup with JSoup built-in XPath support (#1856)

jnioche · claude · web-flow · commit 25684d470338 · 2026-03-31T15:42:03.000+02:00
* Replace XSoup with JSoup built-in XPath support Removes the us.codecraft:xsoup dependency and replaces it with JSoup's native selectXpath() API. Non-standard XSoup functions (tidyText, allText, html) are handled by stripping the suffix and mapping to the equivalent JSoup Element methods. Element names in XPath expressions are lowercased to match JSoup's normalized DOM. Fixes #1407 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Fix formatting Signed-off-by: Julien Nioche <julien@digitalpebble.com> --------- Signed-off-by: Julien Nioche <julien@digitalpebble.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/core/pom.xml b/core/pom.xml
@@ -49,7 +49,7 @@ under the License.
         <cli.version>1.11.0</cli.version>
         <okhttp.version>5.3.2</okhttp.version>
         <caffeine.version>3.2.3</caffeine.version>
-        <xsoup.version>0.3.7</xsoup.version>
+
         <guava.version>33.5.0-jre</guava.version>
         <jacoco.haltOnFailure>true</jacoco.haltOnFailure>
         <jacoco.classRatio>0.72</jacoco.classRatio>
@@ -218,18 +218,6 @@ under the License.
             <version>${commons.lang.version}</version>
         </dependency>
 
-        <dependency>
-            <groupId>us.codecraft</groupId>
-            <artifactId>xsoup</artifactId>
-            <version>${xsoup.version}</version>
-            <exclusions>
-                <!-- We declare a newer version of jsoup -->
-                <exclusion>
-                    <groupId>org.jsoup</groupId>
-                    <artifactId>jsoup</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
 
         <dependency>
             <groupId>com.squareup.okhttp3</groupId>
diff --git a/core/src/main/java/org/apache/stormcrawler/jsoup/XPathFilter.java b/core/src/main/java/org/apache/stormcrawler/jsoup/XPathFilter.java
@@ -18,22 +18,24 @@
 package org.apache.stormcrawler.jsoup;
 
 import com.fasterxml.jackson.databind.JsonNode;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.parse.JSoupFilter;
 import org.apache.stormcrawler.parse.ParseData;
 import org.apache.stormcrawler.parse.ParseResult;
 import org.apache.stormcrawler.util.AbstractConfigurable;
 import org.jetbrains.annotations.NotNull;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import us.codecraft.xsoup.XPathEvaluator;
-import us.codecraft.xsoup.Xsoup;
 
 /** Reads a XPATH patterns and stores the value found in web page as metadata. */
 public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
@@ -42,21 +44,121 @@ public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
 
     protected final Map<String, List<LabelledExpression>> expressions = new HashMap<>();
 
+    /**
+     * Supported extraction functions that can be appended to XPath expressions. These provide
+     * backward compatibility with the non-standard XSoup functions.
+     */
+    enum EvalFunction {
+        /** Extracts cleaned, whitespace-normalized text from the element. */
+        TIDY_TEXT,
+        /** Extracts all text content from the element (same as TIDY_TEXT for JSoup). */
+        ALL_TEXT,
+        /** Extracts the inner HTML of the element. */
+        HTML,
+        /** Extracts an attribute value from the element. */
+        ATTR,
+        /** Returns the element's own text representation. */
+        NONE;
+
+        String evaluate(Element element, String attrName) {
+            switch (this) {
+                case TIDY_TEXT:
+                case ALL_TEXT:
+                    return element.text();
+                case HTML:
+                    return element.html();
+                case ATTR:
+                    return element.attr(attrName);
+                default:
+                    return element.text();
+            }
+        }
+    }
+
+    /** Pattern to match trailing function calls like /tidyText(), /html(), /allText(). */
+    private static final Pattern FUNCTION_SUFFIX =
+            Pattern.compile("/(tidyText|allText|html)\\(\\)$");
+
+    /** Pattern to match trailing attribute selectors like /@content. */
+    private static final Pattern ATTR_SUFFIX = Pattern.compile("/@([\\w-]+)$");
+
+    /**
+     * Pattern matching XPath element names (e.g. SPAN in //SPAN[@class="x"]). Matches sequences of
+     * word characters that follow / or // and are not attribute references (/@).
+     */
+    private static final Pattern ELEMENT_NAME = Pattern.compile("(?<=/)(?!@)([A-Z][A-Za-z0-9]*)");
+
+    /**
+     * Lowercases element names in an XPath expression to match JSoup's normalized tag names. For
+     * example, {@code //SPAN[@class="concept"]} becomes {@code //span[@class="concept"]}.
+     */
+    static String lowercaseElementNames(String xpath) {
+        return ELEMENT_NAME
+                .matcher(xpath)
+                .replaceAll(m -> m.group().toLowerCase(java.util.Locale.ROOT));
+    }
+
     static class LabelledExpression {
 
         String key;
-
-        private XPathEvaluator expression;
         private String xpath;
+        private EvalFunction evalFunction;
+        private String attrName;
 
         private LabelledExpression(String key, String xpath) {
             this.key = key;
-            this.xpath = xpath;
-            this.expression = Xsoup.compile(xpath);
+            parseExpression(xpath);
         }
 
-        List<String> evaluate(org.jsoup.nodes.Document doc) throws IOException {
-            return expression.evaluate(doc).list();
+        private void parseExpression(String rawXpath) {
+            // Lowercase element names so that XPath expressions with uppercase
+            // tag names (e.g. //SPAN) work with JSoup's lowercase-normalized DOM
+            rawXpath = lowercaseElementNames(rawXpath);
+
+            // Check for custom function suffixes: /tidyText(), /allText(), /html()
+            Matcher funcMatcher = FUNCTION_SUFFIX.matcher(rawXpath);
+            if (funcMatcher.find()) {
+                this.xpath = rawXpath.substring(0, funcMatcher.start());
+                switch (funcMatcher.group(1)) {
+                    case "tidyText":
+                        this.evalFunction = EvalFunction.TIDY_TEXT;
+                        break;
+                    case "allText":
+                        this.evalFunction = EvalFunction.ALL_TEXT;
+                        break;
+                    case "html":
+                        this.evalFunction = EvalFunction.HTML;
+                        break;
+                    default:
+                        this.evalFunction = EvalFunction.NONE;
+                }
+                return;
+            }
+
+            // Check for attribute selectors: /@attrName
+            Matcher attrMatcher = ATTR_SUFFIX.matcher(rawXpath);
+            if (attrMatcher.find()) {
+                this.xpath = rawXpath.substring(0, attrMatcher.start());
+                this.evalFunction = EvalFunction.ATTR;
+                this.attrName = attrMatcher.group(1);
+                return;
+            }
+
+            // No special suffix — use as-is
+            this.xpath = rawXpath;
+            this.evalFunction = EvalFunction.NONE;
+        }
+
+        List<String> evaluate(Document doc) {
+            Elements elements = doc.selectXpath(xpath);
+            List<String> results = new ArrayList<>();
+            for (Element element : elements) {
+                String value = evalFunction.evaluate(element, attrName);
+                if (value != null) {
+                    results.add(value);
+                }
+            }
+            return results;
         }
 
         public String toString() {
@@ -109,7 +211,7 @@ public void filter(
                         metadata.addValues(le.key, values);
                         break;
                     }
-                } catch (IOException e) {
+                } catch (Exception e) {
                     LOG.error("Error evaluating {}: {}", le.key, e);
                 }
             }