1818package org .apache .stormcrawler .jsoup ;
1919
2020import com .fasterxml .jackson .databind .JsonNode ;
21- import java .io .IOException ;
2221import java .util .ArrayList ;
2322import java .util .HashMap ;
2423import java .util .List ;
2524import java .util .Map ;
2625import java .util .Map .Entry ;
26+ import java .util .regex .Matcher ;
27+ import java .util .regex .Pattern ;
2728import org .apache .stormcrawler .Metadata ;
2829import org .apache .stormcrawler .parse .JSoupFilter ;
2930import org .apache .stormcrawler .parse .ParseData ;
3031import org .apache .stormcrawler .parse .ParseResult ;
3132import org .apache .stormcrawler .util .AbstractConfigurable ;
3233import org .jetbrains .annotations .NotNull ;
34+ import org .jsoup .nodes .Document ;
35+ import org .jsoup .nodes .Element ;
36+ import org .jsoup .select .Elements ;
3337import org .slf4j .Logger ;
3438import org .slf4j .LoggerFactory ;
35- import us .codecraft .xsoup .XPathEvaluator ;
36- import us .codecraft .xsoup .Xsoup ;
3739
3840/** Reads a XPATH patterns and stores the value found in web page as metadata. */
3941public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
@@ -42,21 +44,121 @@ public class XPathFilter extends AbstractConfigurable implements JSoupFilter {
4244
4345 protected final Map <String , List <LabelledExpression >> expressions = new HashMap <>();
4446
47+ /**
48+ * Supported extraction functions that can be appended to XPath expressions. These provide
49+ * backward compatibility with the non-standard XSoup functions.
50+ */
51+ enum EvalFunction {
52+ /** Extracts cleaned, whitespace-normalized text from the element. */
53+ TIDY_TEXT ,
54+ /** Extracts all text content from the element (same as TIDY_TEXT for JSoup). */
55+ ALL_TEXT ,
56+ /** Extracts the inner HTML of the element. */
57+ HTML ,
58+ /** Extracts an attribute value from the element. */
59+ ATTR ,
60+ /** Returns the element's own text representation. */
61+ NONE ;
62+
63+ String evaluate (Element element , String attrName ) {
64+ switch (this ) {
65+ case TIDY_TEXT :
66+ case ALL_TEXT :
67+ return element .text ();
68+ case HTML :
69+ return element .html ();
70+ case ATTR :
71+ return element .attr (attrName );
72+ default :
73+ return element .text ();
74+ }
75+ }
76+ }
77+
78+ /** Pattern to match trailing function calls like /tidyText(), /html(), /allText(). */
79+ private static final Pattern FUNCTION_SUFFIX =
80+ Pattern .compile ("/(tidyText|allText|html)\\ (\\ )$" );
81+
82+ /** Pattern to match trailing attribute selectors like /@content. */
83+ private static final Pattern ATTR_SUFFIX = Pattern .compile ("/@([\\ w-]+)$" );
84+
85+ /**
86+ * Pattern matching XPath element names (e.g. SPAN in //SPAN[@class="x"]). Matches sequences of
87+ * word characters that follow / or // and are not attribute references (/@).
88+ */
89+ private static final Pattern ELEMENT_NAME = Pattern .compile ("(?<=/)(?!@)([A-Z][A-Za-z0-9]*)" );
90+
91+ /**
92+ * Lowercases element names in an XPath expression to match JSoup's normalized tag names. For
93+ * example, {@code //SPAN[@class="concept"]} becomes {@code //span[@class="concept"]}.
94+ */
95+ static String lowercaseElementNames (String xpath ) {
96+ return ELEMENT_NAME
97+ .matcher (xpath )
98+ .replaceAll (m -> m .group ().toLowerCase (java .util .Locale .ROOT ));
99+ }
100+
45101 static class LabelledExpression {
46102
47103 String key ;
48-
49- private XPathEvaluator expression ;
50104 private String xpath ;
105+ private EvalFunction evalFunction ;
106+ private String attrName ;
51107
52108 private LabelledExpression (String key , String xpath ) {
53109 this .key = key ;
54- this .xpath = xpath ;
55- this .expression = Xsoup .compile (xpath );
110+ parseExpression (xpath );
56111 }
57112
58- List <String > evaluate (org .jsoup .nodes .Document doc ) throws IOException {
59- return expression .evaluate (doc ).list ();
113+ private void parseExpression (String rawXpath ) {
114+ // Lowercase element names so that XPath expressions with uppercase
115+ // tag names (e.g. //SPAN) work with JSoup's lowercase-normalized DOM
116+ rawXpath = lowercaseElementNames (rawXpath );
117+
118+ // Check for custom function suffixes: /tidyText(), /allText(), /html()
119+ Matcher funcMatcher = FUNCTION_SUFFIX .matcher (rawXpath );
120+ if (funcMatcher .find ()) {
121+ this .xpath = rawXpath .substring (0 , funcMatcher .start ());
122+ switch (funcMatcher .group (1 )) {
123+ case "tidyText" :
124+ this .evalFunction = EvalFunction .TIDY_TEXT ;
125+ break ;
126+ case "allText" :
127+ this .evalFunction = EvalFunction .ALL_TEXT ;
128+ break ;
129+ case "html" :
130+ this .evalFunction = EvalFunction .HTML ;
131+ break ;
132+ default :
133+ this .evalFunction = EvalFunction .NONE ;
134+ }
135+ return ;
136+ }
137+
138+ // Check for attribute selectors: /@attrName
139+ Matcher attrMatcher = ATTR_SUFFIX .matcher (rawXpath );
140+ if (attrMatcher .find ()) {
141+ this .xpath = rawXpath .substring (0 , attrMatcher .start ());
142+ this .evalFunction = EvalFunction .ATTR ;
143+ this .attrName = attrMatcher .group (1 );
144+ return ;
145+ }
146+
147+ // No special suffix — use as-is
148+ this .xpath = rawXpath ;
149+ this .evalFunction = EvalFunction .NONE ;
150+ }
151+
152+ List <String > evaluate (Document doc ) {
153+ Elements elements = doc .selectXpath (xpath );
154+ List <String > results = new ArrayList <>();
155+ for (Element element : elements ) {
156+ String value = evalFunction .evaluate (element , attrName );
157+ if (value != null ) {
158+ results .add (value );
159+ }
160+ }
161+ return results ;
60162 }
61163
62164 public String toString () {
@@ -109,7 +211,7 @@ public void filter(
109211 metadata .addValues (le .key , values );
110212 break ;
111213 }
112- } catch (IOException e ) {
214+ } catch (Exception e ) {
113215 LOG .error ("Error evaluating {}: {}" , le .key , e );
114216 }
115217 }
0 commit comments