Skip to content

Commit c7f337e

Browse files
committed
Add HTMLAgilityPack engine xpath/css query selector patterns
1 parent de8eb02 commit c7f337e

10 files changed

Lines changed: 470 additions & 0 deletions

File tree

RegExpressWPFNET/RegExpressWPFNET.slnx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
<Folder Name="/RegexEngines/Fortran/">
3838
<Project Path="RegexEngines/Fortran/FortranPlugin/FortranPlugin.csproj" />
3939
</Folder>
40+
<Folder Name="/RegexEngines/HtmlAgilityPack/">
41+
<Project Path="RegexEngines/HtmlAgilityPack/HtmlAgilityPackPlugin/HtmlAgilityPackPlugin.csproj" />
42+
</Folder>
4043
<Folder Name="/RegexEngines/Hyperscan/">
4144
<Project Path="RegexEngines/Hyperscan/HyperscanPlugin/HyperscanPlugin.csproj">
4245
<BuildDependency Project="RegexEngines/Hyperscan/HyperscanWorker/HyperscanWorker.vcxproj" />
@@ -142,6 +145,7 @@
142145
<BuildDependency Project="RegexEngines/DotNETFramework4_8/DotNETFrameworkPlugin/DotNETFrameworkPlugin.csproj" />
143146
<BuildDependency Project="RegexEngines/DotNETFramework4_8/DotNETFrameworkWorker/DotNETFrameworkWorker.csproj" />
144147
<BuildDependency Project="RegexEngines/Fortran/FortranPlugin/FortranPlugin.csproj" />
148+
<BuildDependency Project="RegexEngines/HtmlAgilityPack/HtmlAgilityPackPlugin/HtmlAgilityPackPlugin.csproj" />
145149
<BuildDependency Project="RegexEngines/Hyperscan/HyperscanPlugin/HyperscanPlugin.csproj" />
146150
<BuildDependency Project="RegexEngines/Hyperscan/HyperscanWorker/HyperscanWorker.vcxproj" />
147151
<BuildDependency Project="RegexEngines/ICU/ICUPlugin/ICUPlugin.csproj" />

RegExpressWPFNET/RegExpressWPFNET/Engines.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
{
44
"path": "Engines\\DotNET9\\DotNET9Plugin.dll"
55
},
6+
{
7+
"path": "Engines\\HtmlAgilityPack\\HtmlAgilityPackPlugin.dll",
8+
"no_fm": true
9+
},
610
{
711
"path": "Engines\\DotNETFramework4_8\\DotNETFrameworkPlugin.dll",
812
"no_fm": true
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
using RegExpressLibrary;
2+
using RegExpressLibrary.Matches;
3+
using RegExpressLibrary.SyntaxColouring;
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Text.Json;
7+
using System.Windows.Controls;
8+
9+
10+
namespace HtmlAgilityPackPlugin
{
    /// <summary>
    /// Engine that selects HTML nodes with an XPath expression or a CSS selector
    /// instead of a regular expression. The actual matching is delegated to
    /// <see cref="Matcher"/>; this class holds the options and the options control.
    /// </summary>
    class Engine : IRegexEngine, IOurAIEngine
    {
        static readonly Lazy<string?> LazyVersion = new( Matcher.GetVersion );

        Options mOptions = new( );

        // Created on first use, so no UCOptions is instantiated until the options control is requested.
        readonly Lazy<UCOptions> mOptionsControl;

        public Engine( )
        {
            mOptionsControl = new Lazy<UCOptions>( ( ) =>
            {
                UCOptions oc = new( );
                oc.SetOptions( Options );
                oc.Changed += OptionsControl_Changed;

                return oc;
            } );
        }

        /// <summary>
        /// Current engine options. Setting the property also pushes the new options
        /// into the options control, but only if that control has already been created.
        /// </summary>
        public Options Options
        {
            get
            {
                return mOptions;
            }
            set
            {
                mOptions = value;

                if( mOptionsControl.IsValueCreated ) mOptionsControl.Value.SetOptions( mOptions );
            }
        }

        #region IOurAIEngine

        // How the host consumes these values is defined by IOurAIEngine, outside this file.

        /// <summary>Description of the pattern kind, depending on the selected <see cref="SelectorMode"/>.</summary>
        string IOurAIEngine.AIPatternType => Options.SelectorMode == SelectorMode.XPath ? "html xpath" : "html css query selector";

        /// <summary>Code-block tag for the pattern kind, depending on the selected <see cref="SelectorMode"/>.</summary>
        string IOurAIEngine.AIPatternCodeblockType => Options.SelectorMode == SelectorMode.XPath ? "xpath" : "css";

        /// <summary>No additional system prompt is supplied by this engine.</summary>
        string IOurAIEngine.AIAdditionalSystemPrompt => "";

        #endregion

        #region IRegexEngine

        public string Kind => "HtmlAgilityPack";

        /// <summary>Version string obtained once from <see cref="Matcher.GetVersion"/>; may be null.</summary>
        public string? Version => LazyVersion.Value;

        public string Name => "HtmlAgilityPack";

        /// <summary>"XPath" or "CSS", depending on the selected <see cref="SelectorMode"/>.</summary>
        public string Subtitle => mOptions.SelectorMode == SelectorMode.XPath ? "XPath" : "CSS";

        public RegexEngineCapabilityEnum Capabilities => RegexEngineCapabilityEnum.Default;// | RegexEngineCapabilityEnum.NoCaptures;

        public string? NoteForCaptures => "This engine uses XPath or CSS selectors to select HTML nodes, not regex patterns.";

        public event RegexEngineOptionsChanged? OptionsChanged;

        // This engine never raises FeatureMatrixReady; CS0067 ("event is never used") is suppressed for it.
#pragma warning disable 0067
        public event EventHandler? FeatureMatrixReady;
#pragma warning restore 0067

        /// <summary>Returns the options control, creating it on the first call.</summary>
        public Control GetOptionsControl( )
        {
            return mOptionsControl.Value;
        }

        /// <summary>Serialises the current options to JSON using <c>JsonUtilities.JsonOptions</c>.</summary>
        public string? ExportOptions( )
        {
            return JsonSerializer.Serialize( Options, JsonUtilities.JsonOptions );
        }

        /// <summary>
        /// Restores the options from JSON. A null, empty or whitespace string, JSON that
        /// deserialises to null, and JSON that fails to deserialise all result in default
        /// options, unless <c>InternalConfig.HandleException</c> asks for the exception
        /// to be rethrown.
        /// </summary>
        /// <param name="json">JSON previously produced by <see cref="ExportOptions"/>, or null.</param>
        public void ImportOptions( string? json )
        {
            if( string.IsNullOrWhiteSpace( json ) )
            {
                Options = new Options( );

                return;
            }

            try
            {
                // Deserialize returns null for the JSON literal "null"; never store a null
                // Options, because every other member dereferences it.
                Options = JsonSerializer.Deserialize<Options>( json, JsonUtilities.JsonOptions ) ?? new Options( );
            }
            catch( Exception ex )
            {
                // ignore versioning errors, for example
                if( InternalConfig.HandleException( ex ) )
                    throw;

                Options = new Options( );
            }
        }

        /// <summary>Runs the selector <paramref name="pattern"/> against <paramref name="text"/> using the current options.</summary>
        public RegexMatches GetMatches( ICancellable cnc, string pattern, string text )
        {
            return Matcher.GetMatches( cnc, pattern, text, Options );
        }

        public SyntaxOptions GetSyntaxOptions( )
        {
            // XPath and CSS selectors have their own syntax, not regex syntax.
            // Return Literal mode so no regex syntax highlighting is applied.
            return new SyntaxOptions
            {
                Literal = true,
                XLevel = XLevelEnum.none,
                FeatureMatrix = new FeatureMatrix( )
            };
        }

        public IReadOnlyList<FeatureMatrixVariant> GetFeatureMatrices( )
        {
            // This engine doesn't use regex, so there are no feature matrices to report.
            return [];
        }

        public void SetIgnoreCase( bool yes )
        {
            // Intentionally empty: XPath/CSS selectors have no direct ignore-case option.
        }

        public void SetIgnorePatternWhitespace( bool yes )
        {
            // Intentionally empty: not applicable to XPath/CSS selectors.
        }

        #endregion

        // Forwards change notifications from the options control to subscribers of OptionsChanged.
        private void OptionsControl_Changed( object? sender, RegexEngineOptionsChangedArgs args )
        {
            OptionsChanged?.Invoke( this, args );
        }
    }
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<Nullable>enable</Nullable>
5+
<UseWPF>true</UseWPF>
6+
<EnableDynamicLoading>true</EnableDynamicLoading>
7+
<RunPostBuildEvent>OnOutputUpdated</RunPostBuildEvent>
8+
</PropertyGroup>
9+
10+
<ItemGroup>
11+
<PackageReference Include="Universal.HtmlAgilityPack" Version="1.0.2">
12+
<PrivateAssets>All</PrivateAssets>
13+
</PackageReference>
14+
</ItemGroup>
15+
16+
<ItemGroup>
17+
<ProjectReference Include="..\..\..\RegExpressLibrary\RegExpressLibrary.csproj">
18+
<Private>false</Private>
19+
<ExcludeAssets>runtime</ExcludeAssets>
20+
</ProjectReference>
21+
</ItemGroup>
22+
23+
</Project>
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using HtmlAgilityPack;
5+
using RegExpressLibrary;
6+
using RegExpressLibrary.Matches;
7+
using RegExpressLibrary.Matches.Simple;
8+
using Universal.HtmlAgilityPack;
9+
10+
11+
namespace HtmlAgilityPackPlugin
{
    /// <summary>
    /// Runs XPath expressions or CSS selectors against HTML text and converts the
    /// selected nodes into matches.
    /// </summary>
    static class Matcher
    {
        /// <summary>
        /// Parses <paramref name="text"/> as HTML, selects nodes with <paramref name="pattern"/>
        /// and returns one match per selected node.
        /// </summary>
        /// <param name="cnc">Cancellation source; when cancellation is requested an empty result is returned.</param>
        /// <param name="pattern">XPath expression or CSS selector, depending on <c>options.SelectorMode</c>.</param>
        /// <param name="text">HTML source text.</param>
        /// <param name="options">Selector mode and output mode.</param>
        /// <returns>
        /// The matches, or <c>RegexMatches.Empty</c> when the pattern is blank, the text is
        /// empty, nothing is selected, or cancellation was requested.
        /// </returns>
        /// <exception cref="Exception">
        /// Any failure while parsing or selecting is wrapped in an exception whose message
        /// starts with "Error processing selector: ".
        /// </exception>
        public static RegexMatches GetMatches( ICancellable cnc, string pattern, string text, Options options )
        {
            if( string.IsNullOrWhiteSpace( pattern ) )
            {
                return RegexMatches.Empty;
            }

            if( string.IsNullOrEmpty( text ) )
            {
                return RegexMatches.Empty;
            }

            try
            {
                var doc = new HtmlDocument( );
                doc.LoadHtml( text );

                IEnumerable<HtmlNode>? selected;

                if( options.SelectorMode == SelectorMode.XPath )
                {
                    selected = doc.DocumentNode.SelectNodes( pattern );
                }
                else // CssSelector
                {
                    // Tag names are matched in lowercase, so the selector is normalised.
                    // Only the case-insensitive parts are lowercased: class names, ids,
                    // attribute values and quoted strings keep the case the user typed,
                    // otherwise ".MyClass" could never match class="MyClass".
                    selected = doc.DocumentNode.QuerySelectorAll( NormalizeCssSelectorCase( pattern ) );
                }

                if( selected == null )
                {
                    return RegexMatches.Empty;
                }

                // Materialise once: the result is needed both for the emptiness test and for
                // the loop, and a lazily evaluated sequence would otherwise run the query twice.
                List<HtmlNode> nodes = selected.ToList( );

                if( nodes.Count == 0 )
                {
                    return RegexMatches.Empty;
                }

                if( cnc.IsCancellationRequested ) return RegexMatches.Empty;

                var matches = new List<IMatch>( nodes.Count );
                var sourceTextGetter = new SimpleTextGetter( text );

                foreach( var node in nodes )
                {
                    if( cnc.IsCancellationRequested ) return RegexMatches.Empty;

                    // StreamPosition is used as the node's offset in the source text (for highlighting)
                    int index = node.StreamPosition;
                    int length = node.OuterHtml.Length;

                    // Validate and clamp the index/length to avoid out-of-bounds
                    if( index < 0 ) index = 0;
                    if( index > text.Length ) continue;
                    if( index + length > text.Length ) length = text.Length - index;
                    if( length <= 0 ) continue;

                    // Match is always the full element (OuterHtml)
                    var match = SimpleMatch.Create( index, length, sourceTextGetter );

                    // Default group (group 0, the full match)
                    match.AddGroup( index, length, true, "" );

                    // Named "Value" group, only for the InnerHtml and InnerText output modes.
                    // Its text comes from a dedicated getter; index/length still refer to the
                    // element's position in the source text.
                    if( options.OutputMode == OutputMode.InnerHtml )
                    {
                        string innerHtml = node.InnerHtml;
                        var valueTextGetter = new SimpleTextGetterWithOffset( index, innerHtml );
                        match.AddGroup( index, innerHtml.Length, true, "Value", valueTextGetter );
                    }
                    else if( options.OutputMode == OutputMode.InnerText )
                    {
                        string innerText = node.InnerText;
                        var valueTextGetter = new SimpleTextGetterWithOffset( index, innerText );
                        match.AddGroup( index, innerText.Length, true, "Value", valueTextGetter );
                    }

                    matches.Add( match );
                }

                return new RegexMatches( matches.Count, matches );
            }
            catch( Exception ex )
            {
                throw new Exception( $"Error processing selector: {ex.Message}", ex );
            }
        }

        /// <summary>
        /// Returns the version of the assembly that defines <see cref="HtmlDocument"/>,
        /// or null when it is unavailable or cannot be read.
        /// </summary>
        public static string? GetVersion( )
        {
            try
            {
                return typeof( HtmlDocument ).Assembly.GetName( ).Version?.ToString( );
            }
            catch
            {
                // Best effort: the version is informational only.
                return null;
            }
        }

        /// <summary>
        /// Lowercases a CSS selector except where case is significant: class names
        /// (after '.'), ids (after '#'), attribute values (after '=' inside '[...]'),
        /// quoted strings and backslash-escaped characters are kept verbatim.
        /// </summary>
        static string NormalizeCssSelectorCase( string selector )
        {
            char[] chars = selector.ToCharArray( );
            char quote = '\0';              // active quote character, '\0' when outside a string
            bool inAttribute = false;       // between '[' and ']'
            bool inAttributeValue = false;  // after '=' inside an attribute selector
            bool inClassOrId = false;       // inside the name that follows '.' or '#'

            for( int i = 0; i < chars.Length; i++ )
            {
                char c = chars[i];

                if( c == '\\' )
                {
                    ++i; // the escaped character is kept as typed

                    continue;
                }

                if( quote != '\0' )
                {
                    if( c == quote ) quote = '\0';

                    continue;
                }

                if( c == '"' || c == '\'' )
                {
                    quote = c;
                    inClassOrId = false;

                    continue;
                }

                if( inAttribute )
                {
                    if( c == ']' )
                    {
                        inAttribute = false;
                        inAttributeValue = false;
                    }
                    else if( c == '=' )
                    {
                        inAttributeValue = true;
                    }
                    else if( !inAttributeValue )
                    {
                        // attribute name and operator part
                        chars[i] = char.ToLowerInvariant( c );
                    }

                    continue;
                }

                if( c == '[' )
                {
                    inAttribute = true;
                    inClassOrId = false;

                    continue;
                }

                if( c == '.' || c == '#' )
                {
                    inClassOrId = true;

                    continue;
                }

                if( inClassOrId && IsCssNameChar( c ) ) continue;

                inClassOrId = false;
                chars[i] = char.ToLowerInvariant( c );
            }

            return new string( chars );
        }

        /// <summary>True for characters that can continue a CSS class or id name.</summary>
        static bool IsCssNameChar( char c )
        {
            return char.IsLetterOrDigit( c ) || c == '-' || c == '_' || c >= 0x80;
        }
    }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
namespace HtmlAgilityPackPlugin
{
    /// <summary>Query language in which the pattern is written.</summary>
    public enum SelectorMode
    {
        /// <summary>The pattern is an XPath expression.</summary>
        XPath = 0,

        /// <summary>The pattern is a CSS selector.</summary>
        CssSelector = 1
    }

    /// <summary>Part of a selected node that is reported as its value.</summary>
    public enum OutputMode
    {
        OuterHtml = 0,
        InnerHtml = 1,
        InnerText = 2
    }

    /// <summary>
    /// Options of the HtmlAgilityPack engine. Defaults: <see cref="SelectorMode.XPath"/>
    /// and <see cref="OutputMode.OuterHtml"/>.
    /// </summary>
    sealed class Options
    {
        public SelectorMode SelectorMode { get; set; } = SelectorMode.XPath;

        public OutputMode OutputMode { get; set; } = OutputMode.OuterHtml;

        /// <summary>Returns a member-wise copy of these options.</summary>
        public Options Clone( ) => (Options)MemberwiseClone( );
    }
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
using RegExpressLibrary;
2+
using System.Collections.Generic;
3+
4+
5+
namespace HtmlAgilityPackPlugin
{
    /// <summary>
    /// Plugin entry point that exposes the HtmlAgilityPack engine.
    /// </summary>
    public class Plugin : RegexPlugin
    {
        #region RegexPlugin

        /// <summary>
        /// Creates the engines offered by this plugin.
        /// </summary>
        /// <returns>A list holding one newly created <see cref="Engine"/>.</returns>
        public override IReadOnlyList<IRegexEngine> GetEngines( )
        {
            Engine[] engines = { new Engine( ) };

            return engines;
        }

        #endregion
    }
}

0 commit comments

Comments
 (0)