Skip to content

Commit 6db3678

Browse files
Copilot and gfs authored
Add custom extractor interface for user-defined archive formats (#177)
* Add custom extractor support with interfaces and tests --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
1 parent 697b882 commit 6db3678

5 files changed

Lines changed: 484 additions & 3 deletions

File tree

README.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,88 @@ catch(OverflowException)
162162
```
163163
</details>
164164

165+
<details>
166+
<summary>Custom Extractors for Additional File Types</summary>
167+
<br/>
168+
You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats.
169+
170+
To create a custom extractor, implement the `ICustomAsyncExtractor` interface and pass an instance of your implementation to the `Extractor` constructor:
171+
172+
```csharp
173+
using Microsoft.CST.RecursiveExtractor;
174+
using Microsoft.CST.RecursiveExtractor.Extractors;
175+
using System.IO;
176+
using System.Collections.Generic;
177+
using System.Linq;
178+
179+
// Example: custom extractor for a hypothetical archive format whose files start with the ASCII bytes "MYARC"
public class MyCustomExtractor : ICustomAsyncExtractor
{
    // Extractor supplied by the caller; this skeleton stores it but does not use it.
    private readonly Extractor context;

    // Binary signature expected at offset 0 of a supported file.
    private static readonly byte[] MagicBytes = System.Text.Encoding.ASCII.GetBytes("MYARC");

    public MyCustomExtractor(Extractor ctx)
    {
        context = ctx;
    }

    // Returns true when the stream starts with the magic bytes.
    // Null, unreadable, non-seekable or too-short streams return false.
    // The stream's original position is always restored before returning.
    public bool CanExtract(Stream stream)
    {
        if (stream is null || !stream.CanRead || !stream.CanSeek || stream.Length < MagicBytes.Length)
        {
            return false;
        }

        var initialPosition = stream.Position;
        try
        {
            stream.Position = 0;
            var buffer = new byte[MagicBytes.Length];

            // Stream.Read may return fewer bytes than requested even before end of stream,
            // so keep reading until the buffer is full or the stream reports no more data.
            var totalRead = 0;
            while (totalRead < buffer.Length)
            {
                var read = stream.Read(buffer, totalRead, buffer.Length - totalRead);
                if (read == 0)
                {
                    break;
                }
                totalRead += read;
            }

            return totalRead == buffer.Length && buffer.SequenceEqual(MagicBytes);
        }
        finally
        {
            // Always restore the original position
            stream.Position = initialPosition;
        }
    }

    // Synchronous extraction: yield one FileEntry per file contained in the archive.
    public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
    {
        // Your extraction logic here
        // For example, parse the archive and yield FileEntry objects for each contained file
        yield break;
    }

    // Asynchronous extraction: same contract as Extract, exposed as an async stream.
    public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
    {
        // Your async extraction logic here
        // Placeholder await so the skeleton compiles without warning CS1998; replace with real async work.
        await System.Threading.Tasks.Task.CompletedTask;
        yield break;
    }
}
228+
229+
// Register the custom extractor by passing it to the Extractor constructor.
// null is passed as the context here: MyCustomExtractor stores the value but never reads it in this example.
230+
var customExtractor = new MyCustomExtractor(null);
231+
var extractor = new Extractor(new[] { customExtractor });
232+
233+
// Extract as usual. Files that no built-in extractor recognizes and for which
// CanExtract returns true are handled by the custom extractor.
234+
var results = extractor.Extract("path/to/custom/archive.myarc");
235+
```
236+
237+
Key points:
238+
- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format
239+
- Always preserve the stream's original position in `CanExtract`
240+
- Custom extractors are provided via the constructor as an `IEnumerable<ICustomAsyncExtractor>`
241+
- Custom extractors are only checked when the file type is UNKNOWN (not recognized by built-in extractors)
242+
- Multiple custom extractors can be registered; they are checked in the order provided
243+
- Custom extractors are invoked for both synchronous and asynchronous extraction paths
244+
245+
</details>
246+
165247
## Exceptions
166248
RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb).
167249
Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeoutException` if `EnableTiming` is set and the specified time period has elapsed before completion.
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
using Microsoft.CST.RecursiveExtractor;
2+
using Microsoft.CST.RecursiveExtractor.Extractors;
3+
using Microsoft.VisualStudio.TestTools.UnitTesting;
4+
using System;
5+
using System.Collections.Generic;
6+
using System.IO;
7+
using System.Linq;
8+
using System.Threading.Tasks;
9+
10+
namespace RecursiveExtractor.Tests.ExtractorTests;
11+
12+
/// <summary>
/// Tests for registering custom extractors through the <see cref="Extractor"/> constructor and for
/// dispatching to them from the synchronous and asynchronous extraction paths.
/// </summary>
[TestClass]
public class CustomExtractorTests
{
    /// <summary>
    /// Checks whether a seekable stream begins with the given signature.
    /// The stream is read from offset 0 and its original position is restored afterwards.
    /// </summary>
    /// <param name="stream">Stream to inspect; null, unreadable, non-seekable or too-short streams yield false.</param>
    /// <param name="magic">Signature expected at the start of the stream.</param>
    /// <returns>True only when the first <c>magic.Length</c> bytes equal <paramref name="magic"/>.</returns>
    private static bool StartsWithMagic(Stream stream, byte[] magic)
    {
        if (stream is null || !stream.CanRead || !stream.CanSeek || stream.Length < magic.Length)
        {
            return false;
        }

        var initialPosition = stream.Position;
        try
        {
            stream.Position = 0;
            var buffer = new byte[magic.Length];

            // A single Stream.Read call may return fewer bytes than requested,
            // so loop until the buffer is full or the stream reports no more data.
            var filled = 0;
            while (filled < buffer.Length)
            {
                var read = stream.Read(buffer, filled, buffer.Length - filled);
                if (read == 0)
                {
                    break;
                }
                filled += read;
            }

            return filled == buffer.Length && buffer.SequenceEqual(magic);
        }
        finally
        {
            stream.Position = initialPosition;
        }
    }

    /// <summary>
    /// Rewinds the entry's content stream and returns it as text.
    /// The stream is disposed together with the reader.
    /// </summary>
    private static string ReadAllText(FileEntry entry)
    {
        entry.Content.Position = 0;
        using var reader = new StreamReader(entry.Content);
        return reader.ReadToEnd();
    }

    /// <summary>
    /// A simple test custom extractor that recognizes files starting with "CUSTOM1"
    /// and returns a single synthetic entry instead of parsing anything.
    /// </summary>
    private class TestCustomExtractor : ICustomAsyncExtractor
    {
        // Stored only to mirror the documented constructor shape; the tests pass null and never read it.
        private readonly Extractor context;
        private static readonly byte[] MagicBytes = System.Text.Encoding.ASCII.GetBytes("CUSTOM1");

        public TestCustomExtractor(Extractor ctx)
        {
            context = ctx;
        }

        /// <summary>Returns true when the stream starts with "CUSTOM1"; the stream position is preserved.</summary>
        public bool CanExtract(Stream stream) => StartsWithMagic(stream, MagicBytes);

        /// <summary>Yields one synthetic entry proving that this extractor was invoked.</summary>
        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
        {
            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
            yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);
        }

        /// <summary>Async counterpart of <see cref="Extract"/>; yields the same synthetic entry.</summary>
        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
        {
            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
            yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);

            // No real asynchronous work is needed; the await only keeps the async iterator warning-free.
            await Task.CompletedTask;
        }
    }

    /// <summary>
    /// A second test custom extractor that recognizes files starting with "CUSTOM2".
    /// </summary>
    private class SecondTestCustomExtractor : ICustomAsyncExtractor
    {
        // Stored only to mirror the documented constructor shape; the tests pass null and never read it.
        private readonly Extractor context;
        private static readonly byte[] MagicBytes = System.Text.Encoding.ASCII.GetBytes("CUSTOM2");

        public SecondTestCustomExtractor(Extractor ctx)
        {
            context = ctx;
        }

        /// <summary>Returns true when the stream starts with "CUSTOM2"; the stream position is preserved.</summary>
        public bool CanExtract(Stream stream) => StartsWithMagic(stream, MagicBytes);

        /// <summary>Yields one synthetic entry proving that this extractor was invoked.</summary>
        public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
        {
            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
            yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);
        }

        /// <summary>Async counterpart of <see cref="Extract"/>; yields the same synthetic entry.</summary>
        public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
        {
            var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
            yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);

            // No real asynchronous work is needed; the await only keeps the async iterator warning-free.
            await Task.CompletedTask;
        }
    }

    [TestMethod]
    public void Constructor_WithCustomExtractors_RegistersExtractors()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new[] { customExtractor });

        Assert.AreEqual(1, extractor.CustomExtractors.Count);
    }

    [TestMethod]
    public void Constructor_WithMultipleCustomExtractors_RegistersAll()
    {
        var customExtractor1 = new TestCustomExtractor(null!);
        var customExtractor2 = new SecondTestCustomExtractor(null!);
        var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor1, customExtractor2 });

        Assert.AreEqual(2, extractor.CustomExtractors.Count);
    }

    [TestMethod]
    public void Constructor_WithNullInCollection_IgnoresNull()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor, null! });

        Assert.AreEqual(1, extractor.CustomExtractors.Count);
    }

    [TestMethod]
    public void Constructor_WithNullCollection_CreatesEmptyExtractor()
    {
        // The cast selects the IEnumerable<ICustomAsyncExtractor> constructor overload for the null argument.
        var extractor = new Extractor((IEnumerable<ICustomAsyncExtractor>)null!);

        Assert.AreEqual(0, extractor.CustomExtractors.Count);
    }

    [TestMethod]
    public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new[] { customExtractor });

        // Create a test file with the custom magic bytes
        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
        var results = extractor.Extract("test.custom", testData).ToList();

        Assert.AreEqual(1, results.Count);
        Assert.AreEqual("extracted_from_custom.txt", results[0].Name);

        // Read the content to verify it was processed by our custom extractor
        Assert.AreEqual("Extracted by TestCustomExtractor", ReadAllText(results[0]));
    }

    [TestMethod]
    public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new[] { customExtractor });

        // Create a test file with the custom magic bytes
        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
        var results = await extractor.ExtractAsync("test.custom", testData).ToListAsync();

        Assert.AreEqual(1, results.Count);
        Assert.AreEqual("extracted_from_custom.txt", results[0].Name);

        // Read the content to verify it was processed by our custom extractor
        Assert.AreEqual("Extracted by TestCustomExtractor", ReadAllText(results[0]));
    }

    [TestMethod]
    public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new[] { customExtractor });

        // Create a test file that doesn't match the custom magic bytes
        var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data");
        var results = extractor.Extract("test.txt", testData).ToList();

        // Should return the original file since no custom extractor matched
        Assert.AreEqual(1, results.Count);
        Assert.AreEqual("test.txt", results[0].Name);

        // Verify it's the original content
        Assert.AreEqual("NOTCUSTOM This is test data", ReadAllText(results[0]));
    }

    [TestMethod]
    public void Extract_MultipleCustomExtractors_UsesCorrectOne()
    {
        var extractor = new Extractor(new ICustomAsyncExtractor[]
        {
            new TestCustomExtractor(null!),
            new SecondTestCustomExtractor(null!)
        });

        // Test with first custom format
        var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data");
        var results1 = extractor.Extract("test1.custom", testData1).ToList();
        Assert.AreEqual(1, results1.Count);
        Assert.AreEqual("extracted_from_custom.txt", results1[0].Name);

        // Test with second custom format
        var testData2 = System.Text.Encoding.ASCII.GetBytes("CUSTOM2 data");
        var results2 = extractor.Extract("test2.custom", testData2).ToList();
        Assert.AreEqual(1, results2.Count);
        Assert.AreEqual("extracted_from_second_custom.txt", results2[0].Name);
    }

    [TestMethod]
    public void Extract_NoCustomExtractors_ReturnsOriginalFile()
    {
        var extractor = new Extractor();

        // Don't add any custom extractors
        var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
        var results = extractor.Extract("test.custom", testData).ToList();

        // Should return the original file since no custom extractor is registered
        Assert.AreEqual(1, results.Count);
        Assert.AreEqual("test.custom", results[0].Name);
    }

    [TestMethod]
    public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor()
    {
        var customExtractor = new TestCustomExtractor(null!);
        var extractor = new Extractor(new[] { customExtractor });

        // Test with a real ZIP file - should use built-in extractor, not custom
        var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip");

        // Report missing test data explicitly instead of letting the test pass without asserting anything.
        if (!File.Exists(path))
        {
            Assert.Inconclusive($"Test archive not found: {path}");
        }

        var results = extractor.Extract(path).ToList();

        // Should extract the ZIP normally, not use the custom extractor
        Assert.IsTrue(results.Count > 0);
        Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile")));
        Assert.IsFalse(results.Any(r => r.Name == "extracted_from_custom.txt"));
    }
}

0 commit comments

Comments
 (0)