Skip to content

Commit e138786

Browse files
authored
Fix source document ID generation for filesystem compatibility (#1240)
2 parents 533f95f + 5e66ff1 commit e138786

3 files changed

Lines changed: 85 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased [patch]
6+
7+
> Development of this release was supported by [Reset Tech](https://www.reset.tech).
8+
9+
### Fixed
10+
11+
- Fix snapshot storage failure when source document URLs contain characters forbidden in filenames
12+
- Fix duplicate file extension in snapshot filenames when source document URLs end with an extension
13+
514
## 11.0.1 - 2026-04-06
615

716
> Development of this release was supported by [Reset Tech](https://www.reset.tech).

src/archivist/services/sourceDocument.js

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import path from 'path';
2+
3+
import mime from 'mime';
4+
15
export default class SourceDocument {
26
/**
37
* Represents a source document containing web content and metadata for extraction.
@@ -21,7 +25,7 @@ export default class SourceDocument {
2125
this.filters = filters;
2226
this.content = content;
2327
this.mimeType = mimeType;
24-
this.id = new URL(location).pathname.split('/').filter(Boolean).join('-');
28+
this.id = SourceDocument.generateId(location);
2529
}
2630

2731
get cssSelectors() {
@@ -60,6 +64,20 @@ export default class SourceDocument {
6064
return [selector];
6165
}
6266

67+
static generateId(location) {
68+
const ILLEGAL_CHARACTERS = /[\\:"<>|*?]/g; // Characters forbidden in filenames for cross-platform compatibility; see https://github.com/actions/toolkit/blob/main/packages/artifact/src/internal/upload/path-and-artifact-name-validation.ts
69+
70+
const pathname = decodeURIComponent(new URL(location).pathname);
71+
const extension = path.extname(pathname);
72+
const pathnameWithoutExtension = mime.getType(extension) ? pathname.slice(0, -extension.length) : pathname; // Remove file extension when it corresponds to a known MIME type, as the extension is not part of the document's identity but a web server implementation detail
73+
74+
return pathnameWithoutExtension
75+
.split('/')
76+
.filter(Boolean)
77+
.join('-')
78+
.replace(ILLEGAL_CHARACTERS, '_');
79+
}
80+
6381
toPersistence() {
6482
return {
6583
fetch: this.location,

src/archivist/services/sourceDocument.test.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,63 @@ describe('SourceDocument', () => {
157157
});
158158
});
159159

160+
describe('#generateId', () => {
  // Builds a source document for the given URL and returns the ID assigned by its constructor.
  const idOf = location => new SourceDocument({ location }).id;

  it('generates ID from URL pathname', () => {
    const id = idOf('https://example.com/legal/terms');

    expect(id).to.equal('legal-terms');
  });

  it('returns empty string for root URL', () => {
    const id = idOf('https://example.com/');

    expect(id).to.equal('');
  });

  it('decodes URL-encoded characters', () => {
    const id = idOf('https://example.com/terms%20of%20service');

    expect(id).to.equal('terms of service');
  });

  it('removes known file extension from URL pathname', () => {
    const id = idOf('https://example.com/en.html');

    expect(id).to.equal('en');
  });

  it('removes only the last file extension', () => {
    const id = idOf('https://example.com/terms.backup.html');

    expect(id).to.equal('terms.backup');
  });

  it('keeps unknown extension in URL pathname', () => {
    const id = idOf('https://example.com/terms.of.service');

    expect(id).to.equal('terms.of.service');
  });

  it('decodes URL-encoded characters before replacing illegal ones', () => {
    const id = idOf('https://example.com/terms%3Aof%3Aservice');

    expect(id).to.equal('terms_of_service');
  });

  context('replaces characters that are illegal in filenames for cross-platform compatibility', () => {
    // Characters written literally in the URL pathname, paired with the label used in the test title.
    const LITERAL_CHARACTERS = [
      [ ':', 'colon' ],
      [ '"', 'double quote' ],
      [ '<', 'less than' ],
      [ '>', 'greater than' ],
      [ '|', 'vertical bar' ],
      [ '*', 'asterisk' ],
    ];

    LITERAL_CHARACTERS.forEach(([ character, name ]) => {
      it(`replaces ${name} "${character}"`, () => {
        expect(idOf(`https://example.com/before${character}after`)).to.equal('before_after');
      });
    });

    // Characters written percent-encoded in the URL, paired with the label used in the test title.
    const ENCODED_CHARACTERS = [
      [ '%5C', 'backslash' ],
      [ '%3F', 'question mark' ],
    ];

    ENCODED_CHARACTERS.forEach(([ encoded, name ]) => {
      it(`replaces ${name} decoded from "${encoded}"`, () => {
        expect(idOf(`https://example.com/before${encoded}after`)).to.equal('before_after');
      });
    });
  });
});
216+
160217
describe('#toPersistence', () => {
161218
it('converts basic source document declarations into JSON representation', () => {
162219
const result = new SourceDocument({

0 commit comments

Comments
 (0)