Skip to content

Commit 36e8377

Browse files
committed
Remove known extensions from source document IDs
1 parent 9ff0f31 commit 36e8377

2 files changed

Lines changed: 20 additions & 1 deletion

File tree

src/archivist/services/sourceDocument.js

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
import path from 'path';
2+
3+
import mime from 'mime';
4+
15
export default class SourceDocument {
26
/**
37
* Represents a source document containing web content and metadata for extraction.
@@ -63,7 +67,11 @@ export default class SourceDocument {
6367
static generateId(location) {
6468
const ILLEGAL_CHARACTERS = /[\\:"<>|*?]/g; // Characters forbidden in filenames for cross-platform compatibility; see https://github.com/actions/toolkit/blob/main/packages/artifact/src/internal/upload/path-and-artifact-name-validation.ts
6569

66-
return decodeURIComponent(new URL(location).pathname)
70+
const pathname = decodeURIComponent(new URL(location).pathname);
71+
const extension = path.extname(pathname);
72+
const pathnameWithoutExtension = mime.getType(extension) ? pathname.slice(0, -extension.length) : pathname; // Remove file extension when it corresponds to a known MIME type, as the extension is not part of the document's identity but a web server implementation detail
73+
74+
return pathnameWithoutExtension
6775
.split('/')
6876
.filter(Boolean)
6977
.join('-')

src/archivist/services/sourceDocument.test.js

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -170,6 +170,17 @@ describe('SourceDocument', () => {
170170
expect(new SourceDocument({ location: 'https://example.com/terms%20of%20service' }).id).to.equal('terms of service');
171171
});
172172

173+
it('removes known file extension from URL pathname', () => {
174+
expect(new SourceDocument({ location: 'https://example.com/en.html' }).id).to.equal('en');
175+
});
176+
177+
it('removes only the last file extension', () => {
178+
expect(new SourceDocument({ location: 'https://example.com/terms.backup.html' }).id).to.equal('terms.backup');
179+
});
180+
181+
it('keeps unknown extension in URL pathname', () => {
182+
expect(new SourceDocument({ location: 'https://example.com/terms.of.service' }).id).to.equal('terms.of.service');
183+
});
173184
it('decodes URL-encoded characters before replacing illegal ones', () => {
174185
expect(new SourceDocument({ location: 'https://example.com/terms%3Aof%3Aservice' }).id).to.equal('terms_of_service');
175186
});

0 commit comments

Comments
 (0)