From 92637ab248fc80b86c41e40f07d11b938d64e96b Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:31:06 +1000 Subject: [PATCH 01/11] feat: autolink bare domains via a curated TLD list --- .../core/src/converters/roundtrip.test.ts | 4 + packages/core/src/extensions/autolink.test.ts | 42 +++++++ .../src/extensions/inline-mark-plugin.test.ts | 23 ++++ .../inline-text-to-mark-chunks.test.ts | 106 +++++++++++++++++- .../extensions/inline-text-to-mark-chunks.ts | 3 + packages/core/src/lezer/autolink-tld.test.ts | 55 +++++++++ packages/core/src/lezer/autolink-tld.ts | 93 +++++++++++++++ packages/core/src/lezer/bare-autolink.ts | 97 ++++++++++++++++ packages/core/src/lezer/parser.ts | 8 +- 9 files changed, 426 insertions(+), 5 deletions(-) create mode 100644 packages/core/src/extensions/autolink.test.ts create mode 100644 packages/core/src/lezer/autolink-tld.test.ts create mode 100644 packages/core/src/lezer/autolink-tld.ts create mode 100644 packages/core/src/lezer/bare-autolink.ts diff --git a/packages/core/src/converters/roundtrip.test.ts b/packages/core/src/converters/roundtrip.test.ts index 936885c..d9195fd 100644 --- a/packages/core/src/converters/roundtrip.test.ts +++ b/packages/core/src/converters/roundtrip.test.ts @@ -75,6 +75,10 @@ describe('markdown round-trip is byte-identical', () => { 'mail me@example.com ok', 'a b', 'end https://example.com.', + // Bare domains autolink too, but stay plain text to the converters + 'see google.com here', + 'paths sub.domain.io/a/b?x=1 end', + 'not a link README.md here', '![cat](https://example.com/cat.png)', 'a ![one](https://example.com/1.png) b ![two](https://example.com/2.png) c', '![](https://www.youtube.com/watch?v=dQw4w9WgXcQ)', diff --git a/packages/core/src/extensions/autolink.test.ts b/packages/core/src/extensions/autolink.test.ts new file mode 100644 index 0000000..c3c4efa --- /dev/null +++ b/packages/core/src/extensions/autolink.test.ts @@ -0,0 +1,42 @@ +import { describe, expect, it } from 'vitest' 
+import { page } from 'vitest/browser' + +import { setupFixture } from '../testing/index.ts' + +const pmRoot = page.locate('.ProseMirror') + +describe('autolink rendering', () => { + it('renders a scheme autolink as a link', async () => { + using fixture = setupFixture() + const { n } = fixture + fixture.set(n.doc(n.paragraph('see https://example.com here'))) + await expect + .element(pmRoot.getByRole('link', { name: 'https://example.com' })) + .toBeInTheDocument() + }) + + it('renders a bare domain as a link', async () => { + using fixture = setupFixture() + const { n } = fixture + fixture.set(n.doc(n.paragraph('go to google.com now'))) + await expect.element(pmRoot.getByRole('link', { name: 'google.com' })).toBeInTheDocument() + }) + + // Locks the product decision: a link is never un-linked by moving the caret + // into it. The link keeps its blue `<a>` and stays editable. + it('keeps a scheme autolink a link when the caret is inside it', async () => { + using fixture = setupFixture() + const { n } = fixture + fixture.set(n.doc(n.paragraph('see https://example.com here'))) + await expect + .element(pmRoot.getByRole('link', { name: 'https://example.com' })) + .toBeInTheDocument() + }) + + it('keeps a bare domain a link when the caret is inside it', async () => { + using fixture = setupFixture() + const { n } = fixture + fixture.set(n.doc(n.paragraph('go to google.com now'))) + await expect.element(pmRoot.getByRole('link', { name: 'google.com' })).toBeInTheDocument() + }) +}) diff --git a/packages/core/src/extensions/inline-mark-plugin.test.ts b/packages/core/src/extensions/inline-mark-plugin.test.ts index 20a3f0a..f6fc888 100644 --- a/packages/core/src/extensions/inline-mark-plugin.test.ts +++ b/packages/core/src/extensions/inline-mark-plugin.test.ts @@ -66,6 +66,29 @@ describe('inlineMarkPlugin', () => { expect(linkText!.attrs.href).toBe('https://example.com') }) + it('applies mdLinkText with an https href on a bare domain', () => { + using fixture = setupFixture() 
+ const { n } = fixture + const doc = n.doc(n.paragraph('visit google.com now')) + fixture.set(doc) + + const pos = findText(fixture.doc, 'google.com') + const $pos = fixture.doc.resolve(pos + 1) + const linkText = $pos.marks().find((m) => m.type.name === 'mdLinkText') + expect(linkText).toBeTruthy() + expect(linkText!.attrs.href).toBe('https://google.com') + }) + + it('leaves a bare host off the TLD list as plain text', () => { + using fixture = setupFixture() + const { n } = fixture + const doc = n.doc(n.paragraph('open README.md now')) + fixture.set(doc) + + const pos = findText(fixture.doc, 'README.md') + expect(marksAt(fixture.doc, pos + 1)).toEqual([]) + }) + it('marks `*foo*` inside headings as well', () => { using fixture = setupFixture() const { n } = fixture diff --git a/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts b/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts index 9a74974..8f0ccdb 100644 --- a/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts +++ b/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts @@ -315,11 +315,113 @@ describe('inlineTextToMarkChunks', () => { `) }) - it('does not autolink a schemeless host', () => { + it('autolinks a bare domain on the curated TLD list', () => { const chunks = inlineTextToMarkChunks(markBuilders, 'a example.com b') expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` " - 0-15: - + 0-2: - + 2-13: mdLinkText(href=https://example.com) + 13-15: - + " + `) + }) + + it('does not autolink a bare host whose TLD is off the list', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'a README.md b') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-13: - + " + `) + }) + + it('bare-autolinks a domain that starts the text', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'google.com') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-10: mdLinkText(href=https://google.com) + " + `) + }) + + 
it('bare-autolinks a domain with a path, keeping the path in the href', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'sub.domain.io/path?q=1') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-22: mdLinkText(href=https://sub.domain.io/path?q=1) + " + `) + }) + + it('preserves case in the bare-autolink href', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'GOOGLE.COM') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-10: mdLinkText(href=https://GOOGLE.COM) + " + `) + }) + + it('excludes a trailing period from a bare autolink', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'Visit google.com.') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-6: - + 6-16: mdLinkText(href=https://google.com) + 16-17: - + " + `) + }) + + it('does not bare-autolink a code-file name', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'edit node.js then') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-17: - + " + `) + }) + + it('claims a www. 
autolink as one chunk, not a nested bare domain', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'www.example.com') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-15: mdLinkText(href=https://www.example.com) + " + `) + }) + + it('does not bare-autolink the label of an explicit link', () => { + const chunks = inlineTextToMarkChunks(markBuilders, '[google.com](http://x)') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-1: mdLinkText(href=http://x) + mdMark + 1-11: mdLinkText(href=http://x) + 11-13: mdMark + 13-21: mdLinkUri + 21-22: mdMark + " + `) + }) + + it('does not bare-autolink inside inline code', () => { + const chunks = inlineTextToMarkChunks(markBuilders, '`see google.com`') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-1: mdCode + mdMark + 1-15: mdCode + 15-16: mdCode + mdMark + " + `) + }) + + it('does not bare-autolink a domain after an @ (it is an email)', () => { + const chunks = inlineTextToMarkChunks(markBuilders, 'mail a@google.com here') + expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` + " + 0-5: - + 5-17: mdLinkText(href=mailto:a@google.com) + 17-22: - " `) }) diff --git a/packages/core/src/extensions/inline-text-to-mark-chunks.ts b/packages/core/src/extensions/inline-text-to-mark-chunks.ts index e86180e..2030e31 100644 --- a/packages/core/src/extensions/inline-text-to-mark-chunks.ts +++ b/packages/core/src/extensions/inline-text-to-mark-chunks.ts @@ -1,5 +1,6 @@ import type { Mark } from '@prosekit/pm/model' +import { hostFromUrl, isLinkableBareHost } from '../lezer/autolink-tld.ts' import type { InlineElement } from '../lezer/inline.ts' import { parseInline } from '../lezer/inline.ts' import { LEZER_NODE_IDS } from '../lezer/node-ids.ts' @@ -59,12 +60,14 @@ export function inlineTextToMarkChunks( * - a URL with a scheme is used as-is * - an email becomes `mailto:` * - a `www.` URL gets an implied `https://` + * - a bare domain on the curated TLD list gets an 
implied `https://` * - anything else returns `undefined` */ function getAutolinkHref(urlText: string): string | undefined { if (/^[a-z][a-z0-9+.-]*:/i.test(urlText)) return urlText if (/^[^\s@]+@[^\s@]+$/.test(urlText)) return `mailto:${urlText}` if (/^www\./i.test(urlText)) return `https://${urlText}` + if (isLinkableBareHost(hostFromUrl(urlText))) return `https://${urlText}` return undefined } diff --git a/packages/core/src/lezer/autolink-tld.test.ts b/packages/core/src/lezer/autolink-tld.test.ts new file mode 100644 index 0000000..6c56072 --- /dev/null +++ b/packages/core/src/lezer/autolink-tld.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from 'vitest' + +import { hostFromUrl, isLinkableBareHost } from './autolink-tld.ts' + +describe('hostFromUrl', () => { + it('returns the whole string when there is no path', () => { + expect(hostFromUrl('google.com')).toBe('google.com') + }) + + it('strips the path', () => { + expect(hostFromUrl('sub.domain.io/path?q=1')).toBe('sub.domain.io') + }) +}) + +describe('isLinkableBareHost', () => { + const linkable = [ + 'google.com', + 'example.org', + 'sub.domain.io', + 'a-b.example.com', + 'GOOGLE.COM', + 'm.google.com', + ] + for (const host of linkable) { + it(`links ${host}`, () => { + expect(isLinkableBareHost(host)).toBe(true) + }) + } + + const rejected = [ + 'README.md', // md excluded + 'deploy.sh', // sh excluded + 'main.rs', // rs excluded + 'script.pl', // pl excluded + 'node.js', // js not a tld + 'index.html', // html not a tld + 'file.txt', // txt not a tld + 'Cargo.toml', // toml not a tld + 'package.json', // json not a tld + 'etc', // single label + 't.co', // 1-char host + 'x.io', // 1-char host + 'do.so', // so not a tld and host < 3 anyway + '1.2.3.4', // last label not a tld + 'v1.2', // last label not a tld + '192.168.0.1', // last label not a tld + '-bad.com', // leading hyphen label + 'bad-.com', // trailing hyphen label + ] + for (const host of rejected) { + it(`rejects ${host}`, () => { + 
expect(isLinkableBareHost(host)).toBe(false) + }) + } +}) diff --git a/packages/core/src/lezer/autolink-tld.ts b/packages/core/src/lezer/autolink-tld.ts new file mode 100644 index 0000000..f5ea951 --- /dev/null +++ b/packages/core/src/lezer/autolink-tld.ts @@ -0,0 +1,93 @@ +/** + * Curated, tunable list of TLDs that meowdown autolinks when they appear in a + * bare domain (no scheme, no `www.`). It deliberately omits TLDs that double as + * common code-file extensions even though they are real ccTLDs: `md` (markdown), + * `sh` (shell), `pl` (perl), `rs` (rust). Those still autolink behind a + * `www.`/scheme prefix, just not bare, so `README.md` and `deploy.sh` stay + * plain text. + */ +const BARE_AUTOLINK_TLDS: ReadonlySet = new Set([ + // generic + 'com', + 'org', + 'net', + 'edu', + 'gov', + 'mil', + 'int', + 'info', + 'biz', + // popular new gTLDs + 'io', + 'co', + 'ai', + 'app', + 'dev', + 'me', + 'xyz', + 'online', + 'site', + 'tech', + 'blog', + 'shop', + 'store', + 'cloud', + 'page', + 'wiki', + // common ccTLDs used as vanity / real sites + 'us', + 'uk', + 'ca', + 'de', + 'fr', + 'jp', + 'cn', + 'au', + 'in', + 'ru', + 'br', + 'eu', + 'nl', + 'es', + 'it', + 'ch', + 'se', + 'kr', +]) + +// A single DNS label: alphanumeric, hyphens allowed inside but not at the edges. +const LABEL_RE = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$/i + +/** The host portion of a bare candidate: everything before the first `/`. */ +export function hostFromUrl(text: string): string { + const slash = text.indexOf('/') + return slash === -1 ? text : text.slice(0, slash) +} + +/** + * True when `host` (no scheme, no `@`, path already stripped) is a bare domain + * meowdown links. 
Rules: + * + * - at least two dot-separated labels (host + tld) + * - the last label is in `BARE_AUTOLINK_TLDS` (matched case-insensitively) + * - the registrable label (the one before the tld) is at least 3 chars, so + * `t.co` / `x.io` / `do.so` stay plain text + * - every label is a valid DNS label (alphanumeric, inner hyphens only, <= 63 + * chars), which also rejects IP-like input such as `1.2.3.4` because its last + * label is not a known tld + */ +export function isLinkableBareHost(host: string): boolean { + const labels = host.split('.') + if (labels.length < 2) return false + + const tld = labels[labels.length - 1].toLowerCase() + if (!BARE_AUTOLINK_TLDS.has(tld)) return false + + const registrable = labels[labels.length - 2] + if (registrable.length < 3) return false + + for (const label of labels) { + if (label.length > 63 || !LABEL_RE.test(label)) return false + } + return true +} diff --git a/packages/core/src/lezer/bare-autolink.ts b/packages/core/src/lezer/bare-autolink.ts new file mode 100644 index 0000000..1fca7f7 --- /dev/null +++ b/packages/core/src/lezer/bare-autolink.ts @@ -0,0 +1,97 @@ +import type { MarkdownConfig } from '@lezer/markdown' + +import { + CHAR_0, + CHAR_9, + CHAR_HYPHEN_MINUS, + CHAR_LOWERCASE_A, + CHAR_LOWERCASE_Z, + CHAR_UPPERCASE_A, + CHAR_UPPERCASE_Z, +} from '../unicode.ts' + +import { hostFromUrl, isLinkableBareHost } from './autolink-tld.ts' + +// A domain (one or more labels, at least one dot) plus an optional path. +const DOMAIN_RE = /^[a-z0-9-]+(?:\.[a-z0-9-]+)+(?:\/[^\s<]*)?/i + +// Chars that may sit immediately before a bare autolink: whitespace or one of +// `( * _ ~`. Mirrors GFM's "start of line, after whitespace, or one of these" +// boundary rule. A `.`, `-`, alphanumeric, or `@` before the match means we are +// mid-word or mid-email, so no autolink starts there. 
+const BOUNDARY_BEFORE_RE = /[\s(*_~]/ + +function isDomainStartChar(code: number): boolean { + return ( + (code >= CHAR_0 && code <= CHAR_9) || + (code >= CHAR_UPPERCASE_A && code <= CHAR_UPPERCASE_Z) || + (code >= CHAR_LOWERCASE_A && code <= CHAR_LOWERCASE_Z) || + code === CHAR_HYPHEN_MINUS + ) +} + +function countChar(text: string, end: number, ch: string): number { + let count = 0 + for (let i = 0; i < end; i++) { + if (text[i] === ch) count++ + } + return count +} + +// Trailing-punctuation trimming, ported from `@lezer/markdown`'s GFM autolink so +// a bare domain ending a sentence drops the `.`/`,`/`)` etc. but keeps interior +// punctuation. Returns the kept length of `matched`. +function trimAutolinkEnd(matched: string): number { + let end = matched.length + for (;;) { + const last = matched[end - 1] + if ( + /[?!.,:*_~]/.test(last) || + (last === ')' && countChar(matched, end, ')') > countChar(matched, end, '(')) + ) { + end-- + } else if (last === ';') { + const entity = /&(?:#\d+|#x[a-f\d]+|\w+);$/.exec(matched.slice(0, end)) + if (!entity) break + end = entity.index + } else { + break + } + } + return end +} + +/** + * Inline parser for a bare domain autolink such as `google.com` or + * `sub.domain.io/path` (no scheme, no `www.`). It runs after GFM's own + * `Autolink` so `www.`/scheme/email forms are claimed first and never reach + * here. The domain must pass `isLinkableBareHost` (a curated TLD list plus + * shape rules), which keeps `node.js`, `README.md`, and `i.e.` plain text. It + * emits the shared `URL` node, so the existing mark walk renders it like any + * other autolink. 
+ */ +export const bareAutolink: MarkdownConfig = { + parseInline: [ + { + name: 'BareAutolink', + before: 'Link', + parse(cx, next, pos) { + if (!isDomainStartChar(next) || cx.hasOpenLink) return -1 + + const before = cx.slice(pos - 1, pos) + if (before !== '' && !BOUNDARY_BEFORE_RE.test(before)) return -1 + + const match = DOMAIN_RE.exec(cx.slice(pos, cx.end)) + if (!match) return -1 + + const length = trimAutolinkEnd(match[0]) + if (length === 0) return -1 + + const text = match[0].slice(0, length) + if (!isLinkableBareHost(hostFromUrl(text))) return -1 + + return cx.addElement(cx.elt('URL', pos, pos + length)) + }, + }, + ], +} diff --git a/packages/core/src/lezer/parser.ts b/packages/core/src/lezer/parser.ts index 054452b..da5f9d5 100644 --- a/packages/core/src/lezer/parser.ts +++ b/packages/core/src/lezer/parser.ts @@ -1,5 +1,6 @@ import { GFM, type InlineContext, parser as defaultParser } from '@lezer/markdown' +import { bareAutolink } from './bare-autolink.ts' import { hashtag } from './hashtag.ts' import { wikilink } from './wikilink.ts' @@ -16,10 +17,11 @@ function consumeAllInline(cx: InlineContext): number { /** * `@lezer/markdown` parser configured with GFM (table, strikethrough, - * task list, autolink) plus meowdown's `Hashtag` and `Wikilink` inline - * syntax. Use when both block and inline structure must be recognized. + * task list, autolink) plus meowdown's `Hashtag`, `Wikilink`, and bare + * domain autolink inline syntax. Use when both block and inline structure + * must be recognized. 
*/ -export const gfmParser = defaultParser.configure([GFM, hashtag, wikilink]) +export const gfmParser = defaultParser.configure([GFM, hashtag, wikilink, bareAutolink]) /** * `@lezer/markdown` parser configured with GFM plus a `SkipInline` From b053b70bd3040e345578007799cb542e58105b26 Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:31:06 +1000 Subject: [PATCH 02/11] docs: document bare-domain autolinking --- packages/core/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/core/README.md b/packages/core/README.md index bdc6742..8482a1f 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -59,6 +59,8 @@ Wikilinks (`[[target]]`) render with a dashed underline via the `.md-wikilink` c Markdown links (`[text](url)`) render the label as an `<a>` with the `.md-link` class, colored by `--meowdown-accent`; the `[`, `]`, and `(url)` syntax dims in show mode and hides in hide and focus modes. Wire click handling with `defineLinkClickHandler(({ href, event }) => ...)` (or `@meowdown/react`'s `onLinkClick` prop). A plain click inside a link the caret already sits in just places the caret; `Mod`-click always fires. +Bare URLs autolink without `[text](url)` brackets and share the same `.md-link` rendering and click handling: a scheme URL (`https://example.com`), an angle autolink (`<https://example.com>`), a `www.` host (`www.example.com`), an email (`me@example.com`), and a bare domain (`google.com`, `sub.domain.com/path`). Bare domains are matched against a curated list of common TLDs, so file names and prose keep their dots without linkifying (`README.md`, `node.js`, `i.e.` stay plain text); reach for `[text](url)` or `<url>` to link anything off that list. Autolinks are derived live from the text, so editing one re-evaluates it; the caret sitting inside a link never un-links it. + Inline images (`![alt](src)`) stay literal text and render in place via a mark view, with the raw `![alt](src)` hidden in hide and focus modes. 
Add it with `defineImage({ resolveImageUrl, onImagePaste })` (or `@meowdown/react`'s image props). `resolveImageUrl` is optional and defaults to showing http(s) URLs as-is. Wire click handling with `defineImageClickHandler(({ src, alt, event }) => ...)` (or `@meowdown/react`'s `onImageClick` prop). Pasting a lone tweet or YouTube link can auto-embed it. `defineEmbedPaste()` (or `@meowdown/react`'s `embedPaste` prop) rewrites the pasted link to `![](url)` so it renders as an embed; one undo turns the embed back into the raw link. It is not part of `defineEditorExtension`; add it explicitly. From 565a98946d1365092274f9875b29c49575dadf8a Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:42:23 +1000 Subject: [PATCH 03/11] fix --- packages/core/src/lezer/bare-autolink.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/core/src/lezer/bare-autolink.ts b/packages/core/src/lezer/bare-autolink.ts index 1fca7f7..86fc31c 100644 --- a/packages/core/src/lezer/bare-autolink.ts +++ b/packages/core/src/lezer/bare-autolink.ts @@ -30,6 +30,7 @@ function isDomainStartChar(code: number): boolean { ) } +// Ported from https://code.haverbeke.berlin/lezer/markdown/src/commit/1.6.4/src/extension.ts#L173-L177 function countChar(text: string, end: number, ch: string): number { let count = 0 for (let i = 0; i < end; i++) { @@ -38,9 +39,11 @@ function countChar(text: string, end: number, ch: string): number { return count } -// Trailing-punctuation trimming, ported from `@lezer/markdown`'s GFM autolink so -// a bare domain ending a sentence drops the `.`/`,`/`)` etc. but keeps interior -// punctuation. Returns the kept length of `matched`. +// Trailing-punctuation trimming, so a bare domain ending a sentence +// drops the `.` / `,` / `)` etc. but keeps interior punctuation. +// Returns the kept length of `matched`. 
+// +// Ported from https://code.haverbeke.berlin/lezer/markdown/src/commit/1.6.4/src/extension.ts#L179-L195 function trimAutolinkEnd(matched: string): number { let end = matched.length for (;;) { From 08e136fd936d8e6d6fdce8066359a0c6e53bb94b Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:46:21 +1000 Subject: [PATCH 04/11] test: cover the bare autolink parser directly --- packages/core/src/lezer/bare-autolink.test.ts | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 packages/core/src/lezer/bare-autolink.test.ts diff --git a/packages/core/src/lezer/bare-autolink.test.ts b/packages/core/src/lezer/bare-autolink.test.ts new file mode 100644 index 0000000..fd8df73 --- /dev/null +++ b/packages/core/src/lezer/bare-autolink.test.ts @@ -0,0 +1,96 @@ +import { describe, expect, it } from 'vitest' + +import { collectInlineElements, parseInline } from './inline.ts' +import { LEZER_NODE_IDS } from './node-ids.ts' + +/** Every `URL` node the parser emits for `text`, as `[from, to, slice]`. 
*/ +function urls(text: string): Array<[number, number, string]> { + const elements = parseInline(text) + const nodes = collectInlineElements(elements, (node) => node.type === LEZER_NODE_IDS.URL) + return nodes.map((node) => [node.from, node.to, text.slice(node.from, node.to)]) +} + +describe('bareAutolink', () => { + describe('detects a bare domain', () => { + it('at the start of the text', () => { + expect(urls('google.com')).toEqual([[0, 10, 'google.com']]) + }) + + it('after whitespace', () => { + expect(urls('visit google.com now')).toEqual([[6, 16, 'google.com']]) + }) + + it('with a subdomain and a path', () => { + expect(urls('sub.domain.io/path?q=1')).toEqual([[0, 22, 'sub.domain.io/path?q=1']]) + }) + + it('right after an opening paren', () => { + expect(urls('(google.com)')).toEqual([[1, 11, 'google.com']]) + }) + }) + + describe('ignores text that is not a linkable bare domain', () => { + for (const text of [ + 'node.js', + 'README.md', + 'deploy.sh', + 'file.txt', + 'i.e.', + 't.co', + '1.2.3.4', + 'v1.2', + ]) { + it(text, () => { + expect(urls(text)).toEqual([]) + }) + } + }) + + describe('trims trailing punctuation', () => { + it('drops a sentence-ending period', () => { + expect(urls('Visit google.com.')).toEqual([[6, 16, 'google.com']]) + }) + + it('drops a trailing comma', () => { + expect(urls('google.com, then more')).toEqual([[0, 10, 'google.com']]) + }) + + it('drops an unbalanced closing paren', () => { + expect(urls('(google.com/foo)')).toEqual([[1, 15, 'google.com/foo']]) + }) + + it('keeps balanced parens inside the path', () => { + expect(urls('google.com/foo(bar)')).toEqual([[0, 19, 'google.com/foo(bar)']]) + }) + + it('drops a trailing entity reference', () => { + expect(urls('google.com/a&amp;')).toEqual([[0, 12, 'google.com/a']]) + }) + }) + + describe('does not start mid-token or re-split other autolinks', () => { + it('treats an @ host as an email, not a bare domain', () => { + expect(urls('a@google.com')).toEqual([[0, 12, 
'a@google.com']]) + }) + + it('leaves a www. autolink as a single URL', () => { + expect(urls('www.example.com')).toEqual([[0, 15, 'www.example.com']]) + }) + + it('leaves a scheme autolink as a single URL', () => { + expect(urls('https://example.com')).toEqual([[0, 19, 'https://example.com']]) + }) + + it('does not link the label of an explicit link', () => { + expect(urls('[google.com](http://x)')).toEqual([[13, 21, 'http://x']]) + }) + + it('does not link inside inline code', () => { + expect(urls('`google.com`')).toEqual([]) + }) + + it('does not link inside a wikilink', () => { + expect(urls('[[google.com]]')).toEqual([]) + }) + }) +}) From 93b742112ec92141e27c76d32d9f711d4dbb53aa Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:46:45 +1000 Subject: [PATCH 05/11] refactor --- packages/core/src/lezer/autolink-tld.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/core/src/lezer/autolink-tld.ts b/packages/core/src/lezer/autolink-tld.ts index f5ea951..47614d8 100644 --- a/packages/core/src/lezer/autolink-tld.ts +++ b/packages/core/src/lezer/autolink-tld.ts @@ -56,7 +56,7 @@ const BARE_AUTOLINK_TLDS: ReadonlySet = new Set([ ]) // A single DNS label: alphanumeric, hyphens allowed inside but not at the edges. -const LABEL_RE = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$/i +const DNS_LABEL_RE = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$/i /** The host portion of a bare candidate: everything before the first `/`. 
*/ export function hostFromUrl(text: string): string { @@ -87,7 +87,7 @@ export function isLinkableBareHost(host: string): boolean { if (registrable.length < 3) return false for (const label of labels) { - if (label.length > 63 || !LABEL_RE.test(label)) return false + if (label.length > 63 || !DNS_LABEL_RE.test(label)) return false } return true } From 866a2de1b06f0a1e189417f1dbf099b18e91bffb Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 01:49:29 +1000 Subject: [PATCH 06/11] fix --- packages/core/src/lezer/autolink-tld.ts | 67 ++++++++----------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/packages/core/src/lezer/autolink-tld.ts b/packages/core/src/lezer/autolink-tld.ts index 47614d8..99739b8 100644 --- a/packages/core/src/lezer/autolink-tld.ts +++ b/packages/core/src/lezer/autolink-tld.ts @@ -1,59 +1,36 @@ /** - * Curated, tunable list of TLDs that meowdown autolinks when they appear in a - * bare domain (no scheme, no `www.`). It deliberately omits TLDs that double as - * common code-file extensions even though they are real ccTLDs: `md` (markdown), - * `sh` (shell), `pl` (perl), `rs` (rust). Those still autolink behind a - * `www.`/scheme prefix, just not bare, so `README.md` and `deploy.sh` stay - * plain text. + * Allowed TLDs when they appear in a bare domain (no scheme, no `www.`). 
+ * + * Source: Top 25 TLDs by domain count https://research.domaintools.com/statistics/tld-counts/ + * */ const BARE_AUTOLINK_TLDS: ReadonlySet = new Set([ - // generic 'com', - 'org', - 'net', - 'edu', - 'gov', - 'mil', - 'int', - 'info', - 'biz', - // popular new gTLDs - 'io', - 'co', - 'ai', - 'app', - 'dev', - 'me', - 'xyz', - 'online', - 'site', - 'tech', - 'blog', - 'shop', - 'store', - 'cloud', - 'page', - 'wiki', - // common ccTLDs used as vanity / real sites - 'us', - 'uk', - 'ca', 'de', - 'fr', - 'jp', + 'net', 'cn', - 'au', - 'in', + 'org', + 'uk', + 'xyz', + 'top', + 'nl', 'ru', + 'info', 'br', + 'fr', + 'au', + 'shop', 'eu', - 'nl', - 'es', + 'ca', + 'in', + 'online', 'it', + 'co', 'ch', - 'se', - 'kr', -]) + 'pl', + 'cc', + 'es', + ]) // A single DNS label: alphanumeric, hyphens allowed inside but not at the edges. const DNS_LABEL_RE = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$/i From 0e086a0b2a80cfd0cadef478589ae556b9f85fbb Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Thu, 18 Jun 2026 15:52:24 +0000 Subject: [PATCH 07/11] [autofix.ci] apply automated fixes --- packages/core/src/lezer/autolink-tld.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/src/lezer/autolink-tld.ts b/packages/core/src/lezer/autolink-tld.ts index 99739b8..7f3ccd7 100644 --- a/packages/core/src/lezer/autolink-tld.ts +++ b/packages/core/src/lezer/autolink-tld.ts @@ -30,7 +30,7 @@ const BARE_AUTOLINK_TLDS: ReadonlySet = new Set([ 'pl', 'cc', 'es', - ]) +]) // A single DNS label: alphanumeric, hyphens allowed inside but not at the edges. 
const DNS_LABEL_RE = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$/i From 641075e1476dfa8eb1f9d4508c84b10bf90c8392 Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 02:29:44 +1000 Subject: [PATCH 08/11] fix --- packages/core/src/lezer/autolink-tld.ts | 29 ++++++------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/packages/core/src/lezer/autolink-tld.ts b/packages/core/src/lezer/autolink-tld.ts index 7f3ccd7..9a3a51b 100644 --- a/packages/core/src/lezer/autolink-tld.ts +++ b/packages/core/src/lezer/autolink-tld.ts @@ -1,35 +1,20 @@ /** * Allowed TLDs when they appear in a bare domain (no scheme, no `www.`). * - * Source: Top 25 TLDs by domain count https://research.domaintools.com/statistics/tld-counts/ - * + * The 10 most-visited TLDs by real Chrome traffic. + * Source: Chrome UX Report https://github.com/zakird/crux-top-lists */ const BARE_AUTOLINK_TLDS: ReadonlySet = new Set([ 'com', - 'de', + 'br', 'net', - 'cn', + 'jp', 'org', - 'uk', - 'xyz', - 'top', - 'nl', - 'ru', - 'info', - 'br', - 'fr', - 'au', - 'shop', - 'eu', - 'ca', 'in', - 'online', + 'de', + 'ru', 'it', - 'co', - 'ch', - 'pl', - 'cc', - 'es', + 'fr', ]) // A single DNS label: alphanumeric, hyphens allowed inside but not at the edges. 
From 5f6dbeb52ed2eb32faac92c9f268f16a1111d6da Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 02:32:24 +1000 Subject: [PATCH 09/11] test: align autolink tests with the CrUX top-10 TLD list --- packages/core/src/converters/roundtrip.test.ts | 2 +- .../src/extensions/inline-text-to-mark-chunks.test.ts | 4 ++-- packages/core/src/lezer/autolink-tld.test.ts | 11 ++++++----- packages/core/src/lezer/bare-autolink.test.ts | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/packages/core/src/converters/roundtrip.test.ts b/packages/core/src/converters/roundtrip.test.ts index d9195fd..0fbbb68 100644 --- a/packages/core/src/converters/roundtrip.test.ts +++ b/packages/core/src/converters/roundtrip.test.ts @@ -77,7 +77,7 @@ describe('markdown round-trip is byte-identical', () => { 'end https://example.com.', // Bare domains autolink too, but stay plain text to the converters 'see google.com here', - 'paths sub.domain.io/a/b?x=1 end', + 'paths sub.domain.net/a/b?x=1 end', 'not a link README.md here', '![cat](https://example.com/cat.png)', 'a ![one](https://example.com/1.png) b ![two](https://example.com/2.png) c', diff --git a/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts b/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts index 8f0ccdb..98310ea 100644 --- a/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts +++ b/packages/core/src/extensions/inline-text-to-mark-chunks.test.ts @@ -345,10 +345,10 @@ describe('inlineTextToMarkChunks', () => { }) it('bare-autolinks a domain with a path, keeping the path in the href', () => { - const chunks = inlineTextToMarkChunks(markBuilders, 'sub.domain.io/path?q=1') + const chunks = inlineTextToMarkChunks(markBuilders, 'sub.domain.com/path?q=1') expect(foramtMarkChunks(chunks)).toMatchInlineSnapshot(` " - 0-22: mdLinkText(href=https://sub.domain.io/path?q=1) + 0-23: mdLinkText(href=https://sub.domain.com/path?q=1) " `) }) diff --git 
a/packages/core/src/lezer/autolink-tld.test.ts b/packages/core/src/lezer/autolink-tld.test.ts index 6c56072..7e7d306 100644 --- a/packages/core/src/lezer/autolink-tld.test.ts +++ b/packages/core/src/lezer/autolink-tld.test.ts @@ -8,7 +8,7 @@ describe('hostFromUrl', () => { }) it('strips the path', () => { - expect(hostFromUrl('sub.domain.io/path?q=1')).toBe('sub.domain.io') + expect(hostFromUrl('sub.domain.com/path?q=1')).toBe('sub.domain.com') }) }) @@ -16,7 +16,7 @@ describe('isLinkableBareHost', () => { const linkable = [ 'google.com', 'example.org', - 'sub.domain.io', + 'cdn.example.net', 'a-b.example.com', 'GOOGLE.COM', 'm.google.com', @@ -38,9 +38,10 @@ describe('isLinkableBareHost', () => { 'Cargo.toml', // toml not a tld 'package.json', // json not a tld 'etc', // single label - 't.co', // 1-char host - 'x.io', // 1-char host - 'do.so', // so not a tld and host < 3 anyway + 'page.io', // io is a real TLD but not in the curated list + 'corp.co', // co is a real TLD but excluded on purpose + 'ab.com', // 2-char registrable host (com is in the list) + 'x.org', // 1-char registrable host (org is in the list) '1.2.3.4', // last label not a tld 'v1.2', // last label not a tld '192.168.0.1', // last label not a tld diff --git a/packages/core/src/lezer/bare-autolink.test.ts b/packages/core/src/lezer/bare-autolink.test.ts index fd8df73..bab5944 100644 --- a/packages/core/src/lezer/bare-autolink.test.ts +++ b/packages/core/src/lezer/bare-autolink.test.ts @@ -21,7 +21,7 @@ describe('bareAutolink', () => { }) it('with a subdomain and a path', () => { - expect(urls('sub.domain.io/path?q=1')).toEqual([[0, 22, 'sub.domain.io/path?q=1']]) + expect(urls('sub.domain.com/path?q=1')).toEqual([[0, 23, 'sub.domain.com/path?q=1']]) }) it('right after an opening paren', () => { From 628bf6a5751d33f497dfbee582d605ebbeae0130 Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 02:33:19 +1000 Subject: [PATCH 10/11] add retry --- packages/core/vitest.config.ts | 2 ++ 
packages/react/vitest.config.ts | 2 ++ 2 files changed, 4 insertions(+) diff --git a/packages/core/vitest.config.ts b/packages/core/vitest.config.ts index 937c8fd..d51d361 100644 --- a/packages/core/vitest.config.ts +++ b/packages/core/vitest.config.ts @@ -3,6 +3,8 @@ import { defineProject } from 'vitest/config' export default defineProject({ test: { + retry: process.env.CI ? 3 : 0, + bail: process.env.CI ? 0 : 1, browser: { enabled: true, viewport: { diff --git a/packages/react/vitest.config.ts b/packages/react/vitest.config.ts index faa2750..274a587 100644 --- a/packages/react/vitest.config.ts +++ b/packages/react/vitest.config.ts @@ -5,6 +5,8 @@ import { defineProject } from 'vitest/config' export default defineProject({ plugins: [playwrightCommands()], test: { + retry: process.env.CI ? 3 : 0, + bail: process.env.CI ? 0 : 1, browser: { enabled: true, viewport: { From 490d0677d4591c0ada94ed877f4a8b6960f95f3e Mon Sep 17 00:00:00 2001 From: ocavue Date: Fri, 19 Jun 2026 02:35:29 +1000 Subject: [PATCH 11/11] style: fix vitest retry config indentation --- packages/react/vitest.config.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/react/vitest.config.ts b/packages/react/vitest.config.ts index 274a587..3480039 100644 --- a/packages/react/vitest.config.ts +++ b/packages/react/vitest.config.ts @@ -5,8 +5,8 @@ import { defineProject } from 'vitest/config' export default defineProject({ plugins: [playwrightCommands()], test: { - retry: process.env.CI ? 3 : 0, - bail: process.env.CI ? 0 : 1, + retry: process.env.CI ? 3 : 0, + bail: process.env.CI ? 0 : 1, browser: { enabled: true, viewport: {