Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 49 additions & 10 deletions src/specify_cli/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,12 @@ def render_toml_command(self, frontmatter: dict, body: str, source_id: str) -> s
# ``C:\\Users\\...`` whose ``\\U`` reads as an invalid unicode escape) would
# produce unparseable TOML — route those to the *literal* form ('''...'''),
# which does not process escapes, or to the escaped basic string.
if '"""' not in body and "\\" not in body:
# Control characters (U+0000–U+001F except tab/newline, U+007F) and a bare
# CR are illegal in every TOML string form, so a body containing them must
# go to the escaped basic string regardless of which delimiters it uses.
if self._has_illegal_toml_control(body):
toml_lines.append(f"prompt = {self._render_basic_toml_string(body)}")
elif '"""' not in body and "\\" not in body:
toml_lines.append('prompt = """')
toml_lines.append(body)
toml_lines.append('"""')
Expand All @@ -256,17 +261,51 @@ def render_toml_command(self, frontmatter: dict, body: str, source_id: str) -> s

return "\n".join(toml_lines)

@staticmethod
def _has_illegal_toml_control(value: str) -> bool:
"""True if *value* has a character TOML forbids in strings.

TOML bans control characters (U+0000–U+001F except tab and newline, plus
U+007F) in every string form, and a bare CR that is not part of a CRLF
pair. Such a value cannot be emitted raw into any multiline string.
"""
length = len(value)
for i, ch in enumerate(value):
code = ord(ch)
if ch == "\r":
if i + 1 < length and value[i + 1] == "\n":
continue
return True
if (code < 0x20 and ch not in ("\t", "\n")) or code == 0x7F:
return True
return False

@staticmethod
def _render_basic_toml_string(value: str) -> str:
"""Render *value* as a TOML basic string literal."""
escaped = (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
)
return f'"{escaped}"'
"""Render *value* as a TOML basic string literal.

Escapes the delimiter and backslash, the shorthand escapes (\\n, \\r,
\\t), and any remaining control character (U+0000–U+001F, U+007F) as a
``\\uXXXX`` sequence so the result is always valid TOML.
"""
out = []
for ch in value:
code = ord(ch)
if ch == "\\":
out.append("\\\\")
elif ch == '"':
out.append('\\"')
elif ch == "\n":
out.append("\\n")
elif ch == "\r":
out.append("\\r")
elif ch == "\t":
out.append("\\t")
elif code < 0x20 or code == 0x7F:
out.append(f"\\u{code:04x}")
else:
out.append(ch)
return '"' + "".join(out) + '"'

def render_yaml_command(
self,
Expand Down
68 changes: 57 additions & 11 deletions src/specify_cli/integrations/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,56 @@ def _split_frontmatter(content: str) -> tuple[str, str]:
body = "".join(lines[frontmatter_end + 1 :])
return frontmatter, body

@staticmethod
def _has_illegal_toml_control(value: str) -> bool:
"""True when *value* contains a character TOML forbids literally.

TOML basic/literal strings (single- or multi-line) allow tab and, in
the multiline forms, newlines — but every other control character
(``U+0000``–``U+001F`` and ``U+007F``) must be ``\\u``-escaped, which
only a basic string can do. A bare carriage return counts too: a
multiline basic string treats ``\\r`` as a newline only when paired
into ``\\r\\n``; a lone ``\\r`` is an illegal control character.
"""
length = len(value)
for i, ch in enumerate(value):
code = ord(ch)
if ch == "\r":
# Only a CR that is part of a CRLF newline is allowed literally.
if i + 1 < length and value[i + 1] == "\n":
continue
return True
if (code < 0x20 and ch not in ("\t", "\n")) or code == 0x7F:
return True
return False

@staticmethod
def _escape_toml_basic(value: str) -> str:
"""Render *value* as a single-line basic string, escaping everything.

Always valid TOML: backslash/quote are escaped, the common control
chars use their short escapes, and any remaining control character is
emitted as a ``\\uXXXX`` sequence.
"""
out: list[str] = []
for ch in value:
code = ord(ch)
if ch == "\\":
out.append("\\\\")
elif ch == '"':
out.append('\\"')
elif ch == "\n":
out.append("\\n")
elif ch == "\r":
out.append("\\r")
elif ch == "\t":
out.append("\\t")
elif code < 0x20 or code == 0x7F:
out.append(f"\\u{code:04x}")
else:
out.append(ch)
return '"' + "".join(out) + '"'

@staticmethod
def _render_toml_string(value: str) -> str:
"""Render *value* as a TOML string literal.
Expand All @@ -949,6 +999,12 @@ def _render_toml_string(value: str) -> str:
literal string or escaped basic string when delimiters appear in
the content.
"""
# Control characters other than tab/newline (and a bare CR) cannot
# appear literally in any TOML string; route them to a fully-escaped
# basic string so the generated file stays parseable.
if TomlIntegration._has_illegal_toml_control(value):
return TomlIntegration._escape_toml_basic(value)

if "\n" not in value and "\r" not in value:
escaped = value.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
Expand All @@ -961,17 +1017,7 @@ def _render_toml_string(value: str) -> str:
if "'''" not in value and not value.endswith("'"):
return "'''\n" + value + "'''"

return (
'"'
+ (
value.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
)
+ '"'
)
return TomlIntegration._escape_toml_basic(value)

@staticmethod
def _render_toml(description: str, body: str) -> str:
Expand Down
12 changes: 12 additions & 0 deletions tests/integrations/test_integration_base_toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,18 @@ def test_toml_closing_delimiter_inline_when_safe(self, tmp_path, monkeypatch):
"closing delimiter should be inline when body does not end with a quote"
)

def test_toml_string_escapes_control_characters(self):
    """Round-trip a value full of forbidden characters through the renderer.

    The sample mixes NUL, U+0001, ESC, DEL and a CR that is not followed by
    LF: characters TOML does not accept literally in any string form. The
    rendered literal must parse and decode back to the exact input.
    """
    original = "start\x00null\x01ctrl\x1besc\x7fdel\rlone-cr end"
    document = f"prompt = {TomlIntegration._render_toml_string(original)}"
    assert tomllib.loads(document)["prompt"] == original

def test_toml_is_valid(self, tmp_path):
"""Every generated TOML file must parse without errors."""
i = get_integration(self.KEY)
Expand Down
19 changes: 19 additions & 0 deletions tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,6 +1750,25 @@ def test_render_toml_command_preserves_multiline_description(self):

assert parsed["description"] == "first line\nsecond line\n"

def test_render_toml_command_escapes_control_characters(self):
    """A body with control characters must still yield a parseable command.

    The body mixes NUL, U+0001, ESC, DEL and a CR that is not part of a CRLF
    pair: characters TOML rejects when they appear literally in a string.
    The rendered command file has to parse, and its ``prompt`` must decode
    back to the original body unchanged.
    """
    from specify_cli.agents import CommandRegistrar as AgentCommandRegistrar

    body = "start\x00null\x01ctrl\x1besc\x7fdel\rlone-cr end"
    frontmatter = {"description": "d"}
    rendered = AgentCommandRegistrar().render_toml_command(
        frontmatter, body, "extension:test-ext"
    )

    assert tomllib.loads(rendered)["prompt"] == body

def test_render_toml_command_preserves_backslashes_in_body(self):
"""A backslash in the body (e.g. a Windows path) must not break TOML.

Expand Down