From 2749a44d99576c1dffe9eeb2467ce9a8cfaaf953 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 16:57:18 +0100 Subject: [PATCH 01/38] first stab --- Lib/profiling/sampling/__init__.py | 11 +- Lib/profiling/sampling/binary_reader.py | 3 + Lib/profiling/sampling/cli.py | 25 ++- Lib/profiling/sampling/ndjson_collector.py | 216 +++++++++++++++++++++ 4 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 Lib/profiling/sampling/ndjson_collector.py diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 6a0bb5e5c2f387..21d3a773a2ba63 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,6 +9,15 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .string_table import StringTable -__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable") +__all__ = ( + "Collector", + "PstatsCollector", + "CollapsedStackCollector", + "HeatmapCollector", + "GeckoCollector", + "NdjsonCollector", + "StringTable", +) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index a11be3652597a6..d5bfc0d6130f1a 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,6 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector +from .ndjson_collector import NdjsonCollector from .pstats_collector import PstatsCollector @@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) + elif output_format == 'ndjson': + collector = NdjsonCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index f4b31aad45b922..4f9e784f80495d 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -19,6 +19,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -87,6 +88,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", + "ndjson": "ndjson", "binary": "bin", } @@ -96,6 +98,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "ndjson": NdjsonCollector, "binary": BinaryCollector, } @@ -467,6 +470,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True): dest="format", help="Generate interactive HTML heatmap visualization with line-level sample counts", ) + format_group.add_argument( + "--ndjson", + action="store_const", + const="ndjson", + dest="format", + help="Generate NDJSON snapshot output for external consumers", + ) if include_binary: format_group.add_argument( "--binary", @@ -545,15 +555,17 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False, - output_file=None, compression='auto'): + mode=None, output_file=None, compression='auto'): """Create the appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', + 'gecko', 'heatmap', 'ndjson', 'binary') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + mode: Profiling mode for collectors that expose it in metadata output_file: Output file path (required for binary format) compression: Compression type for binary format ('auto', 'zstd', 'none') @@ -577,6 +589,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) + if format_type == "ndjson": + return collector_class( + sample_interval_usec, skip_idle=skip_idle, mode=mode + ) + return collector_class(sample_interval_usec, skip_idle=skip_idle) @@ -951,7 +968,7 @@ def _handle_attach(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto') ) @@ -1029,7 +1046,7 @@ def _handle_run(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto') ) diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/ndjson_collector.py new file mode 100644 index 00000000000000..123ec1c5ea9a1c --- /dev/null +++ b/Lib/profiling/sampling/ndjson_collector.py @@ -0,0 +1,216 @@ +"""NDJSON collector.""" + +import json +import uuid +from itertools import batched + +from .constants import ( + PROFILING_MODE_ALL, + PROFILING_MODE_CPU, + PROFILING_MODE_EXCEPTION, + PROFILING_MODE_GIL, + PROFILING_MODE_WALL, +) +from .stack_collector import StackTraceCollector + + +_CHUNK_SIZE = 1000 + +_MODE_NAMES = { + PROFILING_MODE_WALL: "wall", + PROFILING_MODE_CPU: "cpu", + PROFILING_MODE_GIL: "gil", + PROFILING_MODE_ALL: "all", + PROFILING_MODE_EXCEPTION: "exception", +} + + +class NdjsonCollector(StackTraceCollector): + """Collector that exports finalized profiling data as NDJSON.""" + + def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): + super().__init__(sample_interval_usec, skip_idle=skip_idle) + self.run_id = uuid.uuid4().hex + + self._string_to_id = {} + self._strings = [] + + self._frame_to_id = {} + self._frames = [] + + self._frame_self = {} + self._frame_cumulative = {} + self._samples_total = 0 + + self._mode = mode + + def process_frames(self, frames, _thread_id, weight=1): + if not frames: + return + + self._samples_total += weight + + frame_ids = [ + self._get_or_create_frame_id(filename, location, funcname) + for filename, location, funcname, _opcode in frames + ] + leaf_frame_id = frame_ids[0] + + self._frame_self[leaf_frame_id] = ( + self._frame_self.get(leaf_frame_id, 0) + weight + ) + + for frame_id in set(frame_ids): + self._frame_cumulative[frame_id] = ( + self._frame_cumulative.get(frame_id, 0) + weight + ) + + def export(self, filename): + with open(filename, "w", encoding="utf-8") as output: + self._write_message(output, self._build_meta_record()) + self._write_chunked_defs(output, "str_def", self._strings) + self._write_chunked_defs(output, "frame_def", self._frames) + self._write_chunked_agg(output, self._iter_agg_entries()) + self._write_message( + output, + { + "type": "end", + "v": 1, + "run_id": self.run_id, + "samples_total": self._samples_total, + }, + ) + + print(f"NDJSON profile written to {filename}") + + def _build_meta_record(self): + record = { + "type": "meta", + "v": 1, + "run_id": self.run_id, + "sample_interval_usec": self.sample_interval_usec, + } + + if self._mode is not None: + record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode)) + + return record + + def _get_or_create_frame_id(self, filename, location, funcname): + synthetic = location is None + location_fields = self._normalize_export_location(location) + func_str_id = self._intern_string(funcname) + path_str_id = self._intern_string(filename) + + frame_key = ( + path_str_id, + func_str_id, + location_fields["line"], + location_fields.get("end_line"), + location_fields.get("col"), + location_fields.get("end_col"), + synthetic, + ) + + if (frame_id := self._frame_to_id.get(frame_key)) is not None: + return frame_id + + frame_id = len(self._frames) + 1 + frame_record = { + "frame_id": frame_id, + "path_str_id": path_str_id, + "func_str_id": func_str_id, + **location_fields, + } + if synthetic: + frame_record["synthetic"] = True + + self._frame_to_id[frame_key] = frame_id + self._frames.append(frame_record) + return frame_id + + def _intern_string(self, value): + value = str(value) + + if (string_id := self._string_to_id.get(value)) is not None: + return string_id + + string_id = len(self._strings) + 1 + self._string_to_id[value] = string_id + self._strings.append({"str_id": string_id, "value": value}) + return string_id + + @staticmethod + def _normalize_export_location(location): + if location is None: + return {"line": 0} + + if isinstance(location, int): + return {"line": max(location, 0)} + + if not isinstance(location, tuple): + lineno = getattr(location, "lineno", 0) + location = ( + lineno, + getattr(location, "end_lineno", lineno), + getattr(location, "col_offset", -1), + getattr(location, "end_col_offset", -1), + ) + + lineno, end_lineno, col_offset, end_col_offset = location + if not isinstance(lineno, int) or lineno <= 0: + return {"line": 0} + + normalized = {"line": lineno} + if isinstance(end_lineno, int) and end_lineno > 0: + normalized["end_line"] = end_lineno + if isinstance(col_offset, int) and col_offset >= 0: + normalized["col"] = col_offset + if isinstance(end_col_offset, int) and end_col_offset >= 0: + normalized["end_col"] = end_col_offset + return normalized + + def _iter_agg_entries(self): + entries = [] + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + entries.append( + { + "frame_id": frame_id, + "self": self._frame_self.get(frame_id, 0), + "cumulative": self._frame_cumulative.get(frame_id, 0), + } + ) + return entries + + def _write_chunked_defs(self, output, record_type, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": record_type, + "v": 1, + "run_id": self.run_id, + "defs": chunk, + }, + ) + + def _write_chunked_agg(self, output, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": "agg", + "v": 1, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + "entries": chunk, + }, + ) + + @staticmethod + def _write_message(output, record): + output.write(json.dumps(record, separators=(",", ":"))) + output.write("\n") From f13d34c02b4b3a3e507e8863253c8f3c672484e9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 21:08:18 +0100 Subject: [PATCH 02/38] s/ndjson/jsonl/ --- Lib/profiling/sampling/__init__.py | 4 ++-- Lib/profiling/sampling/binary_reader.py | 6 +++--- Lib/profiling/sampling/cli.py | 16 ++++++++-------- .../{ndjson_collector.py => jsonl_collector.py} | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) rename Lib/profiling/sampling/{ndjson_collector.py => jsonl_collector.py} (97%) diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 21d3a773a2ba63..71579a3903253e 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,7 +9,7 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .string_table import StringTable __all__ = ( @@ -18,6 +18,6 @@ "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", - "NdjsonCollector", + "JsonlCollector", "StringTable", ) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index d5bfc0d6130f1a..8d1d8eef9155eb 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,7 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .pstats_collector import PstatsCollector @@ -118,8 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) - elif output_format == 'ndjson': - collector = NdjsonCollector(interval) + elif output_format == 'jsonl': + collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 4f9e784f80495d..bb97c9729364cc 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -19,7 +19,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -88,7 +88,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", - "ndjson": "ndjson", + "jsonl": "jsonl", "binary": "bin", } @@ -98,7 +98,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, - "ndjson": NdjsonCollector, + "jsonl": JsonlCollector, "binary": BinaryCollector, } @@ -471,11 +471,11 @@ def _add_format_options(parser, include_compression=True, include_binary=True): help="Generate interactive HTML heatmap visualization with line-level sample counts", ) format_group.add_argument( - "--ndjson", + "--jsonl", action="store_const", - const="ndjson", + const="jsonl", dest="format", - help="Generate NDJSON snapshot output for external consumers", + help="Generate JSONL snapshot output for external consumers", ) if include_binary: format_group.add_argument( @@ -560,7 +560,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals Args: format_type: The output format ('pstats', 'collapsed', 'flamegraph', - 'gecko', 'heatmap', 'ndjson', 'binary') + 'gecko', 'heatmap', 'jsonl', 'binary') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format @@ -589,7 +589,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) - if format_type == "ndjson": + if format_type == "jsonl": return collector_class( sample_interval_usec, skip_idle=skip_idle, mode=mode ) diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/jsonl_collector.py similarity index 97% rename from Lib/profiling/sampling/ndjson_collector.py rename to Lib/profiling/sampling/jsonl_collector.py index 123ec1c5ea9a1c..1d6575425c2616 100644 --- a/Lib/profiling/sampling/ndjson_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,4 +1,4 @@ -"""NDJSON collector.""" +"""JSONL collector.""" import json import uuid @@ -25,8 +25,8 @@ } -class NdjsonCollector(StackTraceCollector): - """Collector that exports finalized profiling data as NDJSON.""" +class JsonlCollector(StackTraceCollector): + """Collector that exports finalized profiling data as JSONL.""" def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): super().__init__(sample_interval_usec, skip_idle=skip_idle) @@ -81,7 +81,7 @@ def export(self, filename): }, ) - print(f"NDJSON profile written to {filename}") + print(f"JSONL profile written to {filename}") def _build_meta_record(self): record = { From c15d318022cf9c226cd36b36818a270f994fb99c Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:51:12 +0100 Subject: [PATCH 03/38] printing to stdout isn't a great idea --- Lib/profiling/sampling/jsonl_collector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 1d6575425c2616..3333b7352c9411 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -81,8 +81,6 @@ def export(self, filename): }, ) - print(f"JSONL profile written to {filename}") - def _build_meta_record(self): record = { "type": "meta", From cb27fc035d79a7611bd7583d82c02f92b4980a93 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:50:59 +0200 Subject: [PATCH 04/38] even a basic test --- .../test_sampling_profiler/test_collectors.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 86fb9d4c05b3bc..66052a8b26f3c1 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -16,6 +16,7 @@ CollapsedStackCollector, FlamegraphCollector, ) + from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.gecko_collector import GeckoCollector from profiling.sampling.collector import extract_lineno, normalize_location from profiling.sampling.opcode_utils import get_opcode_info, format_opcode @@ -1665,6 +1666,86 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) + def test_jsonl_collector_basic(self): + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(1000) + run_id = collector.run_id + + self.assertIsNotNone(run_id) + + test_frames1 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + ) + ], + ) + ] + test_frames2 = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + ) + ], + ) + ] # Same stack + test_frames3 = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [MockFrameInfo("other.py", 5, "other_func")])] + ) + ] + + collector.collect(test_frames1) + collector.collect(test_frames2) + collector.collect(test_frames3) + + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "func1"}, + {"str_id": 2, "value": "file.py"}, + {"str_id": 3, "value": "func2"}, + {"str_id": 4, "value": "other_func"}, + {"str_id": 5, "value": "other.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 2, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 5, "func_str_id": 4, + "line": 5, "end_line": 5}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 2, "cumulative": 2}, + {"frame_id": 2, "self": 0, "cumulative": 2}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" From 59cbb4a9cb7d7a7fe018d0c22d9d2a23e5b67d1f Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:54:22 +0200 Subject: [PATCH 05/38] separate func for end record --- Lib/profiling/sampling/jsonl_collector.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 3333b7352c9411..59ab3b865c182c 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -71,15 +71,7 @@ def export(self, filename): self._write_chunked_defs(output, "str_def", self._strings) self._write_chunked_defs(output, "frame_def", self._frames) self._write_chunked_agg(output, self._iter_agg_entries()) - self._write_message( - output, - { - "type": "end", - "v": 1, - "run_id": self.run_id, - "samples_total": self._samples_total, - }, - ) + self._write_message(output, self._build_end_record()) def _build_meta_record(self): record = { @@ -94,6 +86,16 @@ def _build_meta_record(self): return record + def _build_end_record(self): + record = { + "type": "end", + "v": 1, + "run_id": self.run_id, + "samples_total": self._samples_total, + } + + return record + def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None location_fields = self._normalize_export_location(location) From 25c692207966798b82e50729caba58e9e0f4b708 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 30 Mar 2026 23:14:18 +0200 Subject: [PATCH 06/38] proper name --- .../test_profiling/test_sampling_profiler/test_collectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 66052a8b26f3c1..6127284618a13d 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1666,7 +1666,7 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) - def test_jsonl_collector_basic(self): + def test_jsonl_collector_export(self): collapsed_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, collapsed_out) From 67cd39a0a07e3e74ab025ad2b39858f2d3bef275 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:48:26 +0200 Subject: [PATCH 07/38] test_jsonl_collector_with_location_info --- .../test_sampling_profiler/test_collectors.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 6127284618a13d..175d2f7c263809 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2145,6 +2145,56 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) + def test_jsonl_collector_with_location_info(self): + """Test JsonlCollector handles LocationInfo properly.""" + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(sample_interval_usec=1000) + run_id = collector.run_id + + # Frame with LocationInfo + frame = MockFrameInfo("test.py", 42, "my_function") + frames = [ + MockInterpreterInfo( + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + # Should extract lineno from location + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "my_function"}, + {"str_id": 2, "value": "test.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 42, "end_line": 42}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 1, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 1}), + ] + + for exp in expected: + self.assertIn(exp, lines) + class TestOpcodeHandling(unittest.TestCase): """Tests for opcode field handling in collectors.""" From 7c85d474ffd94cb654ab3ebe68f631ecd75faeb5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:52:15 +0200 Subject: [PATCH 08/38] test synthetic frames --- .../test_sampling_profiler/test_collectors.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 175d2f7c263809..d04b8a49871f48 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2145,6 +2145,7 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) + def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) @@ -2196,6 +2197,59 @@ def jsonl(obj): self.assertIn(exp, lines) + def test_jsonl_collector_with_none_location(self): + """Test JsonlCollector handles None location (synthetic frames).""" + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + collector = JsonlCollector(sample_interval_usec=1000) + run_id = collector.run_id + + # Create frame with None location (like GC frame) + frame = MockFrameInfo("~", 0, "") + frame.location = None # Synthetic frame has no location + frames = [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + ) + ] + collector.collect(frames) + + # Should handle None location as synthetic frame + with captured_stdout(), captured_stderr(): + collector.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": ""}, + {"str_id": 2, "value": "~"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 0, "synthetic": True}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 1, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 1}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + class TestOpcodeHandling(unittest.TestCase): """Tests for opcode field handling in collectors.""" From 3eddae83d550962779f11225c3a55f3bd52f37de Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:02:08 +0200 Subject: [PATCH 09/38] too many new lines --- .../test_profiling/test_sampling_profiler/test_collectors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index d04b8a49871f48..f46cfc1dbcd0b1 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2145,7 +2145,6 @@ def test_gecko_collector_with_location_info(self): # Verify function name is in string table self.assertIn("handle_request", string_array) - def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) @@ -2196,7 +2195,6 @@ def jsonl(obj): for exp in expected: self.assertIn(exp, lines) - def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" collapsed_out = tempfile.NamedTemporaryFile(delete=False) From f71252ed7b63a7ce65ef83caf4d7f73ca4899e28 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:21:42 +0200 Subject: [PATCH 10/38] BUG? confusing... two ways to set skip_idle? --- .../test_sampling_profiler/test_modes.py | 157 +++++++++++++++++- 1 file changed, 155 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 0b38fb4ad4bcf6..67b82eff091d08 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -9,6 +9,7 @@ import profiling.sampling import profiling.sampling.sample from profiling.sampling.pstats_collector import PstatsCollector + from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.cli import main, _parse_mode from profiling.sampling.constants import PROFILING_MODE_EXCEPTION from _remote_debugging import ( @@ -20,9 +21,13 @@ "Test only runs when _remote_debugging is available" ) -from test.support import requires_remote_subprocess_debugging +from test.support import ( + captured_stdout, + captured_stderr, + requires_remote_subprocess_debugging, +) -from .helpers import test_subprocess +from .helpers import close_and_unlink, test_subprocess from .mocks import MockFrameInfo, MockInterpreterInfo @@ -228,6 +233,154 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) + def test_jsonl_collector_rspects_skip_idle(self): + """Test that frames are actually filtered when skip_idle=True.""" + import tempfile + import json + + collapsed_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, collapsed_out) + + # Create mock frames with different thread statuses + class MockThreadInfoWithStatus: + def __init__(self, thread_id, frame_info, status): + self.thread_id = thread_id + self.frame_info = frame_info + self.status = status + + # Create test data: active thread (HAS_GIL | ON_CPU), idle thread (neither), and another active thread + ACTIVE_STATUS = ( + THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU + ) # Has GIL and on CPU + IDLE_STATUS = 0 # Neither has GIL nor on CPU + + test_frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfoWithStatus( + 1, + [MockFrameInfo("active1.py", 10, "active_func1")], + ACTIVE_STATUS, + ), + MockThreadInfoWithStatus( + 2, + [MockFrameInfo("idle.py", 20, "idle_func")], + IDLE_STATUS, + ), + MockThreadInfoWithStatus( + 3, + [MockFrameInfo("active2.py", 30, "active_func2")], + ACTIVE_STATUS, + ), + ], + ) + ] + + # Test with skip_idle=True - should only process running threads + collector_skip = JsonlCollector( + sample_interval_usec=1000, skip_idle=True + ) + collector_skip.collect(test_frames) + + run_id = collector_skip.run_id + + # Should only have functions from running threads (status 0) + with captured_stdout(), captured_stderr(): + collector_skip.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + def jsonl(obj): + return json.dumps(obj, separators=(",", ":")) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "active_func1"}, + {"str_id": 2, "value": "active1.py"}, + {"str_id": 3, "value": "idle_func"}, + {"str_id": 4, "value": "idle.py"}, + {"str_id": 5, "value": "active_func2"}, + {"str_id": 6, "value": "active2.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, + "line": 30, "end_line": 30}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, + {"frame_id": 2, "self": 1, "cumulative": 1}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + # Test with skip_idle=False - should process all threads + collector_no_skip = JsonlCollector( + sample_interval_usec=1000, skip_idle=False + ) + collector_no_skip.collect(test_frames) + + run_id = collector_no_skip.run_id + + # Should have functions from all threads + with captured_stdout(), captured_stderr(): + collector_no_skip.export(collapsed_out.name) + + # Check file contents + with open(collapsed_out.name, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + self.assertEqual(len(lines), 5) + + expected = [ + jsonl({"type": "meta", "v": 1, "run_id": run_id, + "sample_interval_usec": 1000}), + jsonl({"type": "str_def", "v": 1, "run_id": run_id, + "defs": [{"str_id": 1, "value": "active_func1"}, + {"str_id": 2, "value": "active1.py"}, + {"str_id": 3, "value": "idle_func"}, + {"str_id": 4, "value": "idle.py"}, + {"str_id": 5, "value": "active_func2"}, + {"str_id": 6, "value": "active2.py"}]}), + jsonl({"type": "frame_def", "v": 1, "run_id": run_id, + "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, + "line": 10, "end_line": 10}, + {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, + "line": 20, "end_line": 20}, + {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, + "line": 30, "end_line": 30}]}), + jsonl({"type": "agg", "v": 1, "run_id": run_id, + "kind": "frame", "scope": "final", "samples_total": 3, + "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, + {"frame_id": 2, "self": 1, "cumulative": 1}, + {"frame_id": 3, "self": 1, "cumulative": 1}]}), + jsonl({"type": "end", "v": 1, "run_id": run_id, + "samples_total": 3}), + ] + + for exp in expected: + self.assertIn(exp, lines) + + # self.assertIn(active1_key, collector_no_skip.result) + # self.assertIn(active2_key, collector_no_skip.result) + # self.assertIn( + # idle_key, collector_no_skip.result + # ) # Idle thread should be included @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): From c183109f8a3a22153607e4cdf618d2dcc1b62c78 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:51:16 +0200 Subject: [PATCH 11/38] ok, thx b4fac15613a16f9cd7b2ee32840523b399f4621f --- .../test_sampling_profiler/test_modes.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 67b82eff091d08..9d792b8d6f20ab 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -305,24 +305,19 @@ def jsonl(obj): jsonl({"type": "str_def", "v": 1, "run_id": run_id, "defs": [{"str_id": 1, "value": "active_func1"}, {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "idle_func"}, - {"str_id": 4, "value": "idle.py"}, - {"str_id": 5, "value": "active_func2"}, - {"str_id": 6, "value": "active2.py"}]}), + {"str_id": 3, "value": "active_func2"}, + {"str_id": 4, "value": "active2.py"}]}), jsonl({"type": "frame_def", "v": 1, "run_id": run_id, "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, "line": 10, "end_line": 10}, {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, "line": 30, "end_line": 30}]}), jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, + "kind": "frame", "scope": "final", "samples_total": 2, "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), + {"frame_id": 2, "self": 1, "cumulative": 1}]}), jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), + "samples_total": 2}), ] for exp in expected: From f20eb52efbb6b9a14c868affb25944d229b26cb7 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:16:36 +0200 Subject: [PATCH 12/38] check if it works fine with (file, loc, func, op) --- .../test_sampling_profiler/test_collectors.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index f46cfc1dbcd0b1..dcf0b09828a790 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2467,6 +2467,38 @@ def test_gecko_collector_frame_format(self): # Should have recorded 3 functions self.assertEqual(thread["funcTable"]["length"], 3) + def test_jsonl_collector_frame_format(self): + """Test JsonlCollector with 4-element frame format.""" + collector = JsonlCollector(sample_interval_usec=1000) + collector.collect(self._make_sample_frames()) + + with tempfile.NamedTemporaryFile(delete=False) as f: + self.addClassCleanup(close_and_unlink, f) + collector.export(f.name) + + with open(f.name, "r", encoding="utf-8") as fp: + records = [json.loads(line) for line in fp] + + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + + self.assertEqual(len(frame_defs), 3) + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + + self.assertEqual(paths, {"app.py", "utils.py", "lib.py"}) + self.assertEqual(funcs, {"main", "helper", "process"}) class TestInternalFrameFiltering(unittest.TestCase): """Tests for filtering internal profiler frames from output.""" From 546ce90a007f30469e2c9f9a83240ba2a4d05e9d Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:19:44 +0200 Subject: [PATCH 13/38] missing new line --- .../test_profiling/test_sampling_profiler/test_collectors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index dcf0b09828a790..084c3c549f99d0 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2500,6 +2500,7 @@ def test_jsonl_collector_frame_format(self): self.assertEqual(paths, {"app.py", "utils.py", "lib.py"}) self.assertEqual(funcs, {"main", "helper", "process"}) + class TestInternalFrameFiltering(unittest.TestCase): """Tests for filtering internal profiler frames from output.""" From 350ad99bc7d3c4c9170be60b447c7329c2694654 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:25:20 +0200 Subject: [PATCH 14/38] filter out sync coordinator --- .../test_sampling_profiler/test_collectors.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 084c3c549f99d0..d80288c6ec959c 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2627,3 +2627,54 @@ def test_collapsed_stack_collector_filters_internal_frames(self): for (call_tree, _), _ in collector.stack_counter.items(): for filename, _, _ in call_tree: self.assertNotIn("_sync_coordinator", filename) + + def test_jsonl_collector_filters_internal_frames(self): + """Test that JsonlCollector filters out internal frames.""" + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(sample_interval_usec=1000) + + frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("app.py", 50, "run"), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo("", 87, "_run_code"), + ], + status=THREAD_STATUS_HAS_GIL, + ) + ], + ) + ] + + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + + self.assertIn("app.py", paths) + self.assertIn("", paths) + + for path in paths: + self.assertNotIn("_sync_coordinator", path) From 942d821da00c9d3b879ea0608b071fec9aba6236 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:30:12 +0200 Subject: [PATCH 15/38] s/collapsed_out/jsonl_out/, less copying :D --- .../test_sampling_profiler/test_collectors.py | 24 +++++++++---------- .../test_sampling_profiler/test_modes.py | 12 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index d80288c6ec959c..8432f2ac6de398 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1667,8 +1667,8 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff_pct"], -50.0) def test_jsonl_collector_export(self): - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(1000) run_id = collector.run_id @@ -1706,10 +1706,10 @@ def test_jsonl_collector_export(self): collector.collect(test_frames3) with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -2147,8 +2147,8 @@ def test_gecko_collector_with_location_info(self): def test_jsonl_collector_with_location_info(self): """Test JsonlCollector handles LocationInfo properly.""" - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) run_id = collector.run_id @@ -2164,10 +2164,10 @@ def test_jsonl_collector_with_location_info(self): # Should extract lineno from location with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -2197,8 +2197,8 @@ def jsonl(obj): def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) run_id = collector.run_id @@ -2216,10 +2216,10 @@ def test_jsonl_collector_with_none_location(self): # Should handle None location as synthetic frame with captured_stdout(), captured_stderr(): - collector.export(collapsed_out.name) + collector.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 9d792b8d6f20ab..a4c7ed857ce7fb 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -238,8 +238,8 @@ def test_jsonl_collector_rspects_skip_idle(self): import tempfile import json - collapsed_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, collapsed_out) + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) # Create mock frames with different thread statuses class MockThreadInfoWithStatus: @@ -287,10 +287,10 @@ def __init__(self, thread_id, frame_info, status): # Should only have functions from running threads (status 0) with captured_stdout(), captured_stderr(): - collector_skip.export(collapsed_out.name) + collector_skip.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") @@ -333,10 +333,10 @@ def jsonl(obj): # Should have functions from all threads with captured_stdout(), captured_stderr(): - collector_no_skip.export(collapsed_out.name) + collector_no_skip.export(jsonl_out.name) # Check file contents - with open(collapsed_out.name, "r") as f: + with open(jsonl_out.name, "r") as f: content = f.read() lines = content.strip().split("\n") From bd9aefe1fd36ed2123257a62d8586a3f5e308c66 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:31:24 +0200 Subject: [PATCH 16/38] nicer reading --- .../test_profiling/test_sampling_profiler/test_collectors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 8432f2ac6de398..e12ea44f566cb4 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2633,8 +2633,6 @@ def test_jsonl_collector_filters_internal_frames(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) - collector = JsonlCollector(sample_interval_usec=1000) - frames = [ MockInterpreterInfo( 0, @@ -2652,6 +2650,7 @@ def test_jsonl_collector_filters_internal_frames(self): ) ] + collector = JsonlCollector(sample_interval_usec=1000) collector.collect(frames) collector.export(jsonl_out.name) From 311a4e38b0299dd9a08b9d8084daab8c16cde8a7 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:32:23 +0200 Subject: [PATCH 17/38] typo --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index a4c7ed857ce7fb..37cb6c3a5c5ab2 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -233,7 +233,7 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) - def test_jsonl_collector_rspects_skip_idle(self): + def test_jsonl_collector_respects_skip_idle(self): """Test that frames are actually filtered when skip_idle=True.""" import tempfile import json From 749a8686b0ff13a09ae8218c6270484cb0670d14 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:37:39 +0200 Subject: [PATCH 18/38] too much copying, left-over --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 37cb6c3a5c5ab2..2bac26c37091b0 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -371,11 +371,6 @@ def jsonl(obj): for exp in expected: self.assertIn(exp, lines) - # self.assertIn(active1_key, collector_no_skip.result) - # self.assertIn(active2_key, collector_no_skip.result) - # self.assertIn( - # idle_key, collector_no_skip.result - # ) # Idle thread should be included @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): From 85ce978c53b3864e0fb720fcc0f9d1e101f5b1fb Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:49:39 +0200 Subject: [PATCH 19/38] just Counter --- Lib/profiling/sampling/jsonl_collector.py | 87 +++++++++++------------ 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 59ab3b865c182c..56539c2a9e2232 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,5 +1,6 @@ """JSONL collector.""" +from collections import Counter import json import uuid from itertools import batched @@ -38,8 +39,8 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._frame_to_id = {} self._frames = [] - self._frame_self = {} - self._frame_cumulative = {} + self._frame_self = Counter() + self._frame_cumulative = Counter() self._samples_total = 0 self._mode = mode @@ -56,21 +57,39 @@ def process_frames(self, frames, _thread_id, weight=1): ] leaf_frame_id = frame_ids[0] - self._frame_self[leaf_frame_id] = ( - self._frame_self.get(leaf_frame_id, 0) + weight - ) + self._frame_self[leaf_frame_id] += weight for frame_id in set(frame_ids): - self._frame_cumulative[frame_id] = ( - self._frame_cumulative.get(frame_id, 0) + weight - ) + self._frame_cumulative[frame_id] += weight def export(self, filename): with open(filename, "w", encoding="utf-8") as output: self._write_message(output, self._build_meta_record()) - self._write_chunked_defs(output, "str_def", self._strings) - self._write_chunked_defs(output, "frame_def", self._frames) - self._write_chunked_agg(output, self._iter_agg_entries()) + self._write_chunked_records( + output, + {"type": "str_def", "v": 1, "run_id": self.run_id}, + "defs", + self._strings, + ) + self._write_chunked_records( + output, + {"type": "frame_def", "v": 1, "run_id": self.run_id}, + "defs", + self._frames, + ) + self._write_chunked_records( + output, + { + "type": "agg", + "v": 1, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + }, + "entries", + self._iter_agg_entries(), + ) self._write_message(output, self._build_end_record()) def _build_meta_record(self): @@ -171,44 +190,18 @@ def _normalize_export_location(location): return normalized def _iter_agg_entries(self): - entries = [] - for frame_record in self._frames: - frame_id = frame_record["frame_id"] - entries.append( - { - "frame_id": frame_id, - "self": self._frame_self.get(frame_id, 0), - "cumulative": self._frame_cumulative.get(frame_id, 0), - } - ) - return entries - - def _write_chunked_defs(self, output, record_type, entries): - for chunk in batched(entries, _CHUNK_SIZE): - self._write_message( - output, - { - "type": record_type, - "v": 1, - "run_id": self.run_id, - "defs": chunk, - }, - ) + return [ + { + "frame_id": frame_record["frame_id"], + "self": self._frame_self[frame_record["frame_id"]], + "cumulative": self._frame_cumulative[frame_record["frame_id"]], + } + for frame_record in self._frames + ] - def _write_chunked_agg(self, output, entries): + def _write_chunked_records(self, output, base_record, chunk_field, entries): for chunk in batched(entries, _CHUNK_SIZE): - self._write_message( - output, - { - "type": "agg", - "v": 1, - "run_id": self.run_id, - "kind": "frame", - "scope": "final", - "samples_total": self._samples_total, - "entries": chunk, - }, - ) + self._write_message(output, {**base_record, chunk_field: chunk}) @staticmethod def _write_message(output, record): From 820d3b9f85dbda7d653d774f29b9d064517134a4 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:52:11 +0200 Subject: [PATCH 20/38] ruff --- Lib/profiling/sampling/jsonl_collector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 56539c2a9e2232..244501ba446f07 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -199,7 +199,9 @@ def _iter_agg_entries(self): for frame_record in self._frames ] - def _write_chunked_records(self, output, base_record, chunk_field, entries): + def _write_chunked_records( + self, output, base_record, chunk_field, entries + ): for chunk in batched(entries, _CHUNK_SIZE): self._write_message(output, {**base_record, chunk_field: chunk}) From aad4b180d71cc02f1d92daefda79e2f14e37de02 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:56:14 +0200 Subject: [PATCH 21/38] future-proof name --- Lib/profiling/sampling/jsonl_collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 244501ba446f07..7d7b44c8d89407 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -88,7 +88,7 @@ def export(self, filename): "samples_total": self._samples_total, }, "entries", - self._iter_agg_entries(), + self._iter_final_agg_entries(), ) self._write_message(output, self._build_end_record()) @@ -189,7 +189,7 @@ def _normalize_export_location(location): normalized["end_col"] = end_col_offset return normalized - def _iter_agg_entries(self): + def _iter_final_agg_entries(self): return [ { "frame_id": frame_record["frame_id"], From da3e754fa64f906c75d6937d23e7b0365426a1b7 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 13:56:58 +0200 Subject: [PATCH 22/38] future-proof iter for streaming --- Lib/profiling/sampling/jsonl_collector.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 7d7b44c8d89407..1b318573425edf 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -190,14 +190,13 @@ def _normalize_export_location(location): return normalized def _iter_final_agg_entries(self): - return [ - { - "frame_id": frame_record["frame_id"], - "self": self._frame_self[frame_record["frame_id"]], - "cumulative": self._frame_cumulative[frame_record["frame_id"]], + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + yield { + "frame_id": frame_id, + "self": self._frame_self[frame_id], + "cumulative": self._frame_cumulative[frame_id], } - for frame_record in self._frames - ] def _write_chunked_records( self, output, base_record, chunk_field, entries From cb6ed347142cfb3550f228584eafe98a87099e38 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:13:23 +0200 Subject: [PATCH 23/38] truth to be told, this should be layer above --- Lib/profiling/sampling/collector.py | 5 ++- Lib/profiling/sampling/jsonl_collector.py | 42 +++++++------------ .../test_sampling_profiler/test_collectors.py | 5 +++ 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index 7dc095c6c279bd..dc6eb751b99e15 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -20,13 +20,16 @@ def normalize_location(location): """Normalize location to a 4-tuple format. Args: - location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None + location: tuple (lineno, end_lineno, col_offset, end_col_offset), + an integer line number, or None Returns: tuple: (lineno, end_lineno, col_offset, end_col_offset) """ if location is None: return DEFAULT_LOCATION + if isinstance(location, int): + return (location, location, -1, -1) return location diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 1b318573425edf..6c8f2bc2fd3135 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -12,6 +12,7 @@ PROFILING_MODE_GIL, PROFILING_MODE_WALL, ) +from .collector import normalize_location from .stack_collector import StackTraceCollector @@ -117,7 +118,7 @@ def _build_end_record(self): def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None - location_fields = self._normalize_export_location(location) + location_fields = self._location_to_export_fields(location) func_str_id = self._intern_string(funcname) path_str_id = self._intern_string(filename) @@ -160,34 +161,19 @@ def _intern_string(self, value): return string_id @staticmethod - def _normalize_export_location(location): - if location is None: - return {"line": 0} - - if isinstance(location, int): - return {"line": max(location, 0)} - - if not isinstance(location, tuple): - lineno = getattr(location, "lineno", 0) - location = ( - lineno, - getattr(location, "end_lineno", lineno), - getattr(location, "col_offset", -1), - getattr(location, "end_col_offset", -1), - ) + def _location_to_export_fields(location): + lineno, end_lineno, col_offset, end_col_offset = normalize_location( + location + ) - lineno, end_lineno, col_offset, end_col_offset = location - if not isinstance(lineno, int) or lineno <= 0: - return {"line": 0} - - normalized = {"line": lineno} - if isinstance(end_lineno, int) and end_lineno > 0: - normalized["end_line"] = end_lineno - if isinstance(col_offset, int) and col_offset >= 0: - normalized["col"] = col_offset - if isinstance(end_col_offset, int) and end_col_offset >= 0: - normalized["end_col"] = end_col_offset - return normalized + fields = {"line": lineno} + if end_lineno > 0: + fields["end_line"] = end_lineno + if col_offset >= 0: + fields["col"] = col_offset + if end_col_offset >= 0: + fields["end_col"] = end_col_offset + return fields def _iter_final_agg_entries(self): for frame_record in self._frames: diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index e12ea44f566cb4..908ecb1464ae5b 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1955,6 +1955,11 @@ def test_extract_lineno_from_none(self): """Test extracting lineno from None (synthetic frames).""" self.assertEqual(extract_lineno(None), 0) + def test_normalize_location_with_int(self): + """Test normalize_location expands a legacy integer line number.""" + result = normalize_location(42) + self.assertEqual(result, (42, 42, -1, -1)) + def test_normalize_location_with_location_info(self): """Test normalize_location passes through LocationInfo.""" loc = LocationInfo(10, 15, 0, 5) From 5a59e0b5de7205a56d5a165d7b83295a22a9c9fb Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:20:48 +0200 Subject: [PATCH 24/38] helper --- .../test_sampling_profiler/test_collectors.py | 255 +++++++++--------- .../test_sampling_profiler/test_modes.py | 148 +--------- 2 files changed, 130 insertions(+), 273 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 908ecb1464ae5b..4f1c76de414c5e 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -58,6 +58,25 @@ def find_child_by_name(children, strings, substr): return None +def _jsonl_tables(records): + meta = next(record for record in records if record["type"] == "meta") + end = next(record for record in records if record["type"] == "end") + agg = next(record for record in records if record["type"] == "agg") + str_defs = { + item["str_id"]: item["value"] + for record in records + if record["type"] == "str_def" + for item in record["defs"] + } + frame_defs = [ + item + for record in records + if record["type"] == "frame_def" + for item in record["defs"] + ] + return meta, str_defs, frame_defs, agg, end + + class TestSampleProfilerComponents(unittest.TestCase): """Unit tests for individual profiler components.""" @@ -1666,14 +1685,12 @@ def test_diff_flamegraph_load_baseline(self): self.assertAlmostEqual(cold_node["diff"], -1.0) self.assertAlmostEqual(cold_node["diff_pct"], -50.0) - def test_jsonl_collector_export(self): + def test_jsonl_collector_export_exact_output(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(1000) - run_id = collector.run_id - - self.assertIsNotNone(run_id) + collector.run_id = "run-123" test_frames1 = [ MockInterpreterInfo( @@ -1705,46 +1722,74 @@ def test_jsonl_collector_export(self): collector.collect(test_frames2) collector.collect(test_frames3) - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) + collector.export(jsonl_out.name) - # Check file contents - with open(jsonl_out.name, "r") as f: + with open(jsonl_out.name, "r", encoding="utf-8") as f: content = f.read() - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "func1"}, - {"str_id": 2, "value": "file.py"}, - {"str_id": 3, "value": "func2"}, - {"str_id": 4, "value": "other_func"}, - {"str_id": 5, "value": "other.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 2, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 5, "func_str_id": 4, - "line": 5, "end_line": 5}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, - "entries": [{"frame_id": 1, "self": 2, "cumulative": 2}, - {"frame_id": 2, "self": 0, "cumulative": 2}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), - ] - - for exp in expected: - self.assertIn(exp, lines) + self.assertEqual( + content, + ( + '{"type":"meta","v":1,"run_id":"run-123","sample_interval_usec":1000}\n' + '{"type":"str_def","v":1,"run_id":"run-123","defs":[{"str_id":1,"value":"func1"},{"str_id":2,"value":"file.py"},{"str_id":3,"value":"func2"},{"str_id":4,"value":"other_func"},{"str_id":5,"value":"other.py"}]}\n' + '{"type":"frame_def","v":1,"run_id":"run-123","defs":[{"frame_id":1,"path_str_id":2,"func_str_id":1,"line":10,"end_line":10},{"frame_id":2,"path_str_id":2,"func_str_id":3,"line":20,"end_line":20},{"frame_id":3,"path_str_id":5,"func_str_id":4,"line":5,"end_line":5}]}\n' + '{"type":"agg","v":1,"run_id":"run-123","kind":"frame","scope":"final","samples_total":3,"entries":[{"frame_id":1,"self":2,"cumulative":2},{"frame_id":2,"self":0,"cumulative":2},{"frame_id":3,"self":1,"cumulative":1}]}\n' + '{"type":"end","v":1,"run_id":"run-123","samples_total":3}\n' + ), + ) + + def test_jsonl_collector_skip_idle_filters_threads(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + active_status = THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU + frames = [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [MockFrameInfo("active1.py", 10, "active_func1")], + status=active_status, + ), + MockThreadInfo( + 2, + [MockFrameInfo("idle.py", 20, "idle_func")], + status=0, + ), + MockThreadInfo( + 3, + [MockFrameInfo("active2.py", 30, "active_func2")], + status=active_status, + ), + ], + ) + ] + + def export_summary(skip_idle): + collector = JsonlCollector(1000, skip_idle=skip_idle) + collector.collect(frames) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, str_defs, frame_defs, agg_record, _ = _jsonl_tables(records) + paths = {str_defs[item["path_str_id"]] for item in frame_defs} + funcs = {str_defs[item["func_str_id"]] for item in frame_defs} + return paths, funcs, agg_record["samples_total"] + + paths, funcs, samples_total = export_summary(skip_idle=True) + self.assertEqual(paths, {"active1.py", "active2.py"}) + self.assertEqual(funcs, {"active_func1", "active_func2"}) + self.assertEqual(samples_total, 2) + + paths, funcs, samples_total = export_summary(skip_idle=False) + self.assertEqual(paths, {"active1.py", "idle.py", "active2.py"}) + self.assertEqual( + funcs, {"active_func1", "idle_func", "active_func2"} + ) + self.assertEqual(samples_total, 3) class TestRecursiveFunctionHandling(unittest.TestCase): @@ -2156,7 +2201,6 @@ def test_jsonl_collector_with_location_info(self): self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) - run_id = collector.run_id # Frame with LocationInfo frame = MockFrameInfo("test.py", 42, "my_function") @@ -2167,38 +2211,28 @@ def test_jsonl_collector_with_location_info(self): ] collector.collect(frames) - # Should extract lineno from location - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) + collector.export(jsonl_out.name) - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "my_function"}, - {"str_id": 2, "value": "test.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 42, "end_line": 42}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 1, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 1}), - ] - - for exp in expected: - self.assertIn(exp, lines) + meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "test.py") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "my_function") + self.assertEqual( + frame_defs[0], + { + "frame_id": 1, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 42, + "end_line": 42, + }, + ) def test_jsonl_collector_with_none_location(self): """Test JsonlCollector handles None location (synthetic frames).""" @@ -2206,7 +2240,6 @@ def test_jsonl_collector_with_none_location(self): self.addCleanup(close_and_unlink, jsonl_out) collector = JsonlCollector(sample_interval_usec=1000) - run_id = collector.run_id # Create frame with None location (like GC frame) frame = MockFrameInfo("~", 0, "") @@ -2219,38 +2252,28 @@ def test_jsonl_collector_with_none_location(self): ] collector.collect(frames) - # Should handle None location as synthetic frame - with captured_stdout(), captured_stderr(): - collector.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) + collector.export(jsonl_out.name) - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": ""}, - {"str_id": 2, "value": "~"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 0, "synthetic": True}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 1, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 1}), - ] + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] - for exp in expected: - self.assertIn(exp, lines) + meta, str_defs, frame_defs, agg, end = _jsonl_tables(records) + self.assertEqual(meta["sample_interval_usec"], 1000) + self.assertEqual(agg["samples_total"], 1) + self.assertEqual(end["samples_total"], 1) + self.assertEqual(len(frame_defs), 1) + self.assertEqual(str_defs[frame_defs[0]["path_str_id"]], "~") + self.assertEqual(str_defs[frame_defs[0]["func_str_id"]], "") + self.assertEqual( + frame_defs[0], + { + "frame_id": 1, + "path_str_id": frame_defs[0]["path_str_id"], + "func_str_id": frame_defs[0]["func_str_id"], + "line": 0, + "synthetic": True, + }, + ) class TestOpcodeHandling(unittest.TestCase): @@ -2484,18 +2507,7 @@ def test_jsonl_collector_frame_format(self): with open(f.name, "r", encoding="utf-8") as fp: records = [json.loads(line) for line in fp] - str_defs = { - item["str_id"]: item["value"] - for record in records - if record["type"] == "str_def" - for item in record["defs"] - } - frame_defs = [ - item - for record in records - if record["type"] == "frame_def" - for item in record["defs"] - ] + _, str_defs, frame_defs, _, _ = _jsonl_tables(records) self.assertEqual(len(frame_defs), 3) @@ -2662,18 +2674,7 @@ def test_jsonl_collector_filters_internal_frames(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - str_defs = { - item["str_id"]: item["value"] - for record in records - if record["type"] == "str_def" - for item in record["defs"] - } - frame_defs = [ - item - for record in records - if record["type"] == "frame_def" - for item in record["defs"] - ] + _, str_defs, frame_defs, _, _ = _jsonl_tables(records) paths = {str_defs[item["path_str_id"]] for item in frame_defs} diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 2bac26c37091b0..6cd636593e3db1 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -9,7 +9,6 @@ import profiling.sampling import profiling.sampling.sample from profiling.sampling.pstats_collector import PstatsCollector - from profiling.sampling.jsonl_collector import JsonlCollector from profiling.sampling.cli import main, _parse_mode from profiling.sampling.constants import PROFILING_MODE_EXCEPTION from _remote_debugging import ( @@ -21,13 +20,9 @@ "Test only runs when _remote_debugging is available" ) -from test.support import ( - captured_stdout, - captured_stderr, - requires_remote_subprocess_debugging, -) +from test.support import requires_remote_subprocess_debugging -from .helpers import close_and_unlink, test_subprocess +from .helpers import test_subprocess from .mocks import MockFrameInfo, MockInterpreterInfo @@ -233,145 +228,6 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) - def test_jsonl_collector_respects_skip_idle(self): - """Test that frames are actually filtered when skip_idle=True.""" - import tempfile - import json - - jsonl_out = tempfile.NamedTemporaryFile(delete=False) - self.addCleanup(close_and_unlink, jsonl_out) - - # Create mock frames with different thread statuses - class MockThreadInfoWithStatus: - def __init__(self, thread_id, frame_info, status): - self.thread_id = thread_id - self.frame_info = frame_info - self.status = status - - # Create test data: active thread (HAS_GIL | ON_CPU), idle thread (neither), and another active thread - ACTIVE_STATUS = ( - THREAD_STATUS_HAS_GIL | THREAD_STATUS_ON_CPU - ) # Has GIL and on CPU - IDLE_STATUS = 0 # Neither has GIL nor on CPU - - test_frames = [ - MockInterpreterInfo( - 0, - [ - MockThreadInfoWithStatus( - 1, - [MockFrameInfo("active1.py", 10, "active_func1")], - ACTIVE_STATUS, - ), - MockThreadInfoWithStatus( - 2, - [MockFrameInfo("idle.py", 20, "idle_func")], - IDLE_STATUS, - ), - MockThreadInfoWithStatus( - 3, - [MockFrameInfo("active2.py", 30, "active_func2")], - ACTIVE_STATUS, - ), - ], - ) - ] - - # Test with skip_idle=True - should only process running threads - collector_skip = JsonlCollector( - sample_interval_usec=1000, skip_idle=True - ) - collector_skip.collect(test_frames) - - run_id = collector_skip.run_id - - # Should only have functions from running threads (status 0) - with captured_stdout(), captured_stderr(): - collector_skip.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - def jsonl(obj): - return json.dumps(obj, separators=(",", ":")) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "active_func1"}, - {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "active_func2"}, - {"str_id": 4, "value": "active2.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 30, "end_line": 30}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 2, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 2}), - ] - - for exp in expected: - self.assertIn(exp, lines) - - # Test with skip_idle=False - should process all threads - collector_no_skip = JsonlCollector( - sample_interval_usec=1000, skip_idle=False - ) - collector_no_skip.collect(test_frames) - - run_id = collector_no_skip.run_id - - # Should have functions from all threads - with captured_stdout(), captured_stderr(): - collector_no_skip.export(jsonl_out.name) - - # Check file contents - with open(jsonl_out.name, "r") as f: - content = f.read() - - lines = content.strip().split("\n") - self.assertEqual(len(lines), 5) - - expected = [ - jsonl({"type": "meta", "v": 1, "run_id": run_id, - "sample_interval_usec": 1000}), - jsonl({"type": "str_def", "v": 1, "run_id": run_id, - "defs": [{"str_id": 1, "value": "active_func1"}, - {"str_id": 2, "value": "active1.py"}, - {"str_id": 3, "value": "idle_func"}, - {"str_id": 4, "value": "idle.py"}, - {"str_id": 5, "value": "active_func2"}, - {"str_id": 6, "value": "active2.py"}]}), - jsonl({"type": "frame_def", "v": 1, "run_id": run_id, - "defs": [{"frame_id": 1, "path_str_id": 2, "func_str_id": 1, - "line": 10, "end_line": 10}, - {"frame_id": 2, "path_str_id": 4, "func_str_id": 3, - "line": 20, "end_line": 20}, - {"frame_id": 3, "path_str_id": 6, "func_str_id": 5, - "line": 30, "end_line": 30}]}), - jsonl({"type": "agg", "v": 1, "run_id": run_id, - "kind": "frame", "scope": "final", "samples_total": 3, - "entries": [{"frame_id": 1, "self": 1, "cumulative": 1}, - {"frame_id": 2, "self": 1, "cumulative": 1}, - {"frame_id": 3, "self": 1, "cumulative": 1}]}), - jsonl({"type": "end", "v": 1, "run_id": run_id, - "samples_total": 3}), - ] - - for exp in expected: - self.assertIn(exp, lines) - - @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): """Test GIL mode filtering functionality (--mode=gil).""" From 192e54bd5415d8242068c5650a1e27795dadc285 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:31:15 +0200 Subject: [PATCH 25/38] reorder --- Lib/profiling/sampling/jsonl_collector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 6c8f2bc2fd3135..372205a566afc6 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -116,6 +116,15 @@ def _build_end_record(self): return record + def _iter_final_agg_entries(self): + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + yield { + "frame_id": frame_id, + "self": self._frame_self[frame_id], + "cumulative": self._frame_cumulative[frame_id], + } + def _get_or_create_frame_id(self, filename, location, funcname): synthetic = location is None location_fields = self._location_to_export_fields(location) @@ -175,15 +184,6 @@ def _location_to_export_fields(location): fields["end_col"] = end_col_offset return fields - def _iter_final_agg_entries(self): - for frame_record in self._frames: - frame_id = frame_record["frame_id"] - yield { - "frame_id": frame_id, - "self": self._frame_self[frame_id], - "cumulative": self._frame_cumulative[frame_id], - } - def _write_chunked_records( self, output, base_record, chunk_field, entries ): From 3189a8fe45b31a2949b27cde2b7f4ae6d2f06cd5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:46:44 +0200 Subject: [PATCH 26/38] eh, just copy from heatmap --- Lib/profiling/sampling/jsonl_collector.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 372205a566afc6..146075b00b121c 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -43,6 +43,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._frame_self = Counter() self._frame_cumulative = Counter() self._samples_total = 0 + self._seen_frame_ids = set() self._mode = mode @@ -51,17 +52,21 @@ def process_frames(self, frames, _thread_id, weight=1): return self._samples_total += weight + self._seen_frame_ids.clear() - frame_ids = [ - self._get_or_create_frame_id(filename, location, funcname) - for filename, location, funcname, _opcode in frames - ] - leaf_frame_id = frame_ids[0] + for i, (filename, location, funcname, _opcode) in enumerate(frames): + frame_id = self._get_or_create_frame_id(filename, location, funcname) + is_leaf = (i == 0) + count_cumulative = frame_id not in self._seen_frame_ids - self._frame_self[leaf_frame_id] += weight + if count_cumulative: + self._seen_frame_ids.add(frame_id) - for frame_id in set(frame_ids): - self._frame_cumulative[frame_id] += weight + if is_leaf: + self._frame_self[frame_id] += weight + + if count_cumulative: + self._frame_cumulative[frame_id] += weight def export(self, filename): with open(filename, "w", encoding="utf-8") as output: From 935779f072ede6cefa1759ae30d224347819f699 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:54:14 +0200 Subject: [PATCH 27/38] smaller chunk; matter of taste --- Lib/profiling/sampling/jsonl_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 146075b00b121c..12d8e4b3e2e77b 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -16,7 +16,7 @@ from .stack_collector import StackTraceCollector -_CHUNK_SIZE = 1000 +_CHUNK_SIZE = 256 _MODE_NAMES = { PROFILING_MODE_WALL: "wall", From e3d8aff3de38a327ff3bab9f8b3b3070232f1764 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:54:27 +0200 Subject: [PATCH 28/38] test actual chunking --- .../test_sampling_profiler/test_collectors.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 4f1c76de414c5e..2988a2efe21eaa 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1791,6 +1791,49 @@ def export_summary(skip_idle): ) self.assertEqual(samples_total, 3) + def test_jsonl_collector_splits_large_exports_into_chunks(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + + for i in range(257): + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [MockFrameInfo(f"file{i}.py", i + 1, f"func{i}")], + ) + ], + ) + ] + ) + + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + run_ids = {record["run_id"] for record in records} + self.assertEqual(len(run_ids), 1) + self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") + + _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables(records) + str_chunks = [record for record in records if record["type"] == "str_def"] + frame_chunks = [record for record in records if record["type"] == "frame_def"] + agg_chunks = [record for record in records if record["type"] == "agg"] + + self.assertEqual([len(record["defs"]) for record in str_chunks], [256, 256, 2]) + self.assertEqual([len(record["defs"]) for record in frame_chunks], [256, 1]) + self.assertEqual([len(record["entries"]) for record in agg_chunks], [256, 1]) + self.assertEqual(len(str_defs), 514) + self.assertEqual(len(frame_defs), 257) + self.assertEqual(agg_record["samples_total"], 257) + self.assertEqual(end_record["samples_total"], 257) + class TestRecursiveFunctionHandling(unittest.TestCase): """Tests for correct handling of recursive functions in cumulative stats.""" From d37f07a26cd36aa854927a778d8f3c7cdecc7015 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:00:29 +0200 Subject: [PATCH 29/38] test edge cases --- .../test_sampling_profiler/test_collectors.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 2988a2efe21eaa..a2e1f85c45d680 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1738,6 +1738,80 @@ def test_jsonl_collector_export_exact_output(self): ), ) + def test_jsonl_collector_export_includes_mode_in_meta(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000, mode=PROFILING_MODE_CPU) + collector.collect( + [ + MockInterpreterInfo( + 0, + [MockThreadInfo(1, [MockFrameInfo("file.py", 10, "func")])], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + meta_record = next(record for record in records if record["type"] == "meta") + self.assertEqual(meta_record["mode"], "cpu") + + def test_jsonl_collector_export_empty_profile(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.run_id = "run-123" + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + self.assertEqual([record["type"] for record in records], ["meta", "end"]) + self.assertEqual(records[0]["sample_interval_usec"], 1000) + self.assertEqual(records[0]["run_id"], "run-123") + self.assertEqual(records[1]["samples_total"], 0) + self.assertEqual(records[1]["run_id"], "run-123") + + def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): + jsonl_out = tempfile.NamedTemporaryFile(delete=False) + self.addCleanup(close_and_unlink, jsonl_out) + + collector = JsonlCollector(1000) + collector.collect( + [ + MockInterpreterInfo( + 0, + [ + MockThreadInfo( + 1, + [ + MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo("recursive.py", 10, "recursive_func"), + ], + ) + ], + ) + ] + ) + collector.export(jsonl_out.name) + + with open(jsonl_out.name, "r", encoding="utf-8") as f: + records = [json.loads(line) for line in f] + + _, _, frame_defs, agg_record, end_record = _jsonl_tables(records) + self.assertEqual(len(frame_defs), 1) + self.assertEqual( + agg_record["entries"], + [{"frame_id": frame_defs[0]["frame_id"], "self": 1, "cumulative": 1}], + ) + self.assertEqual(agg_record["samples_total"], 1) + self.assertEqual(end_record["samples_total"], 1) + def test_jsonl_collector_skip_idle_filters_threads(self): jsonl_out = tempfile.NamedTemporaryFile(delete=False) self.addCleanup(close_and_unlink, jsonl_out) From aaaa9722d4ced736842227882ee43bd47cbc3b96 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:05:30 +0200 Subject: [PATCH 30/38] ruff --- Lib/profiling/sampling/jsonl_collector.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 12d8e4b3e2e77b..a1d37df85c2672 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -55,8 +55,10 @@ def process_frames(self, frames, _thread_id, weight=1): self._seen_frame_ids.clear() for i, (filename, location, funcname, _opcode) in enumerate(frames): - frame_id = self._get_or_create_frame_id(filename, location, funcname) - is_leaf = (i == 0) + frame_id = self._get_or_create_frame_id( + filename, location, funcname + ) + is_leaf = i == 0 count_cumulative = frame_id not in self._seen_frame_ids if count_cumulative: From a9b6ccd58ddfa464f130bcd27ec53f43163eb1e5 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:16:37 +0200 Subject: [PATCH 31/38] match pep8 --- Lib/test/test_profiling/test_sampling_profiler/test_modes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py index 6cd636593e3db1..0b38fb4ad4bcf6 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_modes.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_modes.py @@ -228,6 +228,7 @@ def test_cpu_mode_with_no_samples(self): self.assertIn("No samples were collected", output) self.assertIn("CPU mode", output) + @requires_remote_subprocess_debugging() class TestGilModeFiltering(unittest.TestCase): """Test GIL mode filtering functionality (--mode=gil).""" From 4fb3ade939080abaf0239ec9df7e6d43afd3b0af Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:44:27 +0200 Subject: [PATCH 32/38] style --- Lib/profiling/sampling/binary_reader.py | 2 +- .../test_sampling_profiler/test_collectors.py | 124 +++++++++++++----- 2 files changed, 94 insertions(+), 32 deletions(-) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index 8d1d8eef9155eb..a29dad91ae339d 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -118,7 +118,7 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) - elif output_format == 'jsonl': + elif output_format == "jsonl": collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index a2e1f85c45d680..915468141a9217 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -1697,7 +1697,11 @@ def test_jsonl_collector_export_exact_output(self): 0, [ MockThreadInfo( - 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], ) ], ) @@ -1707,14 +1711,23 @@ def test_jsonl_collector_export_exact_output(self): 0, [ MockThreadInfo( - 1, [MockFrameInfo("file.py", 10, "func1"), MockFrameInfo("file.py", 20, "func2")] + 1, + [ + MockFrameInfo("file.py", 10, "func1"), + MockFrameInfo("file.py", 20, "func2"), + ], ) ], ) ] # Same stack test_frames3 = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [MockFrameInfo("other.py", 5, "other_func")])] + 0, + [ + MockThreadInfo( + 1, [MockFrameInfo("other.py", 5, "other_func")] + ) + ], ) ] @@ -1747,7 +1760,11 @@ def test_jsonl_collector_export_includes_mode_in_meta(self): [ MockInterpreterInfo( 0, - [MockThreadInfo(1, [MockFrameInfo("file.py", 10, "func")])], + [ + MockThreadInfo( + 1, [MockFrameInfo("file.py", 10, "func")] + ) + ], ) ] ) @@ -1756,7 +1773,9 @@ def test_jsonl_collector_export_includes_mode_in_meta(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - meta_record = next(record for record in records if record["type"] == "meta") + meta_record = next( + record for record in records if record["type"] == "meta" + ) self.assertEqual(meta_record["mode"], "cpu") def test_jsonl_collector_export_empty_profile(self): @@ -1770,7 +1789,9 @@ def test_jsonl_collector_export_empty_profile(self): with open(jsonl_out.name, "r", encoding="utf-8") as f: records = [json.loads(line) for line in f] - self.assertEqual([record["type"] for record in records], ["meta", "end"]) + self.assertEqual( + [record["type"] for record in records], ["meta", "end"] + ) self.assertEqual(records[0]["sample_interval_usec"], 1000) self.assertEqual(records[0]["run_id"], "run-123") self.assertEqual(records[1]["samples_total"], 0) @@ -1789,9 +1810,15 @@ def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): MockThreadInfo( 1, [ - MockFrameInfo("recursive.py", 10, "recursive_func"), - MockFrameInfo("recursive.py", 10, "recursive_func"), - MockFrameInfo("recursive.py", 10, "recursive_func"), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), + MockFrameInfo( + "recursive.py", 10, "recursive_func" + ), ], ) ], @@ -1807,7 +1834,13 @@ def test_jsonl_collector_recursive_frames_counted_once_per_sample(self): self.assertEqual(len(frame_defs), 1) self.assertEqual( agg_record["entries"], - [{"frame_id": frame_defs[0]["frame_id"], "self": 1, "cumulative": 1}], + [ + { + "frame_id": frame_defs[0]["frame_id"], + "self": 1, + "cumulative": 1, + } + ], ) self.assertEqual(agg_record["samples_total"], 1) self.assertEqual(end_record["samples_total"], 1) @@ -1860,9 +1893,7 @@ def export_summary(skip_idle): paths, funcs, samples_total = export_summary(skip_idle=False) self.assertEqual(paths, {"active1.py", "idle.py", "active2.py"}) - self.assertEqual( - funcs, {"active_func1", "idle_func", "active_func2"} - ) + self.assertEqual(funcs, {"active_func1", "idle_func", "active_func2"}) self.assertEqual(samples_total, 3) def test_jsonl_collector_splits_large_exports_into_chunks(self): @@ -1879,7 +1910,11 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): [ MockThreadInfo( 1, - [MockFrameInfo(f"file{i}.py", i + 1, f"func{i}")], + [ + MockFrameInfo( + f"file{i}.py", i + 1, f"func{i}" + ) + ], ) ], ) @@ -1895,14 +1930,26 @@ def test_jsonl_collector_splits_large_exports_into_chunks(self): self.assertEqual(len(run_ids), 1) self.assertRegex(next(iter(run_ids)), r"^[0-9a-f]{32}$") - _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables(records) - str_chunks = [record for record in records if record["type"] == "str_def"] - frame_chunks = [record for record in records if record["type"] == "frame_def"] + _, str_defs, frame_defs, agg_record, end_record = _jsonl_tables( + records + ) + str_chunks = [ + record for record in records if record["type"] == "str_def" + ] + frame_chunks = [ + record for record in records if record["type"] == "frame_def" + ] agg_chunks = [record for record in records if record["type"] == "agg"] - self.assertEqual([len(record["defs"]) for record in str_chunks], [256, 256, 2]) - self.assertEqual([len(record["defs"]) for record in frame_chunks], [256, 1]) - self.assertEqual([len(record["entries"]) for record in agg_chunks], [256, 1]) + self.assertEqual( + [len(record["defs"]) for record in str_chunks], [256, 256, 2] + ) + self.assertEqual( + [len(record["defs"]) for record in frame_chunks], [256, 1] + ) + self.assertEqual( + [len(record["entries"]) for record in agg_chunks], [256, 1] + ) self.assertEqual(len(str_defs), 514) self.assertEqual(len(frame_defs), 257) self.assertEqual(agg_record["samples_total"], 257) @@ -2071,7 +2118,9 @@ def test_pstats_collector_cumulative_percentage_cannot_exceed_100(self): cumulative_calls = stats[1] self.assertEqual(cumulative_calls, 10) - def test_pstats_collector_different_lines_same_function_counted_separately(self): + def test_pstats_collector_different_lines_same_function_counted_separately( + self, + ): """Test that different line numbers in same function are tracked separately.""" collector = PstatsCollector(sample_interval_usec=1000) @@ -2278,8 +2327,7 @@ def test_flamegraph_collector_with_location_info(self): frame = MockFrameInfo("app.py", 100, "process_data") frames = [ MockInterpreterInfo( - 0, - [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2287,8 +2335,15 @@ def test_flamegraph_collector_with_location_info(self): data = collector._convert_to_flamegraph_format() # Verify the function name includes lineno from location strings = data.get("strings", []) - name_found = any("process_data" in s and "100" in s for s in strings if isinstance(s, str)) - self.assertTrue(name_found, f"Expected to find 'process_data' with line 100 in {strings}") + name_found = any( + "process_data" in s and "100" in s + for s in strings + if isinstance(s, str) + ) + self.assertTrue( + name_found, + f"Expected to find 'process_data' with line 100 in {strings}", + ) def test_gecko_collector_with_location_info(self): """Test GeckoCollector handles LocationInfo properly.""" @@ -2297,8 +2352,7 @@ def test_gecko_collector_with_location_info(self): frame = MockFrameInfo("server.py", 50, "handle_request") frames = [ MockInterpreterInfo( - 0, - [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2561,8 +2615,12 @@ def _make_sample_frames(self): 1, [ MockFrameInfo("app.py", 100, "main", opcode=90), - MockFrameInfo("utils.py", 50, "helper", opcode=100), - MockFrameInfo("lib.py", 25, "process", opcode=None), + MockFrameInfo( + "utils.py", 50, "helper", opcode=100 + ), + MockFrameInfo( + "lib.py", 25, "process", opcode=None + ), ], status=THREAD_STATUS_HAS_GIL, ) @@ -2720,7 +2778,9 @@ def test_flamegraph_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo( + "/lib/_sync_coordinator.py", 100, "main" + ), MockFrameInfo("", 87, "_run_code"), ], status=THREAD_STATUS_HAS_GIL, @@ -2748,7 +2808,9 @@ def test_collapsed_stack_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), + MockFrameInfo( + "/lib/_sync_coordinator.py", 100, "main" + ), ], status=THREAD_STATUS_HAS_GIL, ) From a0decb5d8b34072fdf8f70cd4276ace9b78e7380 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:49:26 +0200 Subject: [PATCH 33/38] too defensive --- Lib/profiling/sampling/jsonl_collector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index a1d37df85c2672..187c4175da6816 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -48,9 +48,6 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): self._mode = mode def process_frames(self, frames, _thread_id, weight=1): - if not frames: - return - self._samples_total += weight self._seen_frame_ids.clear() From 5f1704b87d756066d94d94fd7c2f861c107b40ac Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:16:15 +0200 Subject: [PATCH 34/38] too many style changes --- .../test_sampling_profiler/test_collectors.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 915468141a9217..3c2ce8c66f7570 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2118,9 +2118,7 @@ def test_pstats_collector_cumulative_percentage_cannot_exceed_100(self): cumulative_calls = stats[1] self.assertEqual(cumulative_calls, 10) - def test_pstats_collector_different_lines_same_function_counted_separately( - self, - ): + def test_pstats_collector_different_lines_same_function_counted_separately(self): """Test that different line numbers in same function are tracked separately.""" collector = PstatsCollector(sample_interval_usec=1000) @@ -2327,7 +2325,8 @@ def test_flamegraph_collector_with_location_info(self): frame = MockFrameInfo("app.py", 100, "process_data") frames = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) @@ -2335,15 +2334,8 @@ def test_flamegraph_collector_with_location_info(self): data = collector._convert_to_flamegraph_format() # Verify the function name includes lineno from location strings = data.get("strings", []) - name_found = any( - "process_data" in s and "100" in s - for s in strings - if isinstance(s, str) - ) - self.assertTrue( - name_found, - f"Expected to find 'process_data' with line 100 in {strings}", - ) + name_found = any("process_data" in s and "100" in s for s in strings if isinstance(s, str)) + self.assertTrue(name_found, f"Expected to find 'process_data' with line 100 in {strings}") def test_gecko_collector_with_location_info(self): """Test GeckoCollector handles LocationInfo properly.""" @@ -2352,7 +2344,8 @@ def test_gecko_collector_with_location_info(self): frame = MockFrameInfo("server.py", 50, "handle_request") frames = [ MockInterpreterInfo( - 0, [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] + 0, + [MockThreadInfo(1, [frame], status=THREAD_STATUS_HAS_GIL)] ) ] collector.collect(frames) From f2a21fb0108e0d92e96dd0768236cebd4d005cce Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:18:14 +0200 Subject: [PATCH 35/38] less style --- .../test_sampling_profiler/test_collectors.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 3c2ce8c66f7570..833800c163c146 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2608,12 +2608,8 @@ def _make_sample_frames(self): 1, [ MockFrameInfo("app.py", 100, "main", opcode=90), - MockFrameInfo( - "utils.py", 50, "helper", opcode=100 - ), - MockFrameInfo( - "lib.py", 25, "process", opcode=None - ), + MockFrameInfo("utils.py", 50, "helper", opcode=100), + MockFrameInfo("lib.py", 25, "process", opcode=None), ], status=THREAD_STATUS_HAS_GIL, ) @@ -2801,9 +2797,7 @@ def test_collapsed_stack_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo( - "/lib/_sync_coordinator.py", 100, "main" - ), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), ], status=THREAD_STATUS_HAS_GIL, ) From 15b07badef5755278f0ea57fe28692423498c7e6 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:20:22 +0200 Subject: [PATCH 36/38] ha! even less style... --- .../test_profiling/test_sampling_profiler/test_collectors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py index 833800c163c146..f96304b1f3443a 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py @@ -2767,9 +2767,7 @@ def test_flamegraph_collector_filters_internal_frames(self): 1, [ MockFrameInfo("app.py", 50, "run"), - MockFrameInfo( - "/lib/_sync_coordinator.py", 100, "main" - ), + MockFrameInfo("/lib/_sync_coordinator.py", 100, "main"), MockFrameInfo("", 87, "_run_code"), ], status=THREAD_STATUS_HAS_GIL, From 148f4e21d4c27d9e4c25e46d026ad4f94061e236 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:33:18 +0200 Subject: [PATCH 37/38] news --- .../Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst new file mode 100644 index 00000000000000..d2d7e0d98d158b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -0,0 +1,3 @@ +The ``profiling.sampling`` module now supports JSONL output format via +`--jsonl`. Each run emits newline-delimited JSON records suitable for +streaming or agents. From 69c576826256dc48472be37d6c93e53ba1628889 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:33:34 +0200 Subject: [PATCH 38/38] news: proper formatting --- .../next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst index d2d7e0d98d158b..d270cc14288d8a 100644 --- a/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst +++ b/Misc/NEWS.d/next/Library/2026-03-31-17-33-10.gh-issue-146256.Nm_Ke_.rst @@ -1,3 +1,3 @@ The ``profiling.sampling`` module now supports JSONL output format via -`--jsonl`. Each run emits newline-delimited JSON records suitable for +``--jsonl``. Each run emits newline-delimited JSON records suitable for streaming or agents.