-
-
Notifications
You must be signed in to change notification settings - Fork 34.3k
gh-146256: Add --jsonl collector to the profiling.sampling
#146257
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2749a44
f13d34c
c15d318
6a0ea81
cb27fc0
59cbb4a
25c6922
67cd39a
7c85d47
3eddae8
f71252e
9836ffa
c183109
f20eb52
546ce90
350ad99
942d821
bd9aefe
311a4e3
749a868
85ce978
820d3b9
aad4b18
da3e754
cb6ed34
5a59e0b
192e54b
3189a8f
935779f
e3d8aff
d37f07a
aaaa972
a9b6ccd
4fb3ade
a0decb5
5f1704b
f2a21fb
15b07ba
148f4e2
69c5768
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,13 +20,16 @@ def normalize_location(location): | |
| """Normalize location to a 4-tuple format. | ||
| Args: | ||
| location: tuple (lineno, end_lineno, col_offset, end_col_offset) or None | ||
| location: tuple (lineno, end_lineno, col_offset, end_col_offset), | ||
| an integer line number, or None | ||
| Returns: | ||
| tuple: (lineno, end_lineno, col_offset, end_col_offset) | ||
| """ | ||
| if location is None: | ||
| return DEFAULT_LOCATION | ||
| if isinstance(location, int): | ||
| return (location, location, -1, -1) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is now handled per collector:
I don't know if it's necessary but I simply didn't want to make https://github.com/python/cpython/pull/146257/changes#diff-58ccdb8421c89943862c73d1cbeae3e961873b55ed2adb7efc875dafd549c01bR177 too defensive. |
||
| return location | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,200 @@ | ||
| """JSONL collector.""" | ||
|
|
||
| from collections import Counter | ||
| import json | ||
| import uuid | ||
| from itertools import batched | ||
|
|
||
| from .constants import ( | ||
| PROFILING_MODE_ALL, | ||
| PROFILING_MODE_CPU, | ||
| PROFILING_MODE_EXCEPTION, | ||
| PROFILING_MODE_GIL, | ||
| PROFILING_MODE_WALL, | ||
| ) | ||
| from .collector import normalize_location | ||
| from .stack_collector import StackTraceCollector | ||
|
|
||
|
|
||
# Maximum number of definition/aggregate entries packed into one JSONL
# record.  NOTE(review): the collector is not streaming yet; measured
# values were on par across 256-512 — confirm once streaming lands.
_CHUNK_SIZE = 256


# Human-readable names for the profiling-mode constants, used when
# emitting the "mode" field of the "meta" record.
_MODE_NAMES = {
    PROFILING_MODE_WALL: "wall",
    PROFILING_MODE_CPU: "cpu",
    PROFILING_MODE_GIL: "gil",
    PROFILING_MODE_ALL: "all",
    PROFILING_MODE_EXCEPTION: "exception",
}
|
|
||
|
|
||
class JsonlCollector(StackTraceCollector):
    """Collector that exports finalized profiling data as JSONL.

    The output file holds one JSON object per line: a "meta" record
    describing the run, chunked "str_def" and "frame_def" records for
    the interned strings and frames, chunked "agg" records carrying
    per-frame self/cumulative sample counts, and a closing "end"
    record with the total sample count.
    """
    # NOTE(review): maybe collectors should be kept separate from
    # renderers — this class currently does both jobs.

    def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None):
        """Initialize the collector.

        Args:
            sample_interval_usec: sampling interval, forwarded to the
                StackTraceCollector base class.
            skip_idle: forwarded to the StackTraceCollector base class.
            mode: optional profiling-mode constant; when given, its
                name is included in the "meta" record.
        """
        super().__init__(sample_interval_usec, skip_idle=skip_idle)
        # Random identifier tying together all records of this run.
        self.run_id = uuid.uuid4().hex

        # String interning: value -> str_id, plus ordered def records.
        self._string_to_id = {}
        self._strings = []

        # Frame interning: frame key tuple -> frame_id, plus records.
        self._frame_to_id = {}
        self._frames = []

        # Per-frame sample counters and the running sample total.
        self._frame_self = Counter()
        self._frame_cumulative = Counter()
        self._samples_total = 0
        # Scratch set, reused per sample, to deduplicate cumulative
        # increments for frames that appear more than once in a stack.
        self._seen_frame_ids = set()

        self._mode = mode

    def process_frames(self, frames, _thread_id, weight=1):
        """Accumulate one sampled stack with the given weight.

        ``frames`` is iterated leaf-first: index 0 is the innermost
        frame and receives the "self" count.  Every distinct frame in
        the stack receives at most one "cumulative" increment per
        sample, so recursion is not over-counted.
        """
        self._samples_total += weight
        self._seen_frame_ids.clear()

        for i, (filename, location, funcname, _opcode) in enumerate(frames):
            frame_id = self._get_or_create_frame_id(
                filename, location, funcname
            )
            is_leaf = i == 0
            count_cumulative = frame_id not in self._seen_frame_ids

            if count_cumulative:
                self._seen_frame_ids.add(frame_id)

            if is_leaf:
                self._frame_self[frame_id] += weight

            if count_cumulative:
                self._frame_cumulative[frame_id] += weight

    def export(self, filename):
        """Write all collected data to *filename* as JSONL."""
        with open(filename, "w", encoding="utf-8") as output:
            self._write_message(output, self._build_meta_record())
            self._write_chunked_records(
                output,
                {"type": "str_def", "v": 1, "run_id": self.run_id},
                "defs",
                self._strings,
            )
            self._write_chunked_records(
                output,
                {"type": "frame_def", "v": 1, "run_id": self.run_id},
                "defs",
                self._frames,
            )
            self._write_chunked_records(
                output,
                {
                    "type": "agg",
                    "v": 1,
                    "run_id": self.run_id,
                    # NOTE(review): "kind"/"scope" exist to keep the
                    # format future-proof for other aggregation kinds
                    # and scopes (e.g. streaming/partial aggregates).
                    "kind": "frame",
                    "scope": "final",
                    "samples_total": self._samples_total,
                },
                "entries",
                self._iter_final_agg_entries(),
            )
            self._write_message(output, self._build_end_record())

    def _build_meta_record(self):
        """Return the run-level "meta" record."""
        record = {
            "type": "meta",
            "v": 1,
            "run_id": self.run_id,
            "sample_interval_usec": self.sample_interval_usec,
        }

        if self._mode is not None:
            # Fall back to str(mode) for unrecognized mode constants.
            record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode))

        return record

    def _build_end_record(self):
        """Return the closing "end" record."""
        record = {
            "type": "end",
            "v": 1,
            "run_id": self.run_id,
            "samples_total": self._samples_total,
        }

        return record

    def _iter_final_agg_entries(self):
        """Yield one self/cumulative entry per interned frame."""
        for frame_record in self._frames:
            frame_id = frame_record["frame_id"]
            yield {
                "frame_id": frame_id,
                "self": self._frame_self[frame_id],
                "cumulative": self._frame_cumulative[frame_id],
            }

    def _get_or_create_frame_id(self, filename, location, funcname):
        """Intern a frame and return its numeric id."""
        # A frame with no location information at all is synthetic.
        synthetic = location is None
        location_fields = self._location_to_export_fields(location)
        func_str_id = self._intern_string(funcname)
        path_str_id = self._intern_string(filename)

        # Key on the exported fields so frames that serialize
        # identically share one id.
        frame_key = (
            path_str_id,
            func_str_id,
            location_fields["line"],
            location_fields.get("end_line"),
            location_fields.get("col"),
            location_fields.get("end_col"),
            synthetic,
        )

        if (frame_id := self._frame_to_id.get(frame_key)) is not None:
            return frame_id

        # NOTE(review): ids are 1-based here — confirm whether the
        # format should use 0- or 1-based indexing.
        frame_id = len(self._frames) + 1

        frame_record = {
            "frame_id": frame_id,
            "path_str_id": path_str_id,
            "func_str_id": func_str_id,
            **location_fields,
        }
        if synthetic:
            frame_record["synthetic"] = True

        self._frame_to_id[frame_key] = frame_id
        self._frames.append(frame_record)
        return frame_id

    def _intern_string(self, value):
        """Intern *value* (coerced to str) and return its numeric id."""
        value = str(value)

        if (string_id := self._string_to_id.get(value)) is not None:
            return string_id

        # NOTE(review): ids are 1-based here — confirm whether the
        # format should use 0- or 1-based indexing.
        string_id = len(self._strings) + 1

        self._string_to_id[value] = string_id
        self._strings.append({"str_id": string_id, "value": value})
        return string_id

    @staticmethod
    def _location_to_export_fields(location):
        """Map a raw location to the JSON fields of a frame record.

        Only "line" is always present; "end_line", "col" and "end_col"
        are omitted when the normalized location carries no usable
        value for them.
        """
        lineno, end_lineno, col_offset, end_col_offset = normalize_location(
            location
        )

        # NOTE(review): field naming is still open ("line" vs
        # "lineno"), as is the exact sentinel convention — see
        # constants.py in this package.
        fields = {"line": lineno}

        if end_lineno > 0:
            fields["end_line"] = end_lineno
        if col_offset >= 0:
            fields["col"] = col_offset
        if end_col_offset >= 0:
            fields["end_col"] = end_col_offset
        return fields

    def _write_chunked_records(
        self, output, base_record, chunk_field, entries
    ):
        """Write *entries* as records of at most _CHUNK_SIZE items each."""
        for chunk in batched(entries, _CHUNK_SIZE):
            self._write_message(output, {**base_record, chunk_field: chunk})

    @staticmethod
    def _write_message(output, record):
        """Write one compact JSON record followed by a newline."""
        output.write(json.dumps(record, separators=(",", ":")))
        output.write("\n")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is already very complex. The collector constructor signature supports all collectors at once.
I've added `mode` for the purpose of `meta`, but I don't think this scales to other modes. (Truth be told, I think that complex signatures are also the underlying reason for the issue fixed by #145459.)