Skip to content

Commit 2ff410c

Browse files
Optimize hot path performance for type hint conversion
- ResultSet: Pre-compute the column_type_hints tuple once in _process_metadata instead of per-cell dict creation and .lower() lookup. Replace **({} if ... else {}) with simple if/else branching. Applied to AthenaResultSet, AthenaDictResultSet, and S3FS.
- Array JSON guard: Add a JSON detection heuristic (check for '"', '[{', '[null') before json.loads in _convert_typed_array, matching the existing pattern in map/struct, to avoid JSONDecodeError exceptions on native format strings.
- TypeNode field lookup: Add a cached _field_type_map dict for O(1) name-based field type resolution, replacing the O(n) list.index() in _get_field_type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3140978 commit 2ff410c

3 files changed

Lines changed: 106 additions & 77 deletions

File tree

pyathena/parser.py

Lines changed: 43 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,23 @@ class TypeNode:
5858
type_name: str
5959
children: list[TypeNode] = field(default_factory=list)
6060
field_names: list[str] | None = None
61+
_field_type_map: dict[str, TypeNode] | None = field(default=None, repr=False)
62+
63+
def get_field_type(self, name: str) -> TypeNode | None:
64+
"""Look up a child type node by field name using a cached dict.
65+
66+
Returns:
67+
The TypeNode for the named field, or None if not found.
68+
"""
69+
if self._field_type_map is None and self.field_names:
70+
self._field_type_map = {
71+
fn: self.children[i]
72+
for i, fn in enumerate(self.field_names)
73+
if i < len(self.children)
74+
}
75+
if self._field_type_map:
76+
return self._field_type_map.get(name)
77+
return None
6178

6279

6380
class TypeSignatureParser:
@@ -260,16 +277,20 @@ def _convert_typed_array(self, value: str, type_node: TypeNode) -> list[Any] | N
260277

261278
element_type = type_node.children[0] if type_node.children else TypeNode("varchar")
262279

263-
# Try JSON first
264-
try:
265-
parsed = json.loads(value)
266-
if isinstance(parsed, list):
267-
return [
268-
None if elem is None else self.convert(self._to_json_str(elem), element_type)
269-
for elem in parsed
270-
]
271-
except json.JSONDecodeError:
272-
pass
280+
# Try JSON first (only if content looks like JSON)
281+
inner_preview = value[1:10] if len(value) > 10 else value[1:-1]
282+
if '"' in inner_preview or value.startswith(("[{", "[null")):
283+
try:
284+
parsed = json.loads(value)
285+
if isinstance(parsed, list):
286+
return [
287+
None
288+
if elem is None
289+
else self.convert(self._to_json_str(elem), element_type)
290+
for elem in parsed
291+
]
292+
except json.JSONDecodeError:
293+
pass
273294

274295
# Native format
275296
inner = value[1:-1].strip()
@@ -376,7 +397,6 @@ def _convert_typed_struct(self, value: str, type_node: TypeNode) -> dict[str, An
376397
if not (value.startswith("{") and value.endswith("}")):
377398
return None
378399

379-
field_names = type_node.field_names or []
380400
field_types = type_node.children or []
381401

382402
# Try JSON first
@@ -387,7 +407,7 @@ def _convert_typed_struct(self, value: str, type_node: TypeNode) -> dict[str, An
387407
if isinstance(parsed, dict):
388408
result: dict[str, Any] = {}
389409
for i, (k, v) in enumerate(parsed.items()):
390-
ft = self._get_field_type(k, field_names, field_types, i)
410+
ft = self._get_field_type(k, type_node, i)
391411
result[k] = (
392412
self.convert(self._to_json_str(v), ft) if v is not None else None
393413
)
@@ -413,7 +433,7 @@ def _convert_typed_struct(self, value: str, type_node: TypeNode) -> dict[str, An
413433
if any(char in k for char in '{}="'):
414434
continue
415435

416-
ft = self._get_field_type(k, field_names, field_types, field_index)
436+
ft = self._get_field_type(k, type_node, field_index)
417437
field_index += 1
418438

419439
if v.startswith("{") and v.endswith("}"):
@@ -428,6 +448,7 @@ def _convert_typed_struct(self, value: str, type_node: TypeNode) -> dict[str, An
428448
return result if result else None
429449

430450
# Unnamed struct
451+
field_names = type_node.field_names or []
431452
values = [v.strip() for v in inner.split(",")]
432453
result = {}
433454
for i, v in enumerate(values):
@@ -436,30 +457,29 @@ def _convert_typed_struct(self, value: str, type_node: TypeNode) -> dict[str, An
436457
result[name] = self._convert_element(v, ft)
437458
return result
438459

460+
@staticmethod
439461
def _get_field_type(
440-
self,
441462
field_name: str,
442-
field_names: list[str],
443-
field_types: list[TypeNode],
463+
type_node: TypeNode,
444464
field_index: int,
445465
) -> TypeNode:
446466
"""Look up the type for a struct field by name or index.
447467
448-
Tries name-based lookup first, then falls back to positional index.
468+
Uses the TypeNode's cached dict for O(1) name lookup, then falls
469+
back to positional index.
449470
450471
Args:
451472
field_name: Name of the field to look up.
452-
field_names: List of known field names from the type hint.
453-
field_types: List of corresponding field types.
473+
type_node: The parent row/struct TypeNode.
454474
field_index: Current positional index as fallback.
455475
456476
Returns:
457477
TypeNode for the field, defaulting to varchar if not found.
458478
"""
459-
if field_name in field_names:
460-
idx = field_names.index(field_name)
461-
if idx < len(field_types):
462-
return field_types[idx]
479+
ft = type_node.get_field_type(field_name)
480+
if ft is not None:
481+
return ft
482+
field_types = type_node.children or []
463483
if field_index < len(field_types):
464484
return field_types[field_index]
465485
return TypeNode("varchar")

pyathena/result_set.py

Lines changed: 40 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ def __init__(
8383
)
8484

8585
self._metadata: tuple[dict[str, Any], ...] | None = None
86+
self._column_type_hints: tuple[str | None, ...] | None = None
8687
self._rows: collections.deque[tuple[Any | None, ...] | dict[Any, Any | None]] = (
8788
collections.deque()
8889
)
@@ -424,6 +425,10 @@ def _process_metadata(self, response: dict[str, Any]) -> None:
424425
if column_info is None:
425426
raise DataError("KeyError `ColumnInfo`")
426427
self._metadata = tuple(column_info)
428+
if self._result_set_type_hints:
429+
self._column_type_hints = tuple(
430+
self._result_set_type_hints.get(m.get("Name", "").lower()) for m in self._metadata
431+
)
427432

428433
def _process_update_count(self, response: dict[str, Any]) -> None:
429434
update_count = response.get("UpdateCount")
@@ -449,21 +454,23 @@ def _get_rows(
449454
converter: Converter | None = None,
450455
) -> list[tuple[Any | None, ...] | dict[Any, Any | None]]:
451456
conv = converter or self._converter
452-
hints = self._result_set_type_hints
457+
col_hints = self._column_type_hints
458+
if col_hints:
459+
return [
460+
tuple(
461+
conv.convert(meta.get("Type"), row.get("VarCharValue"), type_hint=hint)
462+
if hint
463+
else conv.convert(meta.get("Type"), row.get("VarCharValue"))
464+
for meta, row, hint in zip(
465+
metadata, rows[i].get("Data", []), col_hints, strict=False
466+
)
467+
)
468+
for i in range(offset, len(rows))
469+
]
453470
return [
454471
tuple(
455-
[
456-
conv.convert(
457-
meta.get("Type"),
458-
row.get("VarCharValue"),
459-
**(
460-
{"type_hint": hints[meta.get("Name", "").lower()]}
461-
if hints and meta.get("Name", "").lower() in hints
462-
else {}
463-
),
464-
)
465-
for meta, row in zip(metadata, rows[i].get("Data", []), strict=False)
466-
]
472+
conv.convert(meta.get("Type"), row.get("VarCharValue"))
473+
for meta, row in zip(metadata, rows[i].get("Data", []), strict=False)
467474
)
468475
for i in range(offset, len(rows))
469476
]
@@ -645,24 +652,29 @@ def _get_rows(
645652
converter: Converter | None = None,
646653
) -> list[tuple[Any | None, ...] | dict[Any, Any | None]]:
647654
conv = converter or self._converter
648-
hints = self._result_set_type_hints
649-
return [
650-
self.dict_type(
651-
[
655+
col_hints = self._column_type_hints
656+
if col_hints:
657+
return [
658+
self.dict_type(
652659
(
653660
meta.get("Name"),
654-
conv.convert(
655-
meta.get("Type"),
656-
row.get("VarCharValue"),
657-
**(
658-
{"type_hint": hints[meta.get("Name", "").lower()]}
659-
if hints and meta.get("Name", "").lower() in hints
660-
else {}
661-
),
662-
),
661+
conv.convert(meta.get("Type"), row.get("VarCharValue"), type_hint=hint)
662+
if hint
663+
else conv.convert(meta.get("Type"), row.get("VarCharValue")),
663664
)
664-
for meta, row in zip(metadata, rows[i].get("Data", []), strict=False)
665-
]
665+
for meta, row, hint in zip(
666+
metadata, rows[i].get("Data", []), col_hints, strict=False
667+
)
668+
)
669+
for i in range(offset, len(rows))
670+
]
671+
return [
672+
self.dict_type(
673+
(
674+
meta.get("Name"),
675+
conv.convert(meta.get("Type"), row.get("VarCharValue")),
676+
)
677+
for meta, row in zip(metadata, rows[i].get("Data", []), strict=False)
666678
)
667679
for i in range(offset, len(rows))
668680
]

pyathena/s3fs/result_set.py

Lines changed: 23 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -151,8 +151,7 @@ def _fetch(self) -> None:
151151

152152
description = self.description if self.description else []
153153
column_types = [d[1] for d in description]
154-
column_names = [d[0] for d in description]
155-
hints = self._result_set_type_hints
154+
col_hints = self._column_type_hints
156155

157156
rows_fetched = 0
158157
while rows_fetched < self._arraysize:
@@ -165,35 +164,33 @@ def _fetch(self) -> None:
165164
# AthenaCSVReader returns None for NULL values directly,
166165
# DefaultCSVReader returns empty string which needs conversion
167166
if self._csv_reader_class is DefaultCSVReader:
168-
converted_row = tuple(
169-
self._converter.convert(
170-
col_type,
171-
value if value != "" else None,
172-
**(
173-
{"type_hint": hints[col_name.lower()]}
174-
if hints and col_name.lower() in hints
175-
else {}
176-
),
167+
if col_hints:
168+
converted_row = tuple(
169+
self._converter.convert(
170+
col_type, value if value != "" else None, type_hint=hint
171+
)
172+
if hint
173+
else self._converter.convert(col_type, value if value != "" else None)
174+
for col_type, value, hint in zip(column_types, row, col_hints, strict=False)
177175
)
178-
for col_type, col_name, value in zip(
179-
column_types, column_names, row, strict=False
176+
else:
177+
converted_row = tuple(
178+
self._converter.convert(col_type, value if value != "" else None)
179+
for col_type, value in zip(column_types, row, strict=False)
180180
)
181-
)
182181
else:
183-
converted_row = tuple(
184-
self._converter.convert(
185-
col_type,
186-
value,
187-
**(
188-
{"type_hint": hints[col_name.lower()]}
189-
if hints and col_name.lower() in hints
190-
else {}
191-
),
182+
if col_hints:
183+
converted_row = tuple(
184+
self._converter.convert(col_type, value, type_hint=hint)
185+
if hint
186+
else self._converter.convert(col_type, value)
187+
for col_type, value, hint in zip(column_types, row, col_hints, strict=False)
192188
)
193-
for col_type, col_name, value in zip(
194-
column_types, column_names, row, strict=False
189+
else:
190+
converted_row = tuple(
191+
self._converter.convert(col_type, value)
192+
for col_type, value in zip(column_types, row, strict=False)
195193
)
196-
)
197194
self._rows.append(converted_row)
198195
rows_fetched += 1
199196

0 commit comments

Comments
 (0)