|
| 1 | +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. |
| 2 | +# Use of this source code is governed by a BSD-style |
| 3 | +# license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +""" |
| 6 | +Utilities for working with BigLake tables. |
| 7 | +""" |
| 8 | + |
| 9 | +# TODO(tswast): Synchronize with bigframes/session/iceberg.py, which uses |
| 10 | +# pyiceberg and the BigLake APIs, rather than relying on dry run. |
| 11 | + |
| 12 | +from __future__ import annotations |
| 13 | + |
| 14 | +import dataclasses |
| 15 | +from typing import Sequence |
| 16 | + |
| 17 | +import google.cloud.bigquery |
| 18 | + |
| 19 | +import pandas_gbq.core.resource_references |
| 20 | + |
| 21 | + |
# Query issued as a BigQuery dry run (no rows are read) purely so the
# resulting job reports the table's schema. The four-part path is the
# BigLake addressing form: project.catalog.namespace.table.
_DRY_RUN_TEMPLATE = """
SELECT *
FROM `{project}.{catalog}.{namespace}.{table}`
"""


# Query executed for real to obtain the table's total row count.
_COUNT_TEMPLATE = """
SELECT COUNT(*) as total_rows
FROM `{project}.{catalog}.{namespace}.{table}`
"""
| 32 | + |
| 33 | + |
@dataclasses.dataclass(frozen=True)
class BigLakeTableMetadata:
    """Metadata describing a BigLake table, as gathered by
    :func:`get_table_metadata`."""

    # Column definitions, as reported by a BigQuery dry-run query over the
    # table. May be empty if the dry run reported no schema.
    schema: Sequence[google.cloud.bigquery.SchemaField]
    # Total number of rows, as reported by a COUNT(*) query.
    num_rows: int
| 38 | + |
| 39 | + |
def get_table_metadata(
    *,
    reference: pandas_gbq.core.resource_references.BigLakeTableId,
    bqclient: google.cloud.bigquery.Client,
) -> BigLakeTableMetadata:
    """
    Get the schema and row count for a BigLake table.

    Currently, this issues two BigQuery queries: a dry run to discover the
    schema and a COUNT(*) to get the number of rows. In the future, we'll
    want other metadata, like storage bytes, so that we can do a more
    accurate estimate of how many rows to sample.

    Args:
        reference:
            Fully-qualified identifier (project, catalog, namespace, table)
            of the BigLake table.
        bqclient:
            Client used to run the metadata queries.

    Returns:
        BigLakeTableMetadata with the table's schema (an empty sequence if
        the dry run reports none) and total row count.

    Raises:
        ValueError: If the row-count query does not return exactly one row.
    """
    # Both queries address the same table; build the format arguments once.
    table_parts = {
        "project": reference.project,
        "catalog": reference.catalog,
        # The namespace may have multiple levels; join them into a dotted path.
        "namespace": ".".join(reference.namespace),
        "table": reference.table,
    }

    # A dry run doesn't read any rows but still reports the result schema.
    dry_run_config = google.cloud.bigquery.QueryJobConfig(dry_run=True)
    job = bqclient.query(
        _DRY_RUN_TEMPLATE.format(**table_parts), job_config=dry_run_config
    )
    job.result()
    schema = job.schema

    count_rows = list(
        bqclient.query_and_wait(_COUNT_TEMPLATE.format(**table_parts))
    )
    # Raise explicitly rather than assert: asserts are stripped under
    # ``python -O``, and this is a real runtime invariant, not a debug check.
    if len(count_rows) != 1:
        raise ValueError(
            "got unexpected query response when determining number of rows"
        )
    total_rows = count_rows[0].total_rows

    return BigLakeTableMetadata(
        schema=schema if schema is not None else [],
        num_rows=total_rows,
    )
0 commit comments