This repository was archived by the owner on Mar 13, 2026. It is now read-only.

Commit dc62471

chore: fix biglake implementation

1 parent 1164fcc

4 files changed

Lines changed: 164 additions & 59 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ docs.metadata
 # Virtual environment
 env/
 venv/
+.venv/
 
 # Test logs
 coverage.xml

pandas_gbq/core/biglake.py

Lines changed: 55 additions & 39 deletions
@@ -2,60 +2,76 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
+"""
+Utilities for working with BigLake tables.
+"""
+
+# TODO(tswast): Synchronize with bigframes/session/iceberg.py, which uses
+# pyiceberg and the BigLake APIs, rather than relying on dry run.
+
 from __future__ import annotations
 
 import dataclasses
+from typing import Sequence
 
 import google.auth.transport.requests
+import google.cloud.bigquery
 import google.oauth2.credentials
 
-_ICEBERG_REST_CATALOG_URI = "https://biglake.googleapis.com/iceberg/v1/restcatalog"
-_TABLE_METADATA_PATH = (
-    "/v1/projects/{project}/catalogs/{catalog}/namespaces/{namespace}/tables/{table}"
-)
+import pandas_gbq.core.resource_references
+
+
+_DRY_RUN_TEMPLATE = """
+SELECT *
+FROM `{project}.{catalog}.{namespace}.{table}`
+"""
+
 
+_COUNT_TEMPLATE = """
+SELECT COUNT(*) as total_rows
+FROM `{project}.{catalog}.{namespace}.{table}`
+"""
 
 @dataclasses.dataclass(frozen=True)
-class BigLakeTableId:
-    project: str
-    catalog: str
-    namespace: str
-    table: str
+class BigLakeTableMetadata:
+    schema: Sequence[google.cloud.bigquery.SchemaField]
+    num_rows: int
 
 
 def get_table_metadata(
     *,
-    table_id: str,
-    credentials: google.oauth2.credentials.Credentials,
-    billing_project_id: str,
-):
+    reference: pandas_gbq.core.resource_references.BigLakeTableId,
+    bqclient: google.cloud.bigquery.Client,
+) -> BigLakeTableMetadata:
     """
-    Docstring for get_table_metadata
+    Get the schema for a BigLake table.
 
-    https://iceberg.apache.org/spec/#metrics;
-
-    curl -X GET -H "Authorization: Bearer \"$(gcloud auth application-default print-access-token)\"" \
-    -H "Content-Type: application/json; charset=utf-8" \
-    -H 'x-goog-user-project: swast-scratch' \
-    -H 'X-Iceberg-Access-Delegation: vended-credentials' \
+    Currently, this does some BigQuery queries. In the future, we'll want to get
+    other metadata like the number of rows and storage bytes so that we can do a
+    more accurate estimate of how many rows to sample.
     """
-    # https://iceberg.apache.org/spec/#metrics
-    # total-files-size
-    project, catalog, namespace, table = table_id.split(".")
-    session = google.auth.transport.requests.AuthorizedSession(credentials=credentials)
-    path = _TABLE_METADATA_PATH.format(
-        project=project,
-        catalog=catalog,
-        namespace=namespace,
-        table=table,
+    dry_run_config = google.cloud.bigquery.QueryJobConfig(dry_run=True)
+    query = _DRY_RUN_TEMPLATE.format(
+        project=reference.project,
+        catalog=reference.catalog,
+        namespace=".".join(reference.namespace),
+        table=reference.table,
     )
-    return session.get(
-        f"{_ICEBERG_REST_CATALOG_URI}.{path}",
-        headers={
-            "x-goog-user-project": billing_project_id,
-            "Content-Type": "application/json; charset=utf-8",
-            # TODO(tswast): parameter for this option (or get from catalog metadata?)
-            # /iceberg/{$api_version}/restcatalog/extensions/{name=projects/*/catalogs/*}
-            "X-Iceberg-Access-Delegation": "vended-credentials",
-        },
-    ).json()
+    job = bqclient.query(query, job_config=dry_run_config)
+    job.result()
+    schema = job.schema
+
+    count_rows = list(bqclient.query_and_wait(_COUNT_TEMPLATE.format(
+        project=reference.project,
+        catalog=reference.catalog,
+        namespace=".".join(reference.namespace),
+        table=reference.table,
+    )))
+    assert len(count_rows) == 1, "got unexpected query response when determining number of rows"
+    total_rows = count_rows[0].total_rows
+
+    return BigLakeTableMetadata(
+        schema=schema if schema is not None else [],
+        num_rows=total_rows,
+    )
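Note: as a rough usage sketch of the new API (the project, catalog, namespace, and table names below are placeholders, not from this commit), get_table_metadata takes a parsed BigLakeTableId plus an existing BigQuery client and returns the dry-run schema and the COUNT(*) row count:

    import google.cloud.bigquery

    import pandas_gbq.core.biglake
    import pandas_gbq.core.resource_references

    # Hypothetical four-part BigLake table ID: project.catalog.namespace.table.
    reference = pandas_gbq.core.resource_references.parse_table_id(
        "my-project.my_catalog.my_namespace.my_table"
    )
    bqclient = google.cloud.bigquery.Client(project="my-project")

    # Dry run supplies the schema; the COUNT(*) query supplies the row count.
    metadata = pandas_gbq.core.biglake.get_table_metadata(
        reference=reference, bqclient=bqclient
    )
    print(len(metadata.schema), metadata.num_rows)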
pandas_gbq/core/resource_references.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import dataclasses
+import re
+
+
+_TABLE_REFEREENCE_PATTERN = re.compile(
+    # In the past, organizations could prefix their project IDs with a domain
+    # name. Such projects still exist, especially at Google.
+    r"^(?P<legacy_project_domain>[^:]+:)?"
+    r"(?P<project>[^.]+)\."
+    # Dataset for native BigQuery tables, catalog + namespace(s) for BigLake.
+    r"(?P<inner_parts>([^.\s]+\.?)+)\."
+    # Table names can't contain ".", as that's used as the separator.
+    r"(?P<table>[^.]+)$"
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class BigLakeTableId:
+    project: str
+    catalog: str
+    namespace: tuple[str, ...]
+    table: str
+
+
+@dataclasses.dataclass(frozen=True)
+class BigQueryTableId:
+    project_id: str
+    dataset_id: str
+    table_id: str
+
+
+def parse_table_id(table_id: str) -> BigLakeTableId | BigQueryTableId:
+    """Turn a string into a BigLakeTableId or BigQueryTableId.
+
+    Raises:
+        ValueError: If the table ID is invalid.
+    """
+    regex_match = _TABLE_REFEREENCE_PATTERN.match(table_id)
+    if not regex_match:
+        raise ValueError(f"Invalid table ID: {table_id}")
+
+    inner_parts = regex_match.group("inner_parts").split(".")
+    if len(inner_parts) == 1:
+        return BigQueryTableId(
+            project_id=regex_match.group("project"),
+            dataset_id=inner_parts[0],
+            table_id=regex_match.group("table"),
+        )
+
+    return BigLakeTableId(
+        project=regex_match.group("project"),
+        catalog=inner_parts[0],
+        namespace=tuple(inner_parts[1:]),
+        table=regex_match.group("table"),
+    )
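Note: a short sketch of how the new parser is expected to behave (the table IDs below are made up). A three-part ID yields a BigQueryTableId, while four or more parts yield a BigLakeTableId with the extra parts collected into the namespace tuple:

    import pandas_gbq.core.resource_references as refs

    # Three parts: project.dataset.table -> native BigQuery table.
    bq = refs.parse_table_id("my-project.my_dataset.my_table")
    assert isinstance(bq, refs.BigQueryTableId)

    # Four (or more) parts: project.catalog.namespace(...).table -> BigLake.
    bl = refs.parse_table_id("my-project.my_catalog.my_namespace.my_table")
    assert isinstance(bl, refs.BigLakeTableId)
    assert bl.namespace == ("my_namespace",)

    # Anything that doesn't match the pattern raises ValueError.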

pandas_gbq/core/sample.py

Lines changed: 49 additions & 20 deletions
@@ -16,6 +16,7 @@
 import pandas_gbq.core.read
 import pandas_gbq.core.biglake
 import pandas_gbq.gbq_connector
+import pandas_gbq.core.resource_references
 
 # Only import at module-level at type checking time to avoid circular
 # dependencies in the pandas package, which has an optional dependency on
@@ -53,7 +54,6 @@
 # TODO(tswast): Choose an estimate based on actual BigQuery stats.
 _ARRAY_LENGTH_ESTIMATE = 5
 _UNKNOWN_TYPE_SIZE_ESTIMATE = 4
-_MAX_ROW_BYTES = 100 * pandas_gbq.constants.BYTES_IN_MIB
 _MAX_AUTO_TARGET_BYTES = 1 * pandas_gbq.constants.BYTES_IN_GIB
 
 
@@ -62,15 +62,15 @@ def _calculate_target_bytes(target_mb: Optional[int]) -> int:
         return target_mb * pandas_gbq.constants.BYTES_IN_MIB
 
     mem = psutil.virtual_memory()
-    return min(_MAX_AUTO_TARGET_BYTES, max(_MAX_ROW_BYTES, mem.available // 4))
+    return min(_MAX_AUTO_TARGET_BYTES, mem.available // 4)
 
 
 def _estimate_limit(
     *,
-    target_bytes: int,
-    table_bytes: Optional[int],
-    table_rows: Optional[int],
     fields: Sequence[google.cloud.bigquery.SchemaField],
+    target_bytes: int,
+    table_bytes: Optional[int] = None,
+    table_rows: Optional[int] = None,
 ) -> int:
     if table_bytes and table_rows:
         proportion = target_bytes / table_bytes
@@ -119,8 +119,8 @@ def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) ->
     Returns:
         An integer representing the estimated total row size in logical bytes.
     """
-    total_size = min(
-        _MAX_ROW_BYTES,
+    total_size = max(
+        1,
         sum(_estimate_field_bytes(field) for field in fields),
     )
     return total_size
@@ -165,10 +165,11 @@ def _sample_with_tablesample(
     progress_bar_type: Optional[str] = None,
     use_bqstorage_api: bool = True,
 ) -> Optional[pandas.DataFrame]:
+    sample_percent = min(100, max(1, int(proportion * 100)))
     query = f"""
     SELECT *
-    FROM `{table_id}`
-    TABLESAMPLE SYSTEM ({float(proportion) * 100.0} PERCENT)
+    FROM `{table_id}` t
+    TABLESAMPLE SYSTEM ({sample_percent} PERCENT)
     ORDER BY RAND() DESC
     LIMIT {int(target_row_count)};
     """
@@ -206,25 +207,55 @@ def _sample_with_limit(
 
 def _sample_biglake_table(
     *,
-    table_id: str,
-    credentials: google.oauth2.credentials.Credentials,
+    reference: pandas_gbq.core.resource_references.BigLakeTableId,
     bqclient: google.cloud.bigquery.Client,
     target_bytes: int,
     progress_bar_type: str | None,
     use_bqstorage_api: bool,
 ) -> Optional[pandas.DataFrame]:
-    pass
+    metadata = pandas_gbq.core.biglake.get_table_metadata(
+        reference=reference,
+        bqclient=bqclient,
+    )
+    total_rows = metadata.num_rows
+
+    # Avoid divide by 0 when calculating proportions.
+    if total_rows == 0:
+        total_rows = 1
+
+    target_row_count = _estimate_limit(
+        target_bytes=target_bytes,
+        fields=metadata.schema,
+        table_rows=total_rows,
+    )
+    proportion = max(0.01, target_row_count / total_rows)
+
+    # BigLake tables should always support table sample, since they are backed
+    # by parquet files.
+    return _sample_with_tablesample(
+        f"{reference.project}.{reference.catalog}.{'.'.join(reference.namespace)}.{reference.table}",
+        bqclient=bqclient,
+        proportion=proportion,
+        target_row_count=target_row_count,
+        progress_bar_type=progress_bar_type,
+        use_bqstorage_api=use_bqstorage_api,
+    )
 
 
 def _sample_bq_table(
     *,
-    table_id: str,
+    reference: pandas_gbq.core.resource_references.BigQueryTableId,
     bqclient: google.cloud.bigquery.Client,
     target_bytes: int,
     progress_bar_type: str | None,
     use_bqstorage_api: bool,
 ) -> Optional[pandas.DataFrame]:
-    table = bqclient.get_table(table_id)
+    table = bqclient.get_table(google.cloud.bigquery.TableReference(
+        google.cloud.bigquery.DatasetReference(
+            reference.project_id, reference.dataset_id
+        ),
+        reference.table_id
+    ))
     num_rows = table.num_rows
     num_bytes = table.num_bytes
     table_type = table.table_type
@@ -342,24 +373,22 @@ def sample(
     connector = pandas_gbq.gbq_connector.GbqConnector(
         project_id=billing_project_id, credentials=credentials
     )
-    credentials = cast(google.oauth2.credentials.Credentials, connector.credentials)
    bqclient = connector.get_client()
 
     # BigLake tables can't be read directly by the BQ Storage Read API, so make
     # sure we run a query first.
-    parts = table_id.split(".")
-    if len(parts) == 4:
+    reference = pandas_gbq.core.resource_references.parse_table_id(table_id)
+    if isinstance(reference, pandas_gbq.core.resource_references.BigLakeTableId):
         return _sample_biglake_table(
-            table_id=table_id,
-            credentials=credentials,
+            reference=reference,
             bqclient=bqclient,
             target_bytes=target_bytes,
             progress_bar_type=progress_bar_type,
             use_bqstorage_api=use_bqstorage_api,
         )
     else:
         return _sample_bq_table(
-            table_id=table_id,
+            reference=reference,
             bqclient=bqclient,
             target_bytes=target_bytes,
             progress_bar_type=progress_bar_type,
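Note: a rough worked example of the new sampling math (numbers are illustrative only, and the row-limit step is a paraphrase of _estimate_limit / _estimate_row_bytes, whose bodies are not fully shown in this diff). The BigLake path turns the target byte budget into a row count, floors the sampling proportion at 1%, and _sample_with_tablesample now clamps the TABLESAMPLE SYSTEM percentage into the 1-100 range:

    # Illustrative numbers: a 10,000,000-row table, ~100 logical bytes per row,
    # and a 100 MiB target keeps roughly 1,048,576 rows.
    target_bytes = 100 * 1024 * 1024
    row_bytes = 100
    total_rows = 10_000_000

    target_row_count = target_bytes // row_bytes               # 1,048,576
    proportion = max(0.01, target_row_count / total_rows)      # ~0.105
    sample_percent = min(100, max(1, int(proportion * 100)))   # 10 PERCENT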
