Skip to content
This repository was archived by the owner on Mar 13, 2026. It is now read-only.

Commit 8784664

Browse files
committed
feat: support biglake tables in pandas_gbq.sample
1 parent d59db79 commit 8784664

5 files changed

Lines changed: 160 additions & 69 deletions

File tree

pandas_gbq/core/biglake.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
from __future__ import annotations
6+
7+
import dataclasses
8+
9+
import google.auth.transport.requests
10+
import google.oauth2.credentials
11+
12+
_ICEBERG_REST_CATALOG_URI = "https://biglake.googleapis.com/iceberg/v1/restcatalog"
13+
_TABLE_METADATA_PATH = (
14+
"/v1/projects/{project}/catalogs/{catalog}/namespaces/{namespace}/tables/{table}"
15+
)
16+
17+
18+
@dataclasses.dataclass(frozen=True)
class BigLakeTableId:
    """Identifier for a BigLake Iceberg table.

    Holds the components of a four-part dotted table ID of the form
    ``project.catalog.namespace.table`` (the same parts that
    ``get_table_metadata`` splits a ``table_id`` string into).
    """

    # GCP project that owns the BigLake catalog.
    project: str
    # BigLake catalog name.
    catalog: str
    # Iceberg namespace within the catalog.
    namespace: str
    # Table name within the namespace.
    table: str
24+
25+
26+
def get_table_metadata(
    *,
    table_id: str,
    credentials: google.oauth2.credentials.Credentials,
    billing_project_id: str,
):
    """Fetch Iceberg table metadata from the BigLake Iceberg REST catalog.

    Performs an authorized ``GET`` against the BigLake REST catalog's
    load-table endpoint and returns the parsed JSON body. The response
    includes Iceberg snapshot summary metrics such as ``total-files-size``;
    see https://iceberg.apache.org/spec/#metrics.

    Args:
        table_id:
            Fully-qualified BigLake table ID in the form
            ``project.catalog.namespace.table``.
        credentials:
            OAuth2 credentials used to authorize the request.
        billing_project_id:
            Project to bill for the API call, sent via the
            ``x-goog-user-project`` header.

    Returns:
        The decoded JSON response from the catalog.

    Raises:
        ValueError: If ``table_id`` does not have exactly four
            dot-separated parts.
    """
    parts = table_id.split(".")
    if len(parts) != 4:
        raise ValueError(
            "Expected a BigLake table_id of the form "
            f"'project.catalog.namespace.table', got {table_id!r}."
        )
    project, catalog, namespace, table = parts

    session = google.auth.transport.requests.AuthorizedSession(credentials=credentials)
    path = _TABLE_METADATA_PATH.format(
        project=project,
        catalog=catalog,
        namespace=namespace,
        table=table,
    )
    # BUG FIX: the path must be appended directly to the base URI. The
    # previous f-string joined them with a literal "." which produced an
    # invalid URL like ".../restcatalog./v1/projects/...". The path already
    # begins with "/", so simple concatenation yields the correct
    # ".../restcatalog/v1/projects/..." endpoint.
    return session.get(
        f"{_ICEBERG_REST_CATALOG_URI}{path}",
        headers={
            "x-goog-user-project": billing_project_id,
            "Content-Type": "application/json; charset=utf-8",
            # TODO(tswast): parameter for this option (or get from catalog metadata?)
            # /iceberg/{$api_version}/restcatalog/extensions/{name=projects/*/catalogs/*}
            "X-Iceberg-Access-Delegation": "vended-credentials",
        },
    ).json()

pandas_gbq/core/sample.py

Lines changed: 93 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import pandas_gbq.constants
1616
import pandas_gbq.core.read
17+
import pandas_gbq.core.biglake
1718
import pandas_gbq.gbq_connector
1819

1920
# Only import at module-level at type checking time to avoid circular
@@ -156,7 +157,7 @@ def _download_results_in_parallel(
156157

157158

158159
def _sample_with_tablesample(
159-
table: google.cloud.bigquery.Table,
160+
table_id: str,
160161
*,
161162
bqclient: google.cloud.bigquery.Client,
162163
proportion: float,
@@ -166,7 +167,7 @@ def _sample_with_tablesample(
166167
) -> Optional[pandas.DataFrame]:
167168
query = f"""
168169
SELECT *
169-
FROM `{table.project}.{table.dataset_id}.{table.table_id}`
170+
FROM `{table_id}`
170171
TABLESAMPLE SYSTEM ({float(proportion) * 100.0} PERCENT)
171172
ORDER BY RAND() DESC
172173
LIMIT {int(target_row_count)};
@@ -181,7 +182,7 @@ def _sample_with_tablesample(
181182

182183

183184
def _sample_with_limit(
184-
table: google.cloud.bigquery.Table,
185+
table_id: str,
185186
*,
186187
bqclient: google.cloud.bigquery.Client,
187188
target_row_count: int,
@@ -190,7 +191,7 @@ def _sample_with_limit(
190191
) -> Optional[pandas.DataFrame]:
191192
query = f"""
192193
SELECT *
193-
FROM `{table.project}.{table.dataset_id}.{table.table_id}`
194+
FROM `{table_id}`
194195
ORDER BY RAND() DESC
195196
LIMIT {int(target_row_count)};
196197
"""
@@ -203,6 +204,82 @@ def _sample_with_limit(
203204
)
204205

205206

207+
def _sample_biglake_table(
    *,
    table_id: str,
    credentials: google.oauth2.credentials.Credentials,
    bqclient: google.cloud.bigquery.Client,
    target_bytes: int,
    progress_bar_type: str | None,
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Sample a BigLake (Iceberg) table identified by a four-part table ID.

    NOTE(review): unimplemented stub — the body is ``pass``, so every call
    currently returns ``None``. Callers of ``sample`` with a four-part
    ``project.catalog.namespace.table`` ID silently get ``None`` until this
    is implemented (presumably using
    ``pandas_gbq.core.biglake.get_table_metadata`` to size the sample —
    TODO confirm intended design).
    """
    pass
217+
218+
219+
def _sample_bq_table(
    *,
    table_id: str,
    bqclient: google.cloud.bigquery.Client,
    target_bytes: int,
    progress_bar_type: str | None,
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Download roughly ``target_bytes`` worth of rows from a BigQuery table.

    Strategy, in order of preference:

    1. If the table's type allows direct reads and its reported size fits in
       ``target_bytes``, download the whole table via ``list_rows``.
    2. Otherwise, if the table type is eligible, use ``TABLESAMPLE`` to scan
       only a proportional fraction of the table.
    3. Otherwise fall back to ``ORDER BY RAND()`` with a ``LIMIT``, which
       requires a full table scan.

    Returns the sampled rows as a DataFrame (or whatever the downstream
    download helpers return — ``None`` is possible per the annotation).
    """
    table = bqclient.get_table(table_id)
    num_rows = table.num_rows
    num_bytes = table.num_bytes
    table_type = table.table_type

    # Some tables such as views report 0 despite actually having rows.
    # Normalize to None so size-based branches below treat it as "unknown".
    if num_bytes == 0:
        num_bytes = None

    # Table is small enough to download the whole thing.
    if (
        table_type in _READ_API_ELIGIBLE_TYPES
        and num_bytes is not None
        and num_bytes <= target_bytes
    ):
        rows_iter = bqclient.list_rows(table)
        return pandas_gbq.core.read.download_results(
            rows_iter,
            bqclient=bqclient,
            progress_bar_type=progress_bar_type,
            warn_on_large_results=False,
            max_results=None,
            user_dtypes=None,
            use_bqstorage_api=use_bqstorage_api,
        )

    # Estimate how many rows fit within target_bytes given the table's
    # reported size/row count and schema (num_bytes may be None here).
    target_row_count = _estimate_limit(
        target_bytes=target_bytes,
        table_bytes=num_bytes,
        table_rows=num_rows,
        fields=table.schema,
    )

    # Table is eligible for TABLESAMPLE.
    if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES:
        proportion = target_bytes / num_bytes
        return _sample_with_tablesample(
            f"{table.project}.{table.dataset_id}.{table.table_id}",
            bqclient=bqclient,
            proportion=proportion,
            target_row_count=target_row_count,
            progress_bar_type=progress_bar_type,
            use_bqstorage_api=use_bqstorage_api,
        )

    # Not eligible for TABLESAMPLE or reading directly, so take a random sample
    # with a full table scan.
    return _sample_with_limit(
        f"{table.project}.{table.dataset_id}.{table.table_id}",
        bqclient=bqclient,
        target_row_count=target_row_count,
        progress_bar_type=progress_bar_type,
        use_bqstorage_api=use_bqstorage_api,
    )
281+
282+
206283
def sample(
207284
table_id: str,
208285
*,
@@ -267,57 +344,24 @@ def sample(
267344
)
268345
credentials = cast(google.oauth2.credentials.Credentials, connector.credentials)
269346
bqclient = connector.get_client()
270-
table = bqclient.get_table(table_id)
271-
num_rows = table.num_rows
272-
num_bytes = table.num_bytes
273-
table_type = table.table_type
274347

275-
# Some tables such as views report 0 despite actually having rows.
276-
if num_bytes == 0:
277-
num_bytes = None
278-
279-
# Table is small enough to download the whole thing.
280-
if (
281-
table_type in _READ_API_ELIGIBLE_TYPES
282-
and num_bytes is not None
283-
and num_bytes <= target_bytes
284-
):
285-
rows_iter = bqclient.list_rows(table)
286-
return pandas_gbq.core.read.download_results(
287-
rows_iter,
348+
# BigLake tables can't be read directly by the BQ Storage Read API, so make
349+
# sure we run a query first.
350+
parts = table_id.split(".")
351+
if len(parts) == 4:
352+
return _sample_biglake_table(
353+
table_id=table_id,
354+
credentials=credentials,
288355
bqclient=bqclient,
356+
target_bytes=target_bytes,
289357
progress_bar_type=progress_bar_type,
290-
warn_on_large_results=False,
291-
max_results=None,
292-
user_dtypes=None,
293358
use_bqstorage_api=use_bqstorage_api,
294359
)
295-
296-
target_row_count = _estimate_limit(
297-
target_bytes=target_bytes,
298-
table_bytes=num_bytes,
299-
table_rows=num_rows,
300-
fields=table.schema,
301-
)
302-
303-
# Table is eligible for TABLESAMPLE.
304-
if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES:
305-
proportion = target_bytes / num_bytes
306-
return _sample_with_tablesample(
307-
table,
360+
else:
361+
return _sample_bq_table(
362+
table_id=table_id,
308363
bqclient=bqclient,
309-
proportion=proportion,
310-
target_row_count=target_row_count,
364+
target_bytes=target_bytes,
311365
progress_bar_type=progress_bar_type,
312366
use_bqstorage_api=use_bqstorage_api,
313367
)
314-
315-
# Not eligible for TABLESAMPLE or reading directly, so take a random sample
316-
# with a full table scan.
317-
return _sample_with_limit(
318-
table,
319-
bqclient=bqclient,
320-
target_row_count=target_row_count,
321-
progress_bar_type=progress_bar_type,
322-
use_bqstorage_api=use_bqstorage_api,
323-
)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
# allow pandas-gbq to detect invalid package versions at runtime.
4141
"google-cloud-bigquery >=3.20.0,<4.0.0",
4242
"packaging >=22.0.0",
43+
"requests >= 2.20.0, < 3.0.0",
4344
]
4445
extras = {
4546
"bqstorage": [

testing/constraints-3.9.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ google-auth==2.14.1
1616
google-auth-oauthlib==0.7.0
1717
google-cloud-bigquery==3.20.0
1818
packaging==22.0.0
19+
requests==2.20.0
1920
# Extras
2021
google-cloud-bigquery-storage==2.16.2
2122
tqdm==4.23.0

tests/unit/test_core_sample.py

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -207,16 +207,11 @@ def test_estimate_limit(target_bytes, table_bytes, table_rows, fields, expected_
207207

208208
@mock.patch("pandas_gbq.core.read.download_results")
209209
def test_sample_with_tablesample(mock_download_results, mock_bigquery_client):
210-
mock_table = mock.Mock(spec=google.cloud.bigquery.Table)
211-
mock_table.project = "test-project"
212-
mock_table.dataset_id = "test_dataset"
213-
mock_table.table_id = "test_table"
214-
215210
proportion = 0.1
216211
target_row_count = 100
217212

218213
pandas_gbq.core.sample._sample_with_tablesample(
219-
mock_table,
214+
"test-project.test_dataset.test_table",
220215
bqclient=mock_bigquery_client,
221216
proportion=proportion,
222217
target_row_count=target_row_count,
@@ -226,25 +221,17 @@ def test_sample_with_tablesample(mock_download_results, mock_bigquery_client):
226221
query = mock_bigquery_client.query_and_wait.call_args[0][0]
227222
assert "TABLESAMPLE SYSTEM (10.0 PERCENT)" in query
228223
assert "LIMIT 100" in query
229-
assert (
230-
f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`"
231-
in query
232-
)
224+
assert "FROM `test-project.test_dataset.test_table`" in query
233225

234226
mock_download_results.assert_called_once()
235227

236228

237229
@mock.patch("pandas_gbq.core.read.download_results")
238230
def test_sample_with_limit(mock_download_results, mock_bigquery_client):
239-
mock_table = mock.Mock(spec=google.cloud.bigquery.Table)
240-
mock_table.project = "test-project"
241-
mock_table.dataset_id = "test_dataset"
242-
mock_table.table_id = "test_table"
243-
244231
target_row_count = 200
245232

246233
pandas_gbq.core.sample._sample_with_limit(
247-
mock_table,
234+
"test-project.test_dataset.test_table",
248235
bqclient=mock_bigquery_client,
249236
target_row_count=target_row_count,
250237
)
@@ -253,10 +240,7 @@ def test_sample_with_limit(mock_download_results, mock_bigquery_client):
253240
query = mock_bigquery_client.query_and_wait.call_args[0][0]
254241
assert "TABLESAMPLE" not in query
255242
assert "LIMIT 200" in query
256-
assert (
257-
f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`"
258-
in query
259-
)
243+
assert "FROM `test-project.test_dataset.test_table`" in query
260244

261245
mock_download_results.assert_called_once()
262246

0 commit comments

Comments
 (0)