1414
1515import pandas_gbq .constants
1616import pandas_gbq .core .read
17+ import pandas_gbq .core .biglake
1718import pandas_gbq .gbq_connector
1819
1920# Only import at module-level at type checking time to avoid circular
@@ -156,7 +157,7 @@ def _download_results_in_parallel(
156157
157158
158159def _sample_with_tablesample (
159- table : google . cloud . bigquery . Table ,
160+ table_id : str ,
160161 * ,
161162 bqclient : google .cloud .bigquery .Client ,
162163 proportion : float ,
@@ -166,7 +167,7 @@ def _sample_with_tablesample(
166167) -> Optional [pandas .DataFrame ]:
167168 query = f"""
168169 SELECT *
169- FROM `{ table . project } . { table . dataset_id } . { table . table_id } `
170+ FROM `{ table_id } `
170171 TABLESAMPLE SYSTEM ({ float (proportion ) * 100.0 } PERCENT)
171172 ORDER BY RAND() DESC
172173 LIMIT { int (target_row_count )} ;
@@ -181,7 +182,7 @@ def _sample_with_tablesample(
181182
182183
183184def _sample_with_limit (
184- table : google . cloud . bigquery . Table ,
185+ table_id : str ,
185186 * ,
186187 bqclient : google .cloud .bigquery .Client ,
187188 target_row_count : int ,
@@ -190,7 +191,7 @@ def _sample_with_limit(
190191) -> Optional [pandas .DataFrame ]:
191192 query = f"""
192193 SELECT *
193- FROM `{ table . project } . { table . dataset_id } . { table . table_id } `
194+ FROM `{ table_id } `
194195 ORDER BY RAND() DESC
195196 LIMIT { int (target_row_count )} ;
196197 """
@@ -203,6 +204,82 @@ def _sample_with_limit(
203204 )
204205
205206
def _sample_biglake_table(
    *,
    table_id: str,
    credentials: google.oauth2.credentials.Credentials,
    bqclient: google.cloud.bigquery.Client,
    target_bytes: int,
    progress_bar_type: str | None,
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Placeholder for the BigLake sampling path; not implemented yet.

    All arguments are keyword-only and are currently ignored.

    Args:
        table_id: Identifier of the table to sample.
        credentials: Credentials intended for the (future) implementation.
        bqclient: BigQuery client intended for the (future) implementation.
        target_bytes: Desired upper bound on the sample size, in bytes.
        progress_bar_type: Progress bar selection, or ``None``.
        use_bqstorage_api: Whether the BigQuery Storage API may be used.

    Returns:
        Always ``None`` until the sampling logic is written.
    """
    # TODO: implement sampling; for now this is an explicit no-op.
    return None
217+
218+
def _sample_bq_table(
    *,
    table_id: str,
    bqclient: google.cloud.bigquery.Client,
    target_bytes: int,
    progress_bar_type: str | None,
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Sample a BigQuery table, choosing a strategy from its metadata.

    The table metadata is fetched with ``bqclient.get_table`` and one of three
    paths is taken:

    1. If the table type is in ``_READ_API_ELIGIBLE_TYPES`` and its reported
       size is known and no larger than ``target_bytes``, every row is
       downloaded via ``bqclient.list_rows``.
    2. Otherwise, if the size is known and the table type is in
       ``_TABLESAMPLE_ELIGIBLE_TYPES``, a query sample is taken through
       ``_sample_with_tablesample``.
    3. Otherwise a random sample is taken through ``_sample_with_limit``.

    Args:
        table_id: Identifier passed to ``bqclient.get_table``.
        bqclient: Client used for the metadata lookup and the reads/queries.
        target_bytes: Desired upper bound on the sample size, in bytes.
        progress_bar_type: Forwarded unchanged to the download/query helpers.
        use_bqstorage_api: Forwarded unchanged to the download/query helpers.

    Returns:
        Whatever the selected helper returns (a DataFrame or ``None``).
    """
    table = bqclient.get_table(table_id)
    row_total = table.num_rows
    reported_bytes = table.num_bytes
    kind = table.table_type

    # Some tables such as views report 0 despite actually having rows, so a
    # zero size is treated as "unknown".
    byte_total = None if reported_bytes == 0 else reported_bytes

    # Table is small enough to download the whole thing.
    fits_in_budget = (
        kind in _READ_API_ELIGIBLE_TYPES
        and byte_total is not None
        and byte_total <= target_bytes
    )
    if fits_in_budget:
        return pandas_gbq.core.read.download_results(
            bqclient.list_rows(table),
            bqclient=bqclient,
            progress_bar_type=progress_bar_type,
            warn_on_large_results=False,
            max_results=None,
            user_dtypes=None,
            use_bqstorage_api=use_bqstorage_api,
        )

    row_limit = _estimate_limit(
        target_bytes=target_bytes,
        table_bytes=byte_total,
        table_rows=row_total,
        fields=table.schema,
    )

    # Both query-based strategies address the table by its fully qualified
    # name as reported by the API, not by the caller-supplied identifier.
    qualified_name = f"{table.project}.{table.dataset_id}.{table.table_id}"

    # Not eligible for TABLESAMPLE or reading directly, so take a random sample
    # with a full table scan.
    if byte_total is None or kind not in _TABLESAMPLE_ELIGIBLE_TYPES:
        return _sample_with_limit(
            qualified_name,
            bqclient=bqclient,
            target_row_count=row_limit,
            progress_bar_type=progress_bar_type,
            use_bqstorage_api=use_bqstorage_api,
        )

    # Table is eligible for TABLESAMPLE: request roughly the fraction of the
    # table that corresponds to the byte budget.
    return _sample_with_tablesample(
        qualified_name,
        bqclient=bqclient,
        proportion=target_bytes / byte_total,
        target_row_count=row_limit,
        progress_bar_type=progress_bar_type,
        use_bqstorage_api=use_bqstorage_api,
    )
281+
282+
206283def sample (
207284 table_id : str ,
208285 * ,
@@ -267,57 +344,24 @@ def sample(
267344 )
268345 credentials = cast (google .oauth2 .credentials .Credentials , connector .credentials )
269346 bqclient = connector .get_client ()
270- table = bqclient .get_table (table_id )
271- num_rows = table .num_rows
272- num_bytes = table .num_bytes
273- table_type = table .table_type
274347
275- # Some tables such as views report 0 despite actually having rows.
276- if num_bytes == 0 :
277- num_bytes = None
278-
279- # Table is small enough to download the whole thing.
280- if (
281- table_type in _READ_API_ELIGIBLE_TYPES
282- and num_bytes is not None
283- and num_bytes <= target_bytes
284- ):
285- rows_iter = bqclient .list_rows (table )
286- return pandas_gbq .core .read .download_results (
287- rows_iter ,
348+ # BigLake tables can't be read directly by the BQ Storage Read API, so make
349+ # sure we run a query first.
350+ parts = table_id .split ("." )
351+ if len (parts ) == 4 :
352+ return _sample_biglake_table (
353+ table_id = table_id ,
354+ credentials = credentials ,
288355 bqclient = bqclient ,
356+ target_bytes = target_bytes ,
289357 progress_bar_type = progress_bar_type ,
290- warn_on_large_results = False ,
291- max_results = None ,
292- user_dtypes = None ,
293358 use_bqstorage_api = use_bqstorage_api ,
294359 )
295-
296- target_row_count = _estimate_limit (
297- target_bytes = target_bytes ,
298- table_bytes = num_bytes ,
299- table_rows = num_rows ,
300- fields = table .schema ,
301- )
302-
303- # Table is eligible for TABLESAMPLE.
304- if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES :
305- proportion = target_bytes / num_bytes
306- return _sample_with_tablesample (
307- table ,
360+ else :
361+ return _sample_bq_table (
362+ table_id = table_id ,
308363 bqclient = bqclient ,
309- proportion = proportion ,
310- target_row_count = target_row_count ,
364+ target_bytes = target_bytes ,
311365 progress_bar_type = progress_bar_type ,
312366 use_bqstorage_api = use_bqstorage_api ,
313367 )
314-
315- # Not eligible for TABLESAMPLE or reading directly, so take a random sample
316- # with a full table scan.
317- return _sample_with_limit (
318- table ,
319- bqclient = bqclient ,
320- target_row_count = target_row_count ,
321- progress_bar_type = progress_bar_type ,
322- use_bqstorage_api = use_bqstorage_api ,
323- )
0 commit comments