1616import pandas_gbq .core .read
1717import pandas_gbq .core .biglake
1818import pandas_gbq .gbq_connector
19+ import pandas_gbq .core .resource_references
1920
2021# Only import at module-level at type checking time to avoid circular
2122# dependencies in the pandas package, which has an optional dependency on
# TODO(tswast): Choose an estimate based on actual BigQuery stats.
# Presumably the assumed average element count for ARRAY/REPEATED columns
# when estimating row width — TODO confirm against _estimate_field_bytes.
_ARRAY_LENGTH_ESTIMATE = 5
# Fallback per-value byte size, presumably for types without a known
# fixed width — TODO confirm against _estimate_field_bytes.
_UNKNOWN_TYPE_SIZE_ESTIMATE = 4
# Cap applied to a single estimated row's byte size (100 MiB).
# NOTE(review): this change's diff deletes this constant and drops its
# last uses in _calculate_target_bytes/_estimate_row_bytes.
_MAX_ROW_BYTES = 100 * pandas_gbq.constants.BYTES_IN_MIB
# Upper bound on the auto-computed download target size (1 GiB).
_MAX_AUTO_TARGET_BYTES = 1 * pandas_gbq.constants.BYTES_IN_GIB
5858
5959
@@ -62,15 +62,15 @@ def _calculate_target_bytes(target_mb: Optional[int]) -> int:
6262 return target_mb * pandas_gbq .constants .BYTES_IN_MIB
6363
6464 mem = psutil .virtual_memory ()
65- return min (_MAX_AUTO_TARGET_BYTES , max ( _MAX_ROW_BYTES , mem .available // 4 ) )
65+ return min (_MAX_AUTO_TARGET_BYTES , mem .available // 4 )
6666
6767
6868def _estimate_limit (
6969 * ,
70- target_bytes : int ,
71- table_bytes : Optional [int ],
72- table_rows : Optional [int ],
7370 fields : Sequence [google .cloud .bigquery .SchemaField ],
71+ target_bytes : int ,
72+ table_bytes : Optional [int ] = None ,
73+ table_rows : Optional [int ] = None ,
7474) -> int :
7575 if table_bytes and table_rows :
7676 proportion = target_bytes / table_bytes
@@ -119,8 +119,8 @@ def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) ->
119119 Returns:
120120 An integer representing the estimated total row size in logical bytes.
121121 """
122- total_size = min (
123- _MAX_ROW_BYTES ,
122+ total_size = max (
123+ 1 ,
124124 sum (_estimate_field_bytes (field ) for field in fields ),
125125 )
126126 return total_size
@@ -165,10 +165,11 @@ def _sample_with_tablesample(
165165 progress_bar_type : Optional [str ] = None ,
166166 use_bqstorage_api : bool = True ,
167167) -> Optional [pandas .DataFrame ]:
168+ sample_percent = min (100 , max (1 , int (proportion * 100 )))
168169 query = f"""
169170 SELECT *
170- FROM `{ table_id } `
171- TABLESAMPLE SYSTEM ({ float ( proportion ) * 100.0 } PERCENT)
171+ FROM `{ table_id } ` t
172+ TABLESAMPLE SYSTEM ({ sample_percent } PERCENT)
172173 ORDER BY RAND() DESC
173174 LIMIT { int (target_row_count )} ;
174175 """
@@ -206,25 +207,55 @@ def _sample_with_limit(
206207
def _sample_biglake_table(
    *,
    reference: pandas_gbq.core.resource_references.BigLakeTableId,
    bqclient: google.cloud.bigquery.Client,
    target_bytes: int,
    progress_bar_type: str | None,
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Sample rows from a BigLake table via a TABLESAMPLE query.

    BigLake tables can't be read directly by the BigQuery Storage Read
    API, so sampling is done through a query instead.

    Args:
        reference: Parsed BigLake table identifier (project, catalog,
            namespace path, and table name).
        bqclient: BigQuery client used for the metadata lookup and to run
            the sampling query.
        target_bytes: Approximate number of bytes of data to download.
        progress_bar_type: Progress bar option forwarded to the download,
            or None for no progress bar.
        use_bqstorage_api: Whether to download query results with the
            BigQuery Storage Read API.

    Returns:
        A DataFrame of sampled rows, or None if the query helper
        produces no result.
    """
    metadata = pandas_gbq.core.biglake.get_table_metadata(
        reference=reference,
        bqclient=bqclient,
    )

    # Avoid divide by 0 when calculating proportions. `or 1` also covers
    # a falsy/missing row count, which the original `== 0` check did not.
    total_rows = metadata.num_rows or 1

    target_row_count = _estimate_limit(
        fields=metadata.schema,
        target_bytes=target_bytes,
        table_rows=total_rows,
    )

    # Never sample less than 1% so TABLESAMPLE has something to scan; the
    # query helper clamps the percentage to [1, 100] regardless.
    proportion = max(0.01, target_row_count / total_rows)

    # Fully-qualified BigLake path: project.catalog.<namespace...>.table
    qualified_table = ".".join(
        [reference.project, reference.catalog, *reference.namespace, reference.table]
    )

    # BigLake tables should always support table sample, since they are
    # backed by parquet files.
    return _sample_with_tablesample(
        # NOTE(review): passed by keyword for consistency with every other
        # argument here; the original passed it positionally, which raises
        # TypeError if the helper's parameters are keyword-only (`*,`)
        # like its sibling samplers — confirm against its signature.
        table_id=qualified_table,
        bqclient=bqclient,
        proportion=proportion,
        target_row_count=target_row_count,
        progress_bar_type=progress_bar_type,
        use_bqstorage_api=use_bqstorage_api,
    )
217243
218244
219245def _sample_bq_table (
220246 * ,
221- table_id : str ,
247+ reference : pandas_gbq . core . resource_references . BigQueryTableId ,
222248 bqclient : google .cloud .bigquery .Client ,
223249 target_bytes : int ,
224250 progress_bar_type : str | None ,
225251 use_bqstorage_api : bool ,
226252) -> Optional [pandas .DataFrame ]:
227- table = bqclient .get_table (table_id )
253+ table = bqclient .get_table (google .cloud .bigquery .TableReference (
254+ google .cloud .bigquery .DatasetReference (
255+ reference .project_id , reference .dataset_id
256+ ),
257+ reference .table_id
258+ ))
228259 num_rows = table .num_rows
229260 num_bytes = table .num_bytes
230261 table_type = table .table_type
@@ -342,24 +373,22 @@ def sample(
342373 connector = pandas_gbq .gbq_connector .GbqConnector (
343374 project_id = billing_project_id , credentials = credentials
344375 )
345- credentials = cast (google .oauth2 .credentials .Credentials , connector .credentials )
346376 bqclient = connector .get_client ()
347377
348378 # BigLake tables can't be read directly by the BQ Storage Read API, so make
349379 # sure we run a query first.
350- parts = table_id . split ( "." )
351- if len ( parts ) == 4 :
380+ reference = pandas_gbq . core . resource_references . parse_table_id ( table_id )
381+ if isinstance ( reference , pandas_gbq . core . resource_references . BigLakeTableId ) :
352382 return _sample_biglake_table (
353- table_id = table_id ,
354- credentials = credentials ,
383+ reference = reference ,
355384 bqclient = bqclient ,
356385 target_bytes = target_bytes ,
357386 progress_bar_type = progress_bar_type ,
358387 use_bqstorage_api = use_bqstorage_api ,
359388 )
360389 else :
361390 return _sample_bq_table (
362- table_id = table_id ,
391+ reference = reference ,
363392 bqclient = bqclient ,
364393 target_bytes = target_bytes ,
365394 progress_bar_type = progress_bar_type ,
0 commit comments