66from unittest import mock
77
88import google .cloud .bigquery
9+ import google .cloud .bigquery .table
910import pytest
1011
1112import pandas_gbq .constants
7172 ), # 0
7273 google .cloud .bigquery .SchemaField ("simple_int" , "INT64" ), # 8
7374 ],
74- 8 , # 0 + 8
75+ 9 , # 1 + 8
7576 id = "empty-struct" ,
7677 ),
77- pytest .param (
78- [
79- google .cloud .bigquery .SchemaField ("bytes" , "BYTES" ),
80- ]
81- * 9_999 ,
82- pandas_gbq .core .sample ._MAX_ROW_BYTES ,
83- id = "many-bytes" ,
84- ),
8578 # Case 8: Complex Mix (Combining multiple cases)
8679 pytest .param (
8780 [
@@ -127,21 +120,21 @@ def test_calculate_target_bytes_with_available_memory(mock_virtual_memory):
127120 available_memory = 2 * pandas_gbq .constants .BYTES_IN_GIB # 2 GB
128121 mock_virtual_memory .return_value = mock .Mock (available = available_memory )
129122
130- # Expected bytes is available memory / 4, as it falls between _MAX_ROW_BYTES and _MAX_AUTO_TARGET_BYTES
123+ # Expected bytes is available memory / 4.
131124 expected_bytes = available_memory // 4
132125 actual_bytes = pandas_gbq .core .sample ._calculate_target_bytes (None )
133126 assert actual_bytes == expected_bytes
134127
135128
@mock.patch("psutil.virtual_memory")
def test_calculate_target_bytes_low_memory(mock_virtual_memory):
    """With 100 bytes reported as available, the target is 100 // 4 == 25 bytes."""
    # psutil.virtual_memory() is replaced by a stub that only exposes the
    # 'available' attribute, set to a deliberately tiny amount of memory.
    low_memory_stats = mock.Mock(available=100)  # 100 bytes
    mock_virtual_memory.return_value = low_memory_stats

    # No explicit target is requested (None), so the result is expected to be
    # a quarter of the available memory.
    target_bytes = pandas_gbq.core.sample._calculate_target_bytes(None)

    assert target_bytes == 25
147140
@@ -206,7 +199,45 @@ def test_estimate_limit(target_bytes, table_bytes, table_rows, fields, expected_
206199
207200
@mock.patch("pandas_gbq.core.read.download_results")
def test_download_results_in_parallel_with_table(
    mock_download_results, mock_bigquery_client
):
    """A RowIterator with a table and schema triggers one ``list_rows`` call.

    Verifies that ``_download_results_in_parallel`` calls
    ``bqclient.list_rows`` with the iterator's ``_table`` and ``_schema`` and
    that ``download_results`` is invoked exactly once.
    """
    # Stand-in RowIterator whose private table/schema attributes are populated.
    table_backed_rows = mock.Mock(spec=google.cloud.bigquery.table.RowIterator)
    table_backed_rows._schema = "schema"
    table_backed_rows._table = "table"

    pandas_gbq.core.sample._download_results_in_parallel(
        table_backed_rows, bqclient=mock_bigquery_client
    )

    list_rows = mock_bigquery_client.list_rows
    list_rows.assert_called_once_with("table", selected_fields="schema")
    mock_download_results.assert_called_once()
215+
216+
@mock.patch("pandas_gbq.core.read.download_results")
def test_download_results_in_parallel_no_table(
    mock_download_results, mock_bigquery_client
):
    """A RowIterator without a table is handed to ``download_results`` as-is.

    Verifies that ``list_rows`` is never called and that ``download_results``
    receives the original iterator together with the expected keyword
    arguments.
    """
    # Stand-in RowIterator with neither a table nor a schema attached.
    tableless_rows = mock.Mock(spec=google.cloud.bigquery.table.RowIterator)
    tableless_rows._schema = None
    tableless_rows._table = None

    pandas_gbq.core.sample._download_results_in_parallel(
        tableless_rows, bqclient=mock_bigquery_client
    )

    mock_bigquery_client.list_rows.assert_not_called()

    expected_kwargs = {
        "bqclient": mock_bigquery_client,
        "progress_bar_type": None,
        "warn_on_large_results": False,
        "max_results": None,
        "user_dtypes": None,
        "use_bqstorage_api": True,
    }
    mock_download_results.assert_called_once_with(tableless_rows, **expected_kwargs)
237+
238+
239+ @mock .patch ("pandas_gbq.core.sample._download_results_in_parallel" )
240+ def test_sample_with_tablesample (mock_download_results_in_parallel , mock_bigquery_client ):
210241 proportion = 0.1
211242 target_row_count = 100
212243
@@ -219,15 +250,23 @@ def test_sample_with_tablesample(mock_download_results, mock_bigquery_client):
219250
220251 mock_bigquery_client .query_and_wait .assert_called_once ()
221252 query = mock_bigquery_client .query_and_wait .call_args [0 ][0 ]
222- assert "TABLESAMPLE SYSTEM (10.0 PERCENT)" in query
253+ assert "TABLESAMPLE SYSTEM (10 PERCENT)" in query
223254 assert "LIMIT 100" in query
224255 assert "FROM `test-project.test_dataset.test_table`" in query
225256
226- mock_download_results .assert_called_once ()
257+ # The mock for query_and_wait returns a mock RowIterator, which is then
258+ # passed to _download_results_in_parallel.
259+ mock_results = mock_bigquery_client .query_and_wait .return_value
260+ mock_download_results_in_parallel .assert_called_with (
261+ mock_results ,
262+ bqclient = mock_bigquery_client ,
263+ progress_bar_type = None ,
264+ use_bqstorage_api = True ,
265+ )
227266
228267
229- @mock .patch ("pandas_gbq.core.read.download_results " )
230- def test_sample_with_limit (mock_download_results , mock_bigquery_client ):
268+ @mock .patch ("pandas_gbq.core.sample._download_results_in_parallel " )
269+ def test_sample_with_limit (mock_download_results_in_parallel , mock_bigquery_client ):
231270 target_row_count = 200
232271
233272 pandas_gbq .core .sample ._sample_with_limit (
@@ -242,7 +281,15 @@ def test_sample_with_limit(mock_download_results, mock_bigquery_client):
242281 assert "LIMIT 200" in query
243282 assert "FROM `test-project.test_dataset.test_table`" in query
244283
245- mock_download_results .assert_called_once ()
284+ # The mock for query_and_wait returns a mock RowIterator, which is then
285+ # passed to _download_results_in_parallel.
286+ mock_results = mock_bigquery_client .query_and_wait .return_value
287+ mock_download_results_in_parallel .assert_called_with (
288+ mock_results ,
289+ bqclient = mock_bigquery_client ,
290+ progress_bar_type = None ,
291+ use_bqstorage_api = True ,
292+ )
246293
247294
248295@pytest .fixture
@@ -254,15 +301,70 @@ def mock_gbq_connector(mock_bigquery_client):
254301 yield mock_connector
255302
256303
@mock.patch("pandas_gbq.core.biglake.get_table_metadata")
@mock.patch("pandas_gbq.core.sample._sample_with_tablesample")
def test_sample_biglake_table(
    mock_sample_with_tablesample,
    mock_get_table_metadata,
    mock_gbq_connector,
    mock_bigquery_client,
):
    """Sampling a four-part table ID delegates to ``_sample_with_tablesample``.

    The table metadata lookup is stubbed to report 1000 rows with a single
    INT64 column, and the target size is pinned to 1000 bytes.
    """
    mock_get_table_metadata.return_value = mock.Mock(
        num_rows=1000,
        schema=[google.cloud.bigquery.SchemaField("col1", "INT64")],
    )

    pin_target_bytes = mock.patch(
        "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000
    )
    with pin_target_bytes:
        pandas_gbq.core.sample.sample("p.c.d.t", billing_project_id="p")

    mock_sample_with_tablesample.assert_called_once()

    # Expected arithmetic (presumably 8 bytes per INT64 row -- confirm against
    # the row-size estimator):
    #   1000 target bytes / 8 bytes per row = 125 target rows
    #   125 target rows / 1000 table rows   = 0.125 proportion
    _, call_kwargs = mock_sample_with_tablesample.call_args
    assert call_kwargs["proportion"] > 0.1
    assert call_kwargs["target_row_count"] == 125
332+
333+
@mock.patch("pandas_gbq.core.biglake.get_table_metadata")
@mock.patch("pandas_gbq.core.sample._sample_with_tablesample")
def test_sample_biglake_table_zero_rows(
    mock_sample_with_tablesample,
    mock_get_table_metadata,
    mock_gbq_connector,
    mock_bigquery_client,
):
    """Metadata reporting zero rows still yields a positive sampling proportion.

    The table metadata lookup is stubbed to report 0 rows with a single INT64
    column, and the target size is pinned to 1000 bytes.
    """
    mock_get_table_metadata.return_value = mock.Mock(
        num_rows=0,
        schema=[google.cloud.bigquery.SchemaField("col1", "INT64")],
    )

    pin_target_bytes = mock.patch(
        "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000
    )
    with pin_target_bytes:
        pandas_gbq.core.sample.sample("p.c.d.t", billing_project_id="p")

    mock_sample_with_tablesample.assert_called_once()

    # A zero row count must not cause a division by zero; the call is still
    # expected to receive a strictly positive proportion.
    _, call_kwargs = mock_sample_with_tablesample.call_args
    assert call_kwargs["proportion"] > 0.0
357+
358+
257359@mock .patch ("pandas_gbq.core.read.download_results" )
258360def test_sample_small_table_downloads_all (
259361 mock_download_results , mock_gbq_connector , mock_bigquery_client
260362):
261363 mock_table = mock .Mock (spec = google .cloud .bigquery .Table )
262- type(mock_table ).table_type = mock . PropertyMock ( return_value = "TABLE" )
263- type(mock_table ).num_bytes = mock . PropertyMock ( return_value = 1000 )
264- type(mock_table ).num_rows = mock . PropertyMock ( return_value = 10 )
265- type(mock_table ).schema = mock . PropertyMock ( return_value = [])
364+ type(mock_table ).table_type = "TABLE"
365+ type(mock_table ).num_bytes = 1000
366+ type(mock_table ).num_rows = 10
367+ type(mock_table ).schema = []
266368 mock_bigquery_client .get_table .return_value = mock_table
267369
268370 with mock .patch (
@@ -281,12 +383,13 @@ def test_sample_uses_tablesample(
281383 mock_sample_with_tablesample , mock_gbq_connector , mock_bigquery_client
282384):
283385 mock_table = mock .Mock (spec = google .cloud .bigquery .Table )
284- type(mock_table ).table_type = mock .PropertyMock (return_value = "TABLE" )
285- type(mock_table ).num_bytes = mock .PropertyMock (return_value = 1_000_000_000_000 )
286- type(mock_table ).num_rows = mock .PropertyMock (return_value = 1_000 )
287- type(mock_table ).schema = mock .PropertyMock (
288- return_value = [google .cloud .bigquery .SchemaField ("col1" , "INT64" )]
289- )
386+ type(mock_table ).project = "my-project"
387+ type(mock_table ).dataset_id = "my_dataset"
388+ type(mock_table ).table_id = "my_table"
389+ type(mock_table ).table_type = "TABLE"
390+ type(mock_table ).num_bytes = 1_000_000_000_000
391+ type(mock_table ).num_rows = 1_000
392+ type(mock_table ).schema = [google .cloud .bigquery .SchemaField ("col1" , "INT64" )]
290393 mock_bigquery_client .get_table .return_value = mock_table
291394
292395 pandas_gbq .core .sample .sample ("my-project.my_dataset.my_table" , target_mb = 1 )
@@ -299,6 +402,9 @@ def test_sample_uses_limit_fallback(
299402 mock_sample_with_limit , mock_gbq_connector , mock_bigquery_client
300403):
301404 mock_table = mock .Mock (spec = google .cloud .bigquery .Table )
405+ mock_table .project = "my-project"
406+ mock_table .dataset_id = "my_dataset"
407+ mock_table .table_id = "my_table"
302408 mock_table .num_bytes = 10000
303409 mock_table .num_rows = 100
304410 mock_table .table_type = "VIEW" # Not eligible for TABLESAMPLE
@@ -318,6 +424,9 @@ def test_sample_uses_limit_fallback_no_bytes(
318424 mock_sample_with_limit , mock_gbq_connector , mock_bigquery_client
319425):
320426 mock_table = mock .Mock (spec = google .cloud .bigquery .Table )
427+ mock_table .project = "my-project"
428+ mock_table .dataset_id = "my_dataset"
429+ mock_table .table_id = "my_table"
321430 mock_table .num_bytes = None # num_bytes can be None
322431 mock_table .num_rows = 100
323432 mock_table .table_type = "TABLE"
0 commit comments