diff --git a/src/apify/request_loaders/_apify_request_list.py b/src/apify/request_loaders/_apify_request_list.py index 2639387f..4a5fe971 100644 --- a/src/apify/request_loaders/_apify_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -2,13 +2,14 @@ import asyncio import re +from copy import deepcopy from itertools import chain from typing import Annotated, Any from pydantic import BaseModel, ConfigDict, Field, TypeAdapter from pydantic.alias_generators import to_camel -from crawlee._types import HttpMethod +from crawlee._types import HttpMethod, JsonSerializable from crawlee.http_clients import HttpClient, ImpitHttpClient from crawlee.request_loaders import RequestList @@ -26,7 +27,7 @@ class _RequestDetails(BaseModel): method: HttpMethod = 'GET' payload: str = '' headers: Annotated[dict[str, str], Field(default_factory=dict)] - user_data: Annotated[dict[str, str], Field(default_factory=dict)] + user_data: Annotated[dict[str, JsonSerializable], Field(default_factory=dict)] class _RequestsFromUrlInput(_RequestDetails): @@ -154,7 +155,9 @@ async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client: method=request_input.method, payload=request_input.payload.encode('utf-8'), headers=request_input.headers, - user_data=request_input.user_data, + # Deep-copy so `Request.from_url` (which writes `__crawlee` into the dict) cannot corrupt + # the shared input, and nested JSON values are not aliased across the requests. + user_data=deepcopy(request_input.user_data), ) for match in matches ] diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index cd35b755..b5edeecf 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -41,13 +41,19 @@ }, id='all_options', ), + pytest.param( + { + 'userData': {'depth': 1, 'isStartUrl': True, 'nested': {'key': 'value'}}, + }, + id='non_string_user_data', + ), ], ) async def test_request_list_open_request_types( request_method: HttpMethod, optional_input: dict[str, Any], ) -> None: - """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" + """Test proper request list generation from various optional inputs for all method types for simple input.""" minimal_request_dict_input = { 'url': 'https://www.abc.com', 'method': request_method, @@ -190,6 +196,33 @@ async def test_request_list_open_from_url_additional_inputs(httpserver: HTTPServ assert request.user_data == expected_user_data +async def test_request_list_open_from_url_with_user_data_and_multiple_urls(httpserver: HTTPServer) -> None: + """Test that a remote source with `userData` yielding multiple URLs creates all requests with that user data.""" + expected_urls = {'https://www.one.com', 'https://www.two.com'} + httpserver.expect_oneshot_request('/file.txt').respond_with_data(status=200, response_data=' '.join(expected_urls)) + + request_list = await ApifyRequestList.open( + request_list_sources_input=[ + {'requestsFromUrl': httpserver.url_for('/file.txt'), 'userData': {'depth': 1, 'nested': {'key': 'value'}}}, + ], + ) + + requests = [] + while request := await request_list.fetch_next_request(): + requests.append(request) + + assert {request.url for request in requests} == expected_urls + for request in requests: + assert request.user_data['depth'] == 1 + assert request.user_data['nested'] == {'key': 'value'} + + # Each request owns an independent copy; mutating one must not leak into the others. + nested = requests[0].user_data['nested'] + assert isinstance(nested, dict) + nested['key'] = 'mutated' + assert requests[1].user_data['nested'] == {'key': 'value'} + + async def test_request_list_open_from_url_non_utf8_body(httpserver: HTTPServer) -> None: """Test that a non-UTF-8 response body does not crash ApifyRequestList.open.""" expected_url = 'https://www.someurl.com'