From 1800d5d089a20c30a389154da96dcd7baab4c54a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 11 Jun 2026 15:12:41 +0200 Subject: [PATCH 1/3] fix: accept arbitrary JSON userData in ApifyRequestList --- .../request_loaders/_apify_request_list.py | 8 ++++-- tests/unit/actor/test_request_list.py | 28 ++++++++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/apify/request_loaders/_apify_request_list.py b/src/apify/request_loaders/_apify_request_list.py index 2639387f2..3b21fcedb 100644 --- a/src/apify/request_loaders/_apify_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, ConfigDict, Field, TypeAdapter from pydantic.alias_generators import to_camel -from crawlee._types import HttpMethod +from crawlee._types import HttpMethod, JsonSerializable from crawlee.http_clients import HttpClient, ImpitHttpClient from crawlee.request_loaders import RequestList @@ -26,7 +26,7 @@ class _RequestDetails(BaseModel): method: HttpMethod = 'GET' payload: str = '' headers: Annotated[dict[str, str], Field(default_factory=dict)] - user_data: Annotated[dict[str, str], Field(default_factory=dict)] + user_data: Annotated[dict[str, JsonSerializable], Field(default_factory=dict)] class _RequestsFromUrlInput(_RequestDetails): @@ -154,7 +154,9 @@ async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client: method=request_input.method, payload=request_input.payload.encode('utf-8'), headers=request_input.headers, - user_data=request_input.user_data, + # Copy the user data so that `Request.from_url` does not mutate the shared input dict, + # which would break creation of the subsequent requests. + user_data=dict(request_input.user_data), ) for match in matches ] diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index cd35b7557..1d2d61623 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -41,13 +41,19 @@ }, id='all_options', ), + pytest.param( + { + 'userData': {'depth': 1, 'isStartUrl': True, 'nested': {'key': 'value'}}, + }, + id='non_string_user_data', + ), ], ) async def test_request_list_open_request_types( request_method: HttpMethod, optional_input: dict[str, Any], ) -> None: - """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" + """Test proper request list generation from various optional inputs for all method types for simple input.""" minimal_request_dict_input = { 'url': 'https://www.abc.com', 'method': request_method, @@ -190,6 +196,26 @@ async def test_request_list_open_from_url_additional_inputs(httpserver: HTTPServ assert request.user_data == expected_user_data +async def test_request_list_open_from_url_with_user_data_and_multiple_urls(httpserver: HTTPServer) -> None: + """Test that a remote source with `userData` yielding multiple URLs creates all requests with that user data.""" + expected_urls = {'https://www.one.com', 'https://www.two.com'} + httpserver.expect_oneshot_request('/file.txt').respond_with_data(status=200, response_data=' '.join(expected_urls)) + + request_list = await ApifyRequestList.open( + request_list_sources_input=[ + {'requestsFromUrl': httpserver.url_for('/file.txt'), 'userData': {'depth': 1}}, + ], + ) + + requests = [] + while request := await request_list.fetch_next_request(): + requests.append(request) + + assert {request.url for request in requests} == expected_urls + for request in requests: + assert request.user_data['depth'] == 1 + + async def test_request_list_open_from_url_non_utf8_body(httpserver: HTTPServer) -> None: """Test that a non-UTF-8 response body does not crash ApifyRequestList.open.""" expected_url = 'https://www.someurl.com' From af0df85a7b4b1cddc161524a4e24941441603575 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 12 Jun 2026 12:58:48 +0200 Subject: [PATCH 2/3] address feedback --- src/apify/request_loaders/_apify_request_list.py | 7 ++++--- tests/unit/actor/test_request_list.py | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/apify/request_loaders/_apify_request_list.py b/src/apify/request_loaders/_apify_request_list.py index 3b21fcedb..4a5fe9711 100644 --- a/src/apify/request_loaders/_apify_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -2,6 +2,7 @@ import asyncio import re +from copy import deepcopy from itertools import chain from typing import Annotated, Any @@ -154,9 +155,9 @@ async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client: method=request_input.method, payload=request_input.payload.encode('utf-8'), headers=request_input.headers, - # Copy the user data so that `Request.from_url` does not mutate the shared input dict, - # which would break creation of the subsequent requests. - user_data=dict(request_input.user_data), + # Deep-copy so `Request.from_url` (which writes `__crawlee` into the dict) cannot corrupt + # the shared input, and nested JSON values are not aliased across the requests. + user_data=deepcopy(request_input.user_data), ) for match in matches ] diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index 1d2d61623..e806b4c36 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -203,7 +203,7 @@ async def test_request_list_open_from_url_with_user_data_and_multiple_urls(https request_list = await ApifyRequestList.open( request_list_sources_input=[ - {'requestsFromUrl': httpserver.url_for('/file.txt'), 'userData': {'depth': 1}}, + {'requestsFromUrl': httpserver.url_for('/file.txt'), 'userData': {'depth': 1, 'nested': {'key': 'value'}}}, ], ) @@ -214,6 +214,11 @@ async def test_request_list_open_from_url_with_user_data_and_multiple_urls(https assert {request.url for request in requests} == expected_urls for request in requests: assert request.user_data['depth'] == 1 + assert request.user_data['nested'] == {'key': 'value'} + + # Each request owns an independent copy; mutating one must not leak into the others. + requests[0].user_data['nested']['key'] = 'mutated' + assert requests[1].user_data['nested'] == {'key': 'value'} async def test_request_list_open_from_url_non_utf8_body(httpserver: HTTPServer) -> None: From 0ce07669c0ba5b073cd4de1cd50828cae2de0cf0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 12 Jun 2026 13:03:05 +0200 Subject: [PATCH 3/3] test: narrow user_data type before nested mutation to fix type check --- tests/unit/actor/test_request_list.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index e806b4c36..b5edeecf3 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -217,7 +217,9 @@ async def test_request_list_open_from_url_with_user_data_and_multiple_urls(https assert request.user_data['nested'] == {'key': 'value'} # Each request owns an independent copy; mutating one must not leak into the others. - requests[0].user_data['nested']['key'] = 'mutated' + nested = requests[0].user_data['nested'] + assert isinstance(nested, dict) + nested['key'] = 'mutated' assert requests[1].user_data['nested'] == {'key': 'value'}