Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/apify/request_loaders/_apify_request_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
from pydantic.alias_generators import to_camel

from crawlee._types import HttpMethod
from crawlee._types import HttpMethod, JsonSerializable
from crawlee.http_clients import HttpClient, ImpitHttpClient
from crawlee.request_loaders import RequestList

Expand All @@ -26,7 +26,7 @@ class _RequestDetails(BaseModel):
method: HttpMethod = 'GET'
payload: str = ''
headers: Annotated[dict[str, str], Field(default_factory=dict)]
user_data: Annotated[dict[str, str], Field(default_factory=dict)]
user_data: Annotated[dict[str, JsonSerializable], Field(default_factory=dict)]


class _RequestsFromUrlInput(_RequestDetails):
Expand Down Expand Up @@ -154,7 +154,9 @@ async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client:
method=request_input.method,
payload=request_input.payload.encode('utf-8'),
headers=request_input.headers,
user_data=request_input.user_data,
# Copy the user data so that `Request.from_url` does not mutate the shared input dict,
# which would break creation of the subsequent requests.
user_data=dict(request_input.user_data),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be deepcopy(request_input.user_data)?

)
for match in matches
]
28 changes: 27 additions & 1 deletion tests/unit/actor/test_request_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,19 @@
},
id='all_options',
),
pytest.param(
{
'userData': {'depth': 1, 'isStartUrl': True, 'nested': {'key': 'value'}},
},
id='non_string_user_data',
),
],
)
async def test_request_list_open_request_types(
request_method: HttpMethod,
optional_input: dict[str, Any],
) -> None:
"""Test proper request list generation from both minimal and full inputs for all method types for simple input."""
"""Test proper request list generation from various optional inputs for all method types for simple input."""
minimal_request_dict_input = {
'url': 'https://www.abc.com',
'method': request_method,
Expand Down Expand Up @@ -190,6 +196,26 @@ async def test_request_list_open_from_url_additional_inputs(httpserver: HTTPServ
assert request.user_data == expected_user_data


async def test_request_list_open_from_url_with_user_data_and_multiple_urls(httpserver: HTTPServer) -> None:
    """Verify that each request built from a multi-URL remote source carries the source's `userData`.

    The remote file lists two URLs separated by a space, and the source entry supplies
    `userData` with `depth` set to 1. Both URLs must be produced, each with that `depth`.
    """
    expected_urls = {'https://www.one.com', 'https://www.two.com'}

    # Serve the URLs as a single space-separated body; the order is irrelevant to the assertions below.
    remote_body = ' '.join(expected_urls)
    httpserver.expect_oneshot_request('/file.txt').respond_with_data(status=200, response_data=remote_body)

    remote_source = {'requestsFromUrl': httpserver.url_for('/file.txt'), 'userData': {'depth': 1}}
    request_list = await ApifyRequestList.open(request_list_sources_input=[remote_source])

    # Drain the list until it stops yielding requests.
    fetched = []
    while True:
        next_request = await request_list.fetch_next_request()
        if not next_request:
            break
        fetched.append(next_request)

    assert {item.url for item in fetched} == expected_urls
    for item in fetched:
        assert item.user_data['depth'] == 1


async def test_request_list_open_from_url_non_utf8_body(httpserver: HTTPServer) -> None:
"""Test that a non-UTF-8 response body does not crash ApifyRequestList.open."""
expected_url = 'https://www.someurl.com'
Expand Down
Loading