diff --git a/docs/examples/code/fill_and_submit_web_form_crawler.py b/docs/examples/code/fill_and_submit_web_form_crawler.py
index 7cd61be87..ef7db770a 100644
--- a/docs/examples/code/fill_and_submit_web_form_crawler.py
+++ b/docs/examples/code/fill_and_submit_web_form_crawler.py
@@ -1,4 +1,5 @@
 import asyncio
+import json
 
 from crawlee import Request
 from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
@@ -18,15 +19,17 @@ async def request_handler(context: HttpCrawlingContext) -> None:
     request = Request.from_url(
         url='https://httpbin.org/post',
         method='POST',
-        data={
-            'custname': 'John Doe',
-            'custtel': '1234567890',
-            'custemail': 'johndoe@example.com',
-            'size': 'large',
-            'topping': ['bacon', 'cheese', 'mushroom'],
-            'delivery': '13:00',
-            'comments': 'Please ring the doorbell upon arrival.',
-        },
+        payload=json.dumps(
+            {
+                'custname': 'John Doe',
+                'custtel': '1234567890',
+                'custemail': 'johndoe@example.com',
+                'size': 'large',
+                'topping': ['bacon', 'cheese', 'mushroom'],
+                'delivery': '13:00',
+                'comments': 'Please ring the doorbell upon arrival.',
+            }
+        ).encode(),
     )
 
     # Run the crawler with the initial list of requests.
diff --git a/docs/examples/code/fill_and_submit_web_form_request.py b/docs/examples/code/fill_and_submit_web_form_request.py
index 379eaec26..91eb65dff 100644
--- a/docs/examples/code/fill_and_submit_web_form_request.py
+++ b/docs/examples/code/fill_and_submit_web_form_request.py
@@ -1,16 +1,20 @@
+import json
+
 from crawlee import Request
 
 # Prepare a POST request to the form endpoint.
 request = Request.from_url(
     url='https://httpbin.org/post',
     method='POST',
-    data={
-        'custname': 'John Doe',
-        'custtel': '1234567890',
-        'custemail': 'johndoe@example.com',
-        'size': 'large',
-        'topping': ['bacon', 'cheese', 'mushroom'],
-        'delivery': '13:00',
-        'comments': 'Please ring the doorbell upon arrival.',
-    },
+    payload=json.dumps(
+        {
+            'custname': 'John Doe',
+            'custtel': '1234567890',
+            'custemail': 'johndoe@example.com',
+            'size': 'large',
+            'topping': ['bacon', 'cheese', 'mushroom'],
+            'delivery': '13:00',
+            'comments': 'Please ring the doorbell upon arrival.',
+        }
+    ).encode(),
 )
diff --git a/docs/examples/fill_and_submit_web_form.mdx b/docs/examples/fill_and_submit_web_form.mdx
index 49a8c7d2e..8498bb301 100644
--- a/docs/examples/fill_and_submit_web_form.mdx
+++ b/docs/examples/fill_and_submit_web_form.mdx
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
 
 {RequestExample}
 
-Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `data` parameter is generally a better approach.
+Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` parameter is generally a better approach.
 
 ## Implementing the crawler
diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py
index d504989ac..7ed179b75 100644
--- a/src/crawlee/_request.py
+++ b/src/crawlee/_request.py
@@ -127,15 +127,14 @@ class BaseRequestData(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""
 
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders())] = HttpHeaders()
+    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
     """HTTP request headers."""
 
     query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
     """URL query parameters."""
 
     payload: HttpPayload | None = None
-
-    data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}
+    """HTTP request payload."""
 
     user_data: Annotated[
         dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
@@ -169,6 +168,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -178,9 +179,13 @@ def from_url(
         **kwargs: Any,
     ) -> Self:
         """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
+        headers = headers or HttpHeaders()
+        query_params = query_params or {}
+
         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
+            headers=headers,
             payload=payload,
             keep_url_fragment=keep_url_fragment,
             use_extended_unique_key=use_extended_unique_key,
         )
@@ -193,6 +198,8 @@
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
@@ -243,6 +250,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -261,6 +270,8 @@ def from_url(
         Args:
             url: The URL of the request.
             method: The HTTP method of the request.
+            headers: The HTTP headers of the request.
+            query_params: The query parameters of the URL.
             payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
             label: A custom label to differentiate between request types. This is stored in `user_data`, and it
                 is used for request routing (different requests go to different handlers).
@@ -274,9 +285,13 @@
                 computation. This is only relevant when `unique_key` is not provided.
             **kwargs: Additional request properties.
         """
+        headers = headers or HttpHeaders()
+        query_params = query_params or {}
+
         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
+            headers=headers,
             payload=payload,
             keep_url_fragment=keep_url_fragment,
             use_extended_unique_key=use_extended_unique_key,
         )
@@ -289,6 +304,8 @@
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
@@ -377,6 +394,36 @@ def forefront(self) -> bool:
     def forefront(self, new_value: bool) -> None:
         self.crawlee_data.forefront = new_value
 
+    def __eq__(self, other: object) -> bool:
+        """Compare all relevant fields of the `Request` class, excluding deprecated fields `json_` and `order_no`.
+
+        TODO: Remove this method once the issue is resolved.
+        https://github.com/apify/crawlee-python/issues/94
+        """
+        if isinstance(other, Request):
+            return (
+                self.url == other.url
+                and self.unique_key == other.unique_key
+                and self.method == other.method
+                and self.headers == other.headers
+                and self.query_params == other.query_params
+                and self.payload == other.payload
+                and self.user_data == other.user_data
+                and self.retry_count == other.retry_count
+                and self.no_retry == other.no_retry
+                and self.loaded_url == other.loaded_url
+                and self.handled_at == other.handled_at
+                and self.id == other.id
+                and self.label == other.label
+                and self.state == other.state
+                and self.max_retries == other.max_retries
+                and self.session_rotation_count == other.session_rotation_count
+                and self.enqueue_strategy == other.enqueue_strategy
+                and self.last_proxy_tier == other.last_proxy_tier
+                and self.forefront == other.forefront
+            )
+        return NotImplemented
+
 
 class RequestWithLock(Request):
     """A crawling request with information about locks."""
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index af5e53d00..84dc04a2e 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -52,7 +52,7 @@
 
 HttpQueryParams: TypeAlias = dict[str, str]
 
-HttpPayload: TypeAlias = Union[str, bytes]
+HttpPayload: TypeAlias = bytes
 
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py
index 3eb643d4b..24675529c 100644
--- a/src/crawlee/_utils/requests.py
+++ b/src/crawlee/_utils/requests.py
@@ -142,13 +142,7 @@ def compute_unique_key(
 
 
 def _get_payload_hash(payload: HttpPayload | None) -> str:
-    if payload is None:
-        payload_in_bytes = b''
-    elif isinstance(payload, str):
-        payload_in_bytes = payload.encode('utf-8')
-    else:
-        payload_in_bytes = payload
-
+    payload_in_bytes = b'' if payload is None else payload
     return compute_short_hash(payload_in_bytes)
diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py
index 7edafc10b..bbfa80f3b 100644
--- a/src/crawlee/http_clients/_base.py
+++ b/src/crawlee/http_clients/_base.py
@@ -2,7 +2,7 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Protocol
+from typing import TYPE_CHECKING, Protocol
 
 from crawlee._utils.http import is_status_code_error
 from crawlee.errors import HttpStatusCodeError
@@ -10,7 +10,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
+    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -115,7 +115,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -128,7 +128,7 @@ async def send_request(
             method: The HTTP method to use.
             headers: The headers to include in the request.
             query_params: The query parameters to include in the request.
-            data: The data to be sent as the request body.
+            payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py
index 2416bb043..f068bb570 100644
--- a/src/crawlee/http_clients/_httpx.py
+++ b/src/crawlee/http_clients/_httpx.py
@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpMethod, HttpQueryParams
+    from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.statistics import Statistics
@@ -132,7 +132,7 @@ async def crawl(
             method=request.method,
             headers=headers,
             params=request.query_params,
-            data=request.data,
+            content=request.payload,
             cookies=session.cookies if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
@@ -167,7 +167,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -179,7 +179,7 @@ async def send_request(
             method=method,
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py
index 005431645..81f903954 100644
--- a/src/crawlee/http_clients/curl_impersonate.py
+++ b/src/crawlee/http_clients/curl_impersonate.py
@@ -16,7 +16,7 @@
 from curl_cffi.const import CurlHttpVersion
 from typing_extensions import override
 
-from crawlee._types import HttpHeaders
+from crawlee._types import HttpHeaders, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee.errors import ProxyError
 from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse
@@ -153,7 +153,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -166,7 +166,7 @@
             method=method.upper(),  # type: ignore # curl-cffi requires uppercase method
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            data=payload,
             cookies=session.cookies if session else None,
             allow_redirects=True,
         )
diff --git a/src/crawlee/memory_storage_client/_request_queue_client.py b/src/crawlee/memory_storage_client/_request_queue_client.py
index f6236305c..2de67bb03 100644
--- a/src/crawlee/memory_storage_client/_request_queue_client.py
+++ b/src/crawlee/memory_storage_client/_request_queue_client.py
@@ -268,7 +268,7 @@ async def add_request(
             persist_storage=self._memory_storage_client.persist_storage,
         )
 
-        # We return wasAlreadyHandled is false even though the request may have been added as handled,
+        # We return was_already_handled=False even though the request may have been added as handled,
         # because that's how API behaves.
         return ProcessedRequest(
             id=request_model.id,
@@ -519,15 +519,17 @@ async def _create_internal_request(self, request: Request, forefront: bool | Non
         if request.id is not None and request.id != id:
             raise ValueError('Request ID does not match its unique_key.')
 
-        json_request = await json_dumps({**(request.model_dump()), 'id': id})
+        request_kwargs = {
+            **(request.model_dump()),
+            'id': id,
+            'order_no': order_no,
+        }
+
+        del request_kwargs['json_']
+
         return Request(
-            url=request.url,
-            unique_key=request.unique_key,
-            id=id,
-            method=request.method,
-            retry_count=request.retry_count,
-            order_no=order_no,
-            json_=json_request,
+            **request_kwargs,
+            json_=await json_dumps(request_kwargs),
         )
 
     def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
@@ -538,7 +540,7 @@ def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decim
         timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000
         timestamp = round(timestamp, 6)
 
-        # Make sure that this timestamp was not used yet, so that we have unique orderNos
+        # Make sure that this timestamp was not used yet, so that we have unique order_nos
         if timestamp <= self._last_used_timestamp:
             timestamp = self._last_used_timestamp + Decimal(0.000001)
diff --git a/tests/unit/_utils/test_requests.py b/tests/unit/_utils/test_requests.py
index 1dcdf64c6..007f18b0e 100644
--- a/tests/unit/_utils/test_requests.py
+++ b/tests/unit/_utils/test_requests.py
@@ -91,7 +91,7 @@ def test_compute_unique_key_handles_fragments() -> None:
 
 def test_compute_unique_key_handles_payload() -> None:
     url = 'https://crawlee.dev'
-    payload = '{"key": "value"}'
+    payload = b'{"key": "value"}'
 
     # Payload without extended unique key
     uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=False)
@@ -101,12 +101,8 @@
     uk = compute_unique_key(url, method='POST', payload=None, use_extended_unique_key=True)
     assert uk == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'
 
-    # Extended unique key and payload is string
-    uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True)
-    assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'
-
     # Extended unique key and payload is bytes
-    uk = compute_unique_key(url, method='POST', payload=payload.encode(), use_extended_unique_key=True)
+    uk = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=True)
     assert uk == 'POST|e3b0c442|9724c1e2|https://crawlee.dev'
diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py
index 63aa8243d..afc9e79c7 100644
--- a/tests/unit/http_crawler/test_http_crawler.py
+++ b/tests/unit/http_crawler/test_http_crawler.py
@@ -1,17 +1,23 @@
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING, AsyncGenerator, Awaitable, Callable
 from unittest.mock import AsyncMock, Mock
+from urllib.parse import parse_qs, urlencode
 
 import pytest
 import respx
 from httpx import Response
 
+from crawlee._request import Request
+from crawlee.http_clients._httpx import HttpxHttpClient
+from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
 from crawlee.http_crawler import HttpCrawler
 from crawlee.sessions import SessionPool
 from crawlee.storages import RequestList
 
 if TYPE_CHECKING:
+    from crawlee.http_clients._base import BaseHttpClient
     from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext
@@ -201,3 +207,51 @@ async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRo
     assert len(server['html_endpoint'].calls) == 10
     assert len(server['404_endpoint'].calls) == 10
     assert len(server['500_endpoint'].calls) == 30
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [CurlImpersonateHttpClient, HttpxHttpClient],
+    ids=['curl', 'httpx'],
+)
+async def test_sending_payload(http_client_class: type[BaseHttpClient]) -> None:
+    http_client = http_client_class()
+    crawler = HttpCrawler(http_client=http_client)
+
+    # Payload, e.g. data from a form submission.
+    payload = {
+        'custname': 'John Doe',
+        'custtel': '1234567890',
+        'custemail': 'johndoe@example.com',
+        'size': 'large',
+        'topping': '["bacon", "cheese", "mushroom"]',
+        'delivery': '13:00',
+        'comments': 'Please ring the doorbell upon arrival.',
+    }
+
+    responses = []
+
+    @crawler.router.default_handler
+    async def request_handler(context: HttpCrawlingContext) -> None:
+        response = json.loads(context.http_response.read())
+        # The httpbin.org/post endpoint returns the provided payload in the response.
+        responses.append(response)
+
+    request = Request.from_url(
+        url='https://httpbin.org/post',
+        method='POST',
+        payload=urlencode(payload).encode(),
+    )
+
+    await crawler.run([request])
+
+    # The request handler should be called once.
+    assert len(responses) == 1
+
+    # The reconstructed payload data should match the original payload. We have to flatten the values, because
+    # parse_qs returns a list of values for each key.
+    response_data = {
+        k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data'].strip("b'").strip("'")).items()
+    }
+
+    assert response_data == payload