Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions src/crawlee/request_loaders/_sitemap_request_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import override

from crawlee import Request
from crawlee import Request, RequestOptions
from crawlee._utils.docs import docs_group
from crawlee._utils.globs import Glob
from crawlee._utils.recoverable_state import RecoverableState
Expand All @@ -18,9 +18,10 @@

if TYPE_CHECKING:
import re
from collections.abc import Sequence
from collections.abc import Callable, Sequence
from types import TracebackType

from crawlee import RequestTransformAction
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo
from crawlee.storage_clients.models import ProcessedRequest
Expand Down Expand Up @@ -112,6 +113,7 @@ def __init__(
exclude: list[re.Pattern[Any] | Glob] | None = None,
max_buffer_size: int = 200,
persist_state_key: str | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Buuut... shouldn't this also receive a URL of the origin sitemap?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that makes sense. A sitemap cannot contain links to another domain.
This way, users can easily create a mapping between the original link to the sitemap and the link inside transform_request_function. From my point of view, the most valuable thing that adding transform_request_function gives is the ability to add a label so that the request is processed by the appropriate handler.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes a lot of sense, thanks. But I'm afraid that this won't "click" for a lot of people. Perhaps we could add an example that showcases this?

) -> None:
"""Initialize the sitemap request loader.

Expand All @@ -125,13 +127,17 @@ def __init__(
persist_state_key: A key for persisting the loader's state in the KeyValueStore.
When provided, allows resuming from where it left off after interruption.
If None, no state persistence occurs.
transform_request_function: An optional function to transform requests
generated by the loader. It receives `RequestOptions` with `url` and should return either
modified `RequestOptions` or a `RequestTransformAction`.
"""
self._http_client = http_client
self._sitemap_urls = sitemap_urls
self._include = include
self._exclude = exclude
self._proxy_info = proxy_info
self._max_buffer_size = max_buffer_size
self._transform_request_function = transform_request_function

# Synchronization for queue operations
self._queue_has_capacity = asyncio.Event()
Expand Down Expand Up @@ -313,8 +319,15 @@ async def fetch_next_request(self) -> Request | None:

async with self._queue_lock:
url = state.url_queue.popleft()

request = Request.from_url(url)
request_option = RequestOptions(url=url)
if self._transform_request_function:
transform_request_option = self._transform_request_function(request_option)
if transform_request_option == 'skip':
state.total_count -= 1
continue
if transform_request_option != 'unchanged':
request_option = transform_request_option
request = Request.from_url(**request_option)
state.in_progress.add(request.url)
if len(state.url_queue) < self._max_buffer_size:
self._queue_has_capacity.set()
Expand Down
35 changes: 35 additions & 0 deletions tests/unit/request_loaders/test_sitemap_request_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients._base import HttpClient
from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
from crawlee.storages import KeyValueStore
Expand Down Expand Up @@ -172,3 +173,37 @@ async def test_recovery_data_persistence_for_sitemap_loading(

assert item is not None
assert item.url == next_item_in_kvs


async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
    """Verify that `transform_request_function` is applied to every request the loader yields.

    The transform tags each request via `user_data`, so seeing the tag on every fetched
    request proves the loader routed each URL through the callback.
    """
    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))

    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Mark the request so the assertions below can detect that the transform ran.
        request_options['user_data'] = {'transformed': True}
        return request_options

    sitemap_loader = SitemapRequestLoader(
        [str(sitemap_url)],
        http_client=http_client,
        transform_request_function=transform_request,
    )

    extracted_urls = set()

    # Drain the loader completely, checking each request carries the transform's tag.
    while not await sitemap_loader.is_finished():
        request = await sitemap_loader.fetch_next_request()
        assert request is not None
        assert request.user_data.get('transformed') is True

        extracted_urls.add(request.url)

        await sitemap_loader.mark_request_as_handled(request)

    # Set equality also pins the count, so a separate length assertion is unnecessary.
    assert extracted_urls == {
        'http://not-exists.com/',
        'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
        'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
        'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
        'http://not-exists.com/catalog?item=83&desc=vacation_usa',
    }
Loading