
Commit 0e4816d

vdusek and claude authored
test: add e2e tests for Crawlee crawlers as Apify Actors (#784)
## Summary

- Add 6 new E2E tests verifying that each Crawlee crawler type (BasicCrawler, HttpCrawler, BeautifulSoupCrawler, ParselCrawler, PlaywrightCrawler, AdaptivePlaywrightCrawler) works correctly when deployed as an Actor on the Apify platform.
- Each test deploys an Actor that crawls a local 5-page e-commerce test server, exercises link discovery (`enqueue_links` / `add_requests`), data extraction (`push_data`), and KVS storage (`Actor.set_value`).
- Shared test infrastructure in `conftest.py`: ASGI test server, Playwright Dockerfile template, product data expectations, and a `_verify_crawler_results` helper that checks run status, dataset contents, and KVS records.

## Motivation

- Ensuring that Crawlee crawlers can run on the Apify platform.
- There are many interactions with Apify storages, which also helps validate the Apify storage client.

## Issue

- Relates: #785

## Test plan

- [x] CI passes

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7b24df3 commit 0e4816d

20 files changed

Lines changed: 585 additions & 155 deletions
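Roughly, each of the new tests follows the shape sketched below. This is a hypothetical reconstruction: the `make_actor` fixture matches the `conftest.py` hunk further down, the checks that `_verify_crawler_results` performs are inlined here as plain assertions, and the label, `MAIN_PY_SOURCE` placeholder, and requirement string are illustrative rather than taken from the committed tests.

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from apify_client import ApifyClientAsync


async def test_crawler_actor(make_actor, apify_client_async: ApifyClientAsync) -> None:
    # Deploy a temporary Actor whose src/main.py runs one of the crawlers
    # shown below against the local e-commerce test server.
    actor = await make_actor(
        'beautifulsoup-crawler',  # hypothetical Actor label
        main_py=MAIN_PY_SOURCE,  # hypothetical: source of one of the files below
        additional_requirements=['crawlee[beautifulsoup]'],  # illustrative extra
    )

    run = await actor.call()
    assert run is not None
    assert run['status'] == 'SUCCEEDED'

    # One dataset item per product page (3 widgets served by server.py).
    dataset = apify_client_async.dataset(run['defaultDatasetId'])
    items = (await dataset.list_items()).items
    assert len(items) == 3

    # The Actor also stores a summary record in its default key-value store.
    kvs = apify_client_async.key_value_store(run['defaultKeyValueStoreId'])
    record = await kvs.get_record('CRAWLER_RESULT')
    assert record is not None
    assert record['value']['crawler_type'] == 'BeautifulSoupCrawler'
```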

tests/e2e/actor_source_base/server.py

Lines changed: 99 additions & 60 deletions
```diff
@@ -1,94 +1,133 @@
+"""Test HTTP server for e2e tests.
+
+Serves an e-commerce test website with a category-based structure for testing crawl depth:
+
+/ (depth 0) - Homepage with links to products, categories, about page, and deep chain
+/categories/electronics (depth 1) - Links to products 1 and 2
+/categories/home (depth 1) - Links to product 3
+/about (depth 1) - About page
+/deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain)
+/products/1 (depth 1 or 2) - Widget A
+/products/2 (depth 1 or 2) - Widget B
+/products/3 (depth 1 or 2) - Widget C
+
+The homepage includes both direct product links (for Scrapy spiders that look for /products/ links
+on the start page) and category links (for testing crawl depth with Crawlee crawlers).
+With max_crawl_depth=2, the crawler reaches all products and categories but does not go beyond /deep/2.
 """
-Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
-For example:
-http://localhost:8080/ contains links:
-http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
 
-http://localhost:8080/1 contains links:
-http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19
-
-... and so on.
-"""
+from __future__ import annotations
 
 import asyncio
 import logging
 from collections.abc import Awaitable, Callable, Coroutine
-from socket import socket
 from typing import Any
 
 from uvicorn import Config
 from uvicorn.server import Server
-from yarl import URL
 
 Receive = Callable[[], Awaitable[dict[str, Any]]]
 Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
 
+_PRODUCTS = {
+    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
 
-async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
-    """Send an HTML response to the client."""
+
+async def _send_html(send: Send, html: str, status: int = 200) -> None:
     await send(
         {
             'type': 'http.response.start',
             'status': status,
             'headers': [[b'content-type', b'text/html; charset=utf-8']],
         }
     )
-    await send({'type': 'http.response.body', 'body': html_content})
-
+    await send({'type': 'http.response.body', 'body': html.encode()})
 
-async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
-    """Main ASGI application handler that routes requests to specific handlers.
 
-    Args:
-        scope: The ASGI connection scope.
-        _: The ASGI receive function.
-        send: The ASGI send function.
-    """
+async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     assert scope['type'] == 'http'
     path = scope['path']
 
-    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
-    await send_html_response(
-        send,
-        f"""\
-<html><head>
-<title>Title for {path} </title>
-</head>
-<body>
-{links}
-</body></html>""".encode(),
-    )
-
-
-class TestServer(Server):
-    """A test HTTP server implementation based on Uvicorn Server."""
-
-    @property
-    def url(self) -> URL:
-        """Get the base URL of the server.
-
-        Returns:
-            A URL instance with the server's base URL.
-        """
-        protocol = 'https' if self.config.is_ssl else 'http'
-        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')
-
-    async def serve(self, sockets: list[socket] | None = None) -> None:
-        """Run the server."""
-        if sockets:
-            raise RuntimeError('Simple TestServer does not support custom sockets')
-        self.restart_requested = asyncio.Event()
-
-        loop = asyncio.get_event_loop()
-        tasks = {
-            loop.create_task(super().serve()),
-        }
-        await asyncio.wait(tasks)
+    if path == '/':
+        await _send_html(
+            send,
+            '<html><head><title>E-commerce Test Store</title></head><body>'
+            '<h1>Welcome to Test Store</h1>'
+            '<a href="/products/1">Widget A</a>'
+            '<a href="/products/2">Widget B</a>'
+            '<a href="/products/3">Widget C</a>'
+            '<a href="/categories/electronics">Electronics</a>'
+            '<a href="/categories/home">Home &amp; Garden</a>'
+            '<a href="/about">About Us</a>'
+            '<a href="/deep/1">Explore More</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/electronics':
+        await _send_html(
+            send,
+            '<html><head><title>Electronics</title></head><body>'
+            '<h1>Electronics</h1>'
+            '<a href="/products/1">Widget A</a>'
+            '<a href="/products/2">Widget B</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/home':
+        await _send_html(
+            send,
+            '<html><head><title>Home &amp; Garden</title></head><body>'
+            '<h1>Home &amp; Garden</h1>'
+            '<a href="/products/3">Widget C</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/products/'):
+        product = _PRODUCTS.get(path.split('/')[-1])
+        if product:
+            await _send_html(
+                send,
+                f'<html><head><title>{product["name"]}</title></head><body>'
+                f'<h1>{product["name"]}</h1>'
+                f'<span class="price">{product["price"]}</span>'
+                f'<p class="description">{product["description"]}</p>'
+                f'<a href="/">Back to Home</a>'
+                f'</body></html>',
+            )
+        else:
+            await _send_html(send, '<html><body>Not Found</body></html>', 404)
+    elif path == '/about':
+        await _send_html(
+            send,
+            '<html><head><title>About Us</title></head><body>'
+            '<h1>About Test Store</h1>'
+            '<p class="description">We sell the best widgets in the world.</p>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/deep/'):
+        try:
+            n = int(path.split('/')[-1])
+        except ValueError:
+            await _send_html(send, '<html><body>Not Found</body></html>', 404)
+            return
+        await _send_html(
+            send,
+            f'<html><head><title>Deep Page {n}</title></head><body>'
+            f'<h1>Deep Page {n}</h1>'
+            f'<a href="/deep/{n + 1}">Go Deeper</a>'
+            f'<a href="/">Back to Home</a>'
+            f'</body></html>',
+        )
+    else:
+        await _send_html(send, '<html><body>Not Found</body></html>', 404)
 
 
 if __name__ == '__main__':
     asyncio.run(
-        TestServer(
+        Server(
             config=Config(
                 app=app,
                 lifespan='off',
```
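A quick way to sanity-check the site layout described in the docstring is the minimal sketch below. It is not part of the commit; it assumes `httpx` is installed and the server above is already running on port 8080.

```python
import asyncio

import httpx


async def check() -> None:
    async with httpx.AsyncClient(base_url='http://localhost:8080') as client:
        # Homepage exposes both direct product links and category links.
        home = await client.get('/')
        assert 'E-commerce Test Store' in home.text
        # Product pages carry the name, price, and description markup.
        product = await client.get('/products/1')
        assert 'Widget A' in product.text
        assert '$19.99' in product.text
        # Unknown products fall through to the 404 branch.
        missing = await client.get('/products/99')
        assert missing.status_code == 404


asyncio.run(check())
```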

tests/e2e/conftest.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -168,12 +168,9 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]:
         'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}'
     )
 
-    current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]])
-    integration_tests_python_version = (
-        os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version
-    )
+    python_version = f'{sys.version_info[0]}.{sys.version_info[1]}'
     source_files['Dockerfile'] = str(source_files['Dockerfile']).replace(
-        'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version
+        'BASE_IMAGE_VERSION_PLACEHOLDER', python_version
     )
 
     return source_files
@@ -190,6 +187,7 @@ def __call__(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> Awaitable[ActorClientAsync]:
         """Create a temporary Actor from the given main function or source files.
 
@@ -204,6 +202,7 @@ def __call__(
             main_py: The `src/main.py` file of the Actor.
             source_files: A dictionary of the source files of the Actor.
             additional_requirements: A list of additional requirements to be added to the `requirements.txt`.
+            memory_mbytes: The default memory allocation for the Actor run in MB.
 
         Returns:
             A resource client for the created Actor.
@@ -229,6 +228,7 @@ async def _make_actor(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> ActorClientAsync:
         if not (main_func or main_py or source_files):
             raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified')
@@ -298,7 +298,7 @@ async def _make_actor(
         created_actor = await client.actors().create(
             name=actor_name,
             default_run_build='latest',
-            default_run_memory_mbytes=256,
+            default_run_memory_mbytes=memory_mbytes,
             default_run_timeout_secs=600,
             versions=[
                 {
```
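The new `memory_mbytes` knob is mainly for the browser-based crawler tests, which need more than the 256 MB default. A hypothetical call site (the label and value here are illustrative, not taken from this diff):

```python
# Browser-based crawlers request extra memory; HTTP-only crawlers
# keep the 256 MB default by omitting the argument.
actor = await make_actor(
    'playwright-crawler',  # hypothetical Actor label
    source_files=source_files,
    memory_mbytes=2048,  # illustrative value for Chromium headroom
)
```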

tests/e2e/test_crawlee/__init__.py

Whitespace-only changes.
Lines changed: 40 additions & 0 deletions
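The `AdaptivePlaywrightCrawler` test Actor uses the Parsel static parser and extracts product data with CSS selectors: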
```python
from __future__ import annotations

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

from apify import Actor


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
            pages_visited.append(context.request.url)
            await context.enqueue_links()

            if '/products/' in context.request.url:
                name = context.parsed_content.css('h1::text').get('').strip()
                price = context.parsed_content.css('span.price::text').get('').strip()
                description = context.parsed_content.css('p.description::text').get('').strip()
                if name:
                    await context.push_data(
                        {
                            'url': context.request.url,
                            'name': name,
                            'price': price,
                            'description': description,
                        }
                    )

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'AdaptivePlaywrightCrawler',
            },
        )
```
Lines changed: 74 additions & 0 deletions
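The `BasicCrawler` variant has no built-in HTML parsing or link extraction, so its Actor brings a small `html.parser`-based extractor and enqueues discovered links manually via `add_requests`: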
```python
from __future__ import annotations

from html.parser import HTMLParser
from typing import TYPE_CHECKING

from crawlee.crawlers import BasicCrawler

from apify import Actor

if TYPE_CHECKING:
    from crawlee._types import BasicCrawlingContext


class _PageParser(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []
        self.data: dict[str, str] = {}
        self._in_tag: str | None = None
        self._in_class: str = ''

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)
        if tag == 'a' and (href := attrs_dict.get('href')):
            self.links.append(href)
        self._in_tag = tag
        self._in_class = attrs_dict.get('class', '') or ''

    def handle_endtag(self, tag: str) -> None:  # noqa: ARG002
        self._in_tag = None
        self._in_class = ''

    def handle_data(self, data: str) -> None:
        text = data.strip()
        if not text:
            return
        if self._in_tag == 'h1':
            self.data['name'] = text
        elif self._in_tag == 'span' and self._in_class == 'price':
            self.data['price'] = text
        elif self._in_tag == 'p' and self._in_class == 'description':
            self.data['description'] = text


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = BasicCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            pages_visited.append(context.request.url)

            response = await context.send_request(context.request.url)
            html = (await response.read()).decode()

            parser = _PageParser()
            parser.feed(html)

            base_url = 'http://localhost:8080'
            await context.add_requests([f'{base_url}{link}' for link in parser.links if link.startswith('/')])

            if '/products/' in context.request.url and parser.data.get('name'):
                await context.push_data({'url': context.request.url, **parser.data})

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'BasicCrawler',
            },
        )
```
Lines changed: 40 additions & 0 deletions
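The `BeautifulSoupCrawler` variant relies on the built-in `enqueue_links` and queries the parsed soup directly: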
```python
from __future__ import annotations

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = BeautifulSoupCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            pages_visited.append(context.request.url)
            await context.enqueue_links()

            if '/products/' in context.request.url:
                name_tag = context.soup.find('h1')
                price_tag = context.soup.find('span', class_='price')
                desc_tag = context.soup.find('p', class_='description')
                if name_tag:
                    await context.push_data(
                        {
                            'url': context.request.url,
                            'name': name_tag.get_text(strip=True),
                            'price': price_tag.get_text(strip=True) if price_tag else '',
                            'description': desc_tag.get_text(strip=True) if desc_tag else '',
                        }
                    )

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'BeautifulSoupCrawler',
            },
        )
```
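Whichever crawler runs, the `_PRODUCTS` table in `server.py` pins down what should land in the dataset. A sketch of the expected items (the actual expectation constant in `conftest.py` is not shown in this diff, so names here are illustrative):

```python
# One dataset item per product page; URLs assume the local test server
# on port 8080, values mirror _PRODUCTS in server.py.
EXPECTED_ITEMS = [
    {
        'url': 'http://localhost:8080/products/1',
        'name': 'Widget A',
        'price': '$19.99',
        'description': 'A basic widget for everyday use',
    },
    {
        'url': 'http://localhost:8080/products/2',
        'name': 'Widget B',
        'price': '$29.99',
        'description': 'An advanced widget with extra features',
    },
    {
        'url': 'http://localhost:8080/products/3',
        'name': 'Widget C',
        'price': '$39.99',
        'description': 'A premium widget for professionals',
    },
]
```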
