
Commit 0e4816d

vdusek and claude authored
test: add e2e tests for Crawlee crawlers as Apify Actors (#784)
## Summary

- Add 6 new E2E tests verifying that each Crawlee crawler type (BasicCrawler, HttpCrawler, BeautifulSoupCrawler, ParselCrawler, PlaywrightCrawler, AdaptivePlaywrightCrawler) works correctly when deployed as an Actor on the Apify platform.
- Each test deploys an Actor that crawls a local 5-page e-commerce test server, exercises link discovery (`enqueue_links` / `add_requests`), data extraction (`push_data`), and KVS storage (`Actor.set_value`).
- Shared test infrastructure in `conftest.py`: ASGI test server, Playwright Dockerfile template, product data expectations, and a `_verify_crawler_results` helper that checks run status, dataset contents, and KVS records.

## Motivation

- Ensuring that Crawlee crawlers can run on the Apify platform.
- There are many interactions with Apify storages, which also helps validate the Apify storage client.

## Issue

- Relates: #785

## Test plan

- [x] CI passes

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7b24df3 commit 0e4816d

20 files changed

Lines changed: 585 additions & 155 deletions
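Roughly, each of the new tests follows the shape sketched below. This is a hypothetical reconstruction: the `make_actor` fixture matches the `conftest.py` hunk further down, the checks that `_verify_crawler_results` performs are inlined here as plain assertions, and the label, `MAIN_PY_SOURCE` placeholder, and requirement string are illustrative rather than taken from the committed tests.

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from apify_client import ApifyClientAsync


async def test_crawler_actor(make_actor, apify_client_async: ApifyClientAsync) -> None:
    # Deploy a temporary Actor whose src/main.py runs one of the crawlers
    # shown below against the local e-commerce test server.
    actor = await make_actor(
        'beautifulsoup-crawler',  # hypothetical Actor label
        main_py=MAIN_PY_SOURCE,  # hypothetical: source of one of the files below
        additional_requirements=['crawlee[beautifulsoup]'],  # illustrative extra
    )

    run = await actor.call()
    assert run is not None
    assert run['status'] == 'SUCCEEDED'

    # One dataset item per product page (3 widgets served by server.py).
    dataset = apify_client_async.dataset(run['defaultDatasetId'])
    items = (await dataset.list_items()).items
    assert len(items) == 3

    # The Actor also stores a summary record in its default key-value store.
    kvs = apify_client_async.key_value_store(run['defaultKeyValueStoreId'])
    record = await kvs.get_record('CRAWLER_RESULT')
    assert record is not None
    assert record['value']['crawler_type'] == 'BeautifulSoupCrawler'
```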

tests/e2e/actor_source_base/server.py

Lines changed: 99 additions & 60 deletions
```diff
@@ -1,94 +1,133 @@
+"""Test HTTP server for e2e tests.
+
+Serves an e-commerce test website with a category-based structure for testing crawl depth:
+
+/ (depth 0) - Homepage with links to products, categories, about page, and deep chain
+/categories/electronics (depth 1) - Links to products 1 and 2
+/categories/home (depth 1) - Links to product 3
+/about (depth 1) - About page
+/deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain)
+/products/1 (depth 1 or 2) - Widget A
+/products/2 (depth 1 or 2) - Widget B
+/products/3 (depth 1 or 2) - Widget C
+
+The homepage includes both direct product links (for Scrapy spiders that look for /products/ links
+on the start page) and category links (for testing crawl depth with Crawlee crawlers).
+With max_crawl_depth=2, the crawler reaches all products and categories but does not go beyond /deep/2.
 """
-Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
-For example:
-http://localhost:8080/ contains links:
-http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
 
-http://localhost:8080/1 contains links:
-http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19
-
-... and so on.
-"""
+from __future__ import annotations
 
 import asyncio
 import logging
 from collections.abc import Awaitable, Callable, Coroutine
-from socket import socket
 from typing import Any
 
 from uvicorn import Config
 from uvicorn.server import Server
-from yarl import URL
 
 Receive = Callable[[], Awaitable[dict[str, Any]]]
 Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
 
+_PRODUCTS = {
+    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
+    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
+    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
+}
 
-async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
-    """Send an HTML response to the client."""
+
+async def _send_html(send: Send, html: str, status: int = 200) -> None:
     await send(
         {
             'type': 'http.response.start',
             'status': status,
             'headers': [[b'content-type', b'text/html; charset=utf-8']],
         }
     )
-    await send({'type': 'http.response.body', 'body': html_content})
-
+    await send({'type': 'http.response.body', 'body': html.encode()})
 
-async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
-    """Main ASGI application handler that routes requests to specific handlers.
 
-    Args:
-        scope: The ASGI connection scope.
-        _: The ASGI receive function.
-        send: The ASGI send function.
-    """
+async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     assert scope['type'] == 'http'
     path = scope['path']
 
-    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
-    await send_html_response(
-        send,
-        f"""\
-<html><head>
-<title>Title for {path} </title>
-</head>
-<body>
-{links}
-</body></html>""".encode(),
-    )
-
-
-class TestServer(Server):
-    """A test HTTP server implementation based on Uvicorn Server."""
-
-    @property
-    def url(self) -> URL:
-        """Get the base URL of the server.
-
-        Returns:
-            A URL instance with the server's base URL.
-        """
-        protocol = 'https' if self.config.is_ssl else 'http'
-        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')
-
-    async def serve(self, sockets: list[socket] | None = None) -> None:
-        """Run the server."""
-        if sockets:
-            raise RuntimeError('Simple TestServer does not support custom sockets')
-        self.restart_requested = asyncio.Event()
-
-        loop = asyncio.get_event_loop()
-        tasks = {
-            loop.create_task(super().serve()),
-        }
-        await asyncio.wait(tasks)
+    if path == '/':
+        await _send_html(
+            send,
+            '<html><head><title>E-commerce Test Store</title></head><body>'
+            '<h1>Welcome to Test Store</h1>'
+            '<a href="/products/1">Widget A</a>'
+            '<a href="/products/2">Widget B</a>'
+            '<a href="/products/3">Widget C</a>'
+            '<a href="/categories/electronics">Electronics</a>'
+            '<a href="/categories/home">Home &amp; Garden</a>'
+            '<a href="/about">About Us</a>'
+            '<a href="/deep/1">Explore More</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/electronics':
+        await _send_html(
+            send,
+            '<html><head><title>Electronics</title></head><body>'
+            '<h1>Electronics</h1>'
+            '<a href="/products/1">Widget A</a>'
+            '<a href="/products/2">Widget B</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path == '/categories/home':
+        await _send_html(
+            send,
+            '<html><head><title>Home &amp; Garden</title></head><body>'
+            '<h1>Home &amp; Garden</h1>'
+            '<a href="/products/3">Widget C</a>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/products/'):
+        product = _PRODUCTS.get(path.split('/')[-1])
+        if product:
+            await _send_html(
+                send,
+                f'<html><head><title>{product["name"]}</title></head><body>'
+                f'<h1>{product["name"]}</h1>'
+                f'<span class="price">{product["price"]}</span>'
+                f'<p class="description">{product["description"]}</p>'
+                f'<a href="/">Back to Home</a>'
+                f'</body></html>',
+            )
+        else:
+            await _send_html(send, '<html><body>Not Found</body></html>', 404)
+    elif path == '/about':
+        await _send_html(
+            send,
+            '<html><head><title>About Us</title></head><body>'
+            '<h1>About Test Store</h1>'
+            '<p class="description">We sell the best widgets in the world.</p>'
+            '<a href="/">Back to Home</a>'
+            '</body></html>',
+        )
+    elif path.startswith('/deep/'):
+        try:
+            n = int(path.split('/')[-1])
+        except ValueError:
+            await _send_html(send, '<html><body>Not Found</body></html>', 404)
+            return
+        await _send_html(
+            send,
+            f'<html><head><title>Deep Page {n}</title></head><body>'
+            f'<h1>Deep Page {n}</h1>'
+            f'<a href="/deep/{n + 1}">Go Deeper</a>'
+            f'<a href="/">Back to Home</a>'
+            f'</body></html>',
+        )
+    else:
+        await _send_html(send, '<html><body>Not Found</body></html>', 404)
 
 
 if __name__ == '__main__':
     asyncio.run(
-        TestServer(
+        Server(
             config=Config(
                 app=app,
                 lifespan='off',
```
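A quick way to sanity-check the site layout described in the docstring is the minimal sketch below. It is not part of the commit; it assumes `httpx` is installed and the server above is already running on port 8080.

```python
import asyncio

import httpx


async def check() -> None:
    async with httpx.AsyncClient(base_url='http://localhost:8080') as client:
        # Homepage exposes both direct product links and category links.
        home = await client.get('/')
        assert 'E-commerce Test Store' in home.text
        # Product pages carry the name, price, and description markup.
        product = await client.get('/products/1')
        assert 'Widget A' in product.text
        assert '$19.99' in product.text
        # Unknown products fall through to the 404 branch.
        missing = await client.get('/products/99')
        assert missing.status_code == 404


asyncio.run(check())
```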

tests/e2e/conftest.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -168,12 +168,9 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]:
         'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}'
     )
 
-    current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]])
-    integration_tests_python_version = (
-        os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version
-    )
+    python_version = f'{sys.version_info[0]}.{sys.version_info[1]}'
     source_files['Dockerfile'] = str(source_files['Dockerfile']).replace(
-        'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version
+        'BASE_IMAGE_VERSION_PLACEHOLDER', python_version
     )
 
     return source_files
@@ -190,6 +187,7 @@ def __call__(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> Awaitable[ActorClientAsync]:
         """Create a temporary Actor from the given main function or source files.
 
@@ -204,6 +202,7 @@ def __call__(
             main_py: The `src/main.py` file of the Actor.
             source_files: A dictionary of the source files of the Actor.
             additional_requirements: A list of additional requirements to be added to the `requirements.txt`.
+            memory_mbytes: The default memory allocation for the Actor run in MB.
 
         Returns:
             A resource client for the created Actor.
@@ -229,6 +228,7 @@ async def _make_actor(
         main_py: str | None = None,
         source_files: Mapping[str, str | bytes] | None = None,
         additional_requirements: list[str] | None = None,
+        memory_mbytes: int = 256,
     ) -> ActorClientAsync:
         if not (main_func or main_py or source_files):
             raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified')
@@ -298,7 +298,7 @@ async def _make_actor(
         created_actor = await client.actors().create(
             name=actor_name,
             default_run_build='latest',
-            default_run_memory_mbytes=256,
+            default_run_memory_mbytes=memory_mbytes,
             default_run_timeout_secs=600,
             versions=[
                 {
```
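The new `memory_mbytes` knob is mainly for the browser-based crawler tests, which need more than the 256 MB default. A hypothetical call site (the label and value here are illustrative, not taken from this diff):

```python
# Browser-based crawlers request extra memory; HTTP-only crawlers
# keep the 256 MB default by omitting the argument.
actor = await make_actor(
    'playwright-crawler',  # hypothetical Actor label
    source_files=source_files,
    memory_mbytes=2048,  # illustrative value for Chromium headroom
)
```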

tests/e2e/test_crawlee/__init__.py

Whitespace-only changes.
Lines changed: 40 additions & 0 deletions
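The `AdaptivePlaywrightCrawler` test Actor uses the Parsel static parser and extracts product data with CSS selectors: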
```python
from __future__ import annotations

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

from apify import Actor


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
            pages_visited.append(context.request.url)
            await context.enqueue_links()

            if '/products/' in context.request.url:
                name = context.parsed_content.css('h1::text').get('').strip()
                price = context.parsed_content.css('span.price::text').get('').strip()
                description = context.parsed_content.css('p.description::text').get('').strip()
                if name:
                    await context.push_data(
                        {
                            'url': context.request.url,
                            'name': name,
                            'price': price,
                            'description': description,
                        }
                    )

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'AdaptivePlaywrightCrawler',
            },
        )
```
Lines changed: 74 additions & 0 deletions
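The `BasicCrawler` variant has no built-in HTML parsing or link extraction, so its Actor brings a small `html.parser`-based extractor and enqueues discovered links manually via `add_requests`: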
```python
from __future__ import annotations

from html.parser import HTMLParser
from typing import TYPE_CHECKING

from crawlee.crawlers import BasicCrawler

from apify import Actor

if TYPE_CHECKING:
    from crawlee._types import BasicCrawlingContext


class _PageParser(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []
        self.data: dict[str, str] = {}
        self._in_tag: str | None = None
        self._in_class: str = ''

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attrs_dict = dict(attrs)
        if tag == 'a' and (href := attrs_dict.get('href')):
            self.links.append(href)
        self._in_tag = tag
        self._in_class = attrs_dict.get('class', '') or ''

    def handle_endtag(self, tag: str) -> None:  # noqa: ARG002
        self._in_tag = None
        self._in_class = ''

    def handle_data(self, data: str) -> None:
        text = data.strip()
        if not text:
            return
        if self._in_tag == 'h1':
            self.data['name'] = text
        elif self._in_tag == 'span' and self._in_class == 'price':
            self.data['price'] = text
        elif self._in_tag == 'p' and self._in_class == 'description':
            self.data['description'] = text


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = BasicCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            pages_visited.append(context.request.url)

            response = await context.send_request(context.request.url)
            html = (await response.read()).decode()

            parser = _PageParser()
            parser.feed(html)

            base_url = 'http://localhost:8080'
            await context.add_requests([f'{base_url}{link}' for link in parser.links if link.startswith('/')])

            if '/products/' in context.request.url and parser.data.get('name'):
                await context.push_data({'url': context.request.url, **parser.data})

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'BasicCrawler',
            },
        )
```
Lines changed: 40 additions & 0 deletions
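The `BeautifulSoupCrawler` variant relies on the built-in `enqueue_links` and queries the parsed soup directly: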
```python
from __future__ import annotations

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor


async def main() -> None:
    async with Actor:
        pages_visited: list[str] = []
        crawler = BeautifulSoupCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            pages_visited.append(context.request.url)
            await context.enqueue_links()

            if '/products/' in context.request.url:
                name_tag = context.soup.find('h1')
                price_tag = context.soup.find('span', class_='price')
                desc_tag = context.soup.find('p', class_='description')
                if name_tag:
                    await context.push_data(
                        {
                            'url': context.request.url,
                            'name': name_tag.get_text(strip=True),
                            'price': price_tag.get_text(strip=True) if price_tag else '',
                            'description': desc_tag.get_text(strip=True) if desc_tag else '',
                        }
                    )

        await crawler.run(['http://localhost:8080/'])

        await Actor.set_value(
            'CRAWLER_RESULT',
            {
                'pages_visited_count': len(pages_visited),
                'crawler_type': 'BeautifulSoupCrawler',
            },
        )
```
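Whichever crawler runs, the `_PRODUCTS` table in `server.py` pins down what should land in the dataset. A sketch of the expected items (the actual expectation constant in `conftest.py` is not shown in this diff, so names here are illustrative):

```python
# One dataset item per product page; URLs assume the local test server
# on port 8080, values mirror _PRODUCTS in server.py.
EXPECTED_ITEMS = [
    {
        'url': 'http://localhost:8080/products/1',
        'name': 'Widget A',
        'price': '$19.99',
        'description': 'A basic widget for everyday use',
    },
    {
        'url': 'http://localhost:8080/products/2',
        'name': 'Widget B',
        'price': '$29.99',
        'description': 'An advanced widget with extra features',
    },
    {
        'url': 'http://localhost:8080/products/3',
        'name': 'Widget C',
        'price': '$39.99',
        'description': 'A premium widget for professionals',
    },
]
```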
