crawl4ai实操7

import asyncio
import time

from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
    ContentRelevanceFilter,
    SEOFilter,
)
from crawl4ai.deep_crawling.scorers import (
    KeywordRelevanceScorer,
)


# 1️⃣ Basic Deep Crawl Setup
async def basic_deep_crawl():
    """
    PART 1: Minimal deep crawl using breadth-first search.

    Runs a single BFS deep crawl of the Crawl4AI docs site, buckets the
    returned pages by the ``depth`` value found in each result's metadata,
    and prints a short per-depth preview followed by the elapsed time.

    Nothing is returned; all output goes to stdout.
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    preview_limit = 3  # how many URLs to list per depth level

    # Breadth-first strategy: depth 0 is the start page, max_depth=2 allows
    # two further levels; include_external=False keeps the crawl on-site.
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # emit progress lines while crawling
    )

    async with AsyncWebCrawler() as crawler:
        started = time.perf_counter()
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=run_config)

        # depth -> list of URLs discovered at that depth (insertion order kept)
        pages_by_depth = {}
        for page in results:
            level = page.metadata.get("depth", 0)
            pages_by_depth.setdefault(level, []).append(page.url)

        print(f"✅ Crawled {len(results)} pages total")

        # Walk the levels shallowest-first and show a short sample of each
        for level in sorted(pages_by_depth):
            level_urls = pages_by_depth[level]
            print(f"\nDepth {level}: {len(level_urls)} pages")
            for sample in level_urls[:preview_limit]:
                print(f"  → {sample}")
            hidden = len(level_urls) - preview_limit
            if hidden > 0:
                print(f"  ... and {hidden} more")

        elapsed = time.perf_counter() - started
        print(f"\n✅ Performance: {len(results)} pages in {elapsed:.2f} seconds")

# 2️⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
    """
    PART 2: Compare non-streaming and streaming execution of one deep crawl.

    Both runs use clones of the same BFS configuration (max_depth=1, internal
    links only) and differ only in the ``stream`` flag:

    - Non-stream: the awaited ``crawler.arun(...)`` value is treated as a
      complete collection and measured with ``len``.
    - Stream: the awaited value is consumed with ``async for`` so each result
      is handled as it is produced; the time to the first result is recorded.

    Timings for both modes are printed to stdout; nothing is returned.
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=False,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n NON-STREAMING MODE:")
        print("  In this mode, all results are collected before being returned.")

        # Clone so that toggling ``stream`` never mutates the shared base config
        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f"  ✅ Received all {len(results)} results at once")
        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n STREAMING MODE:")
        print("  In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        first_result_time = None  # stays None if the stream yields nothing

        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f"  → Result #{result_count}: {result.url}")

        print(f"  ✅ Total: {result_count} results")
        # Formatting None with ':.2f' raises TypeError, so an empty stream
        # is reported explicitly instead of crashing the tutorial.
        if first_result_time is not None:
            print(f"  ✅ First result: {first_result_time:.2f} seconds")
        else:
            print("  ✅ First result: n/a (no results received)")
        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n Key Takeaway: Streaming allows processing results immediately")

# 3️⃣ Introduce Filters & Scorers
async def filters_and_scorers():
    """
    PART 3: Targeted crawling with filters and a scorer.

    Three crawls are run on one crawler instance:

    1. BFS with a single ``URLPatternFilter`` (``*core*``).
    2. BFS with a chain of URL-pattern, domain, and content-type filters
       against techcrunch.com.
    3. Best-first crawl driven by a ``KeywordRelevanceScorer``, consumed as a
       stream, printing the ``score`` metadata value of every result.

    All output goes to stdout; nothing is returned.
    """
    print("\n===== FILTERS AND SCORERS =====")

    async with AsyncWebCrawler() as crawler:
        # SINGLE FILTER EXAMPLE
        print("\n EXAMPLE 1: SINGLE URL PATTERN FILTER")
        print("  Only crawl pages containing 'core' in the URL")

        # Create a filter that only allows URLs with 'core' in them
        url_filter = URLPatternFilter(patterns=["*core*"])

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1,
                include_external=False,
                filter_chain=FilterChain([url_filter]),  # Single filter
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
        for result in results[:3]:  # Show first 3 results
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # MULTIPLE FILTERS EXAMPLE
        print("\n EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
        print("  Only crawl pages that:")
        print("  1. Contain '2024' in the URL")
        print("  2. Are from 'techcrunch.com'")
        print("  3. Are of text/html or application/javascript content type")

        # Create a chain of filters
        filter_chain = FilterChain(
            [
                URLPatternFilter(patterns=["*2024*"]),
                DomainFilter(
                    allowed_domains=["techcrunch.com"],
                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
                ),
                ContentTypeFilter(
                    allowed_types=["text/html", "application/javascript"]
                ),
            ]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, include_external=False, filter_chain=filter_chain
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
        )

        results = await crawler.arun(url="https://techcrunch.com", config=config)

        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
        for result in results[:3]:
            print(f"  → {result.url}")
        if len(results) > 3:
            print(f"  ... and {len(results) - 3} more")

        # SCORERS EXAMPLE
        print("\n EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
        print(
            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
        )

        # Create a keyword relevance scorer
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=1, include_external=False, url_scorer=keyword_scorer
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
            stream=True,  # results are consumed with ``async for`` below
        )

        results = []
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            # Default to 0 so a result without a "score" entry cannot make the
            # ':.2f' format below raise TypeError (matches the other sections).
            score = result.metadata.get("score", 0)
            print(f"  → Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
        print("   Note: BestFirstCrawlingStrategy visits highest-scoring pages first")

# 4️⃣ Advanced Filters
async def advanced_filters():
    """
    PART 4: Advanced filtering for specialized crawls.

    Two independent depth-1 BFS crawls of the Crawl4AI docs run on one
    crawler instance, each with a single-filter chain:

    1. ``SEOFilter`` with a 0.5 threshold and three keywords.
    2. ``ContentRelevanceFilter`` with a free-text query and a 0.7 threshold.

    The URLs of the returned pages are printed after each crawl (the second
    also prints the ``relevance_score`` metadata value, defaulting to 0).
    Nothing is returned.
    """
    print("\n===== ADVANCED FILTERS =====")

    docs_url = "https://docs.crawl4ai.com"

    async with AsyncWebCrawler() as crawler:
        # ---- Example 1: SEO filter ----
        print("\n EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        head_keyword_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )
        seo_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([head_keyword_filter])
        )
        seo_config = CrawlerRunConfig(
            deep_crawl_strategy=seo_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        seo_pages = await crawler.arun(url=docs_url, config=seo_config)

        print(f"  ✅ Found {len(seo_pages)} pages with relevant keywords")
        for page in seo_pages:
            print(f"  → {page.url}")

        # ---- Example 2: text relevancy filter ----
        print("\n EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        query_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )
        relevance_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([query_filter])
        )
        relevance_config = CrawlerRunConfig(
            deep_crawl_strategy=relevance_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        relevant_pages = await crawler.arun(url=docs_url, config=relevance_config)

        print(f"  ✅ Found {len(relevant_pages)} pages")
        for page in relevant_pages:
            rel = page.metadata.get("relevance_score", 0)
            print(f"  → Score: {rel:.2f} | {page.url}")

# 5️⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds():
    """
    PART 5: Page limits and score thresholds across crawl strategies.

    Three crawls share one ``KeywordRelevanceScorer``:

    1. BFS with ``max_pages=5``.
    2. DFS with ``score_threshold=0.7`` and ``max_pages=10``.
    3. Best-first with ``max_pages=7`` (no score threshold is configured),
       consumed as a stream.

    Each crawl prints the depth (and score where shown) of every result.
    Nothing is returned.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    # Single source of truth so the printed description cannot drift from the
    # value actually passed to the DFS strategy.
    dfs_score_threshold = 0.7

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print("  Limit the crawler to a maximum of 5 pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=5  # Only crawl 5 pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        print(f"  ✅ Crawled exactly {len(results)} pages as specified by max_pages")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print(f"  Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,
                max_pages=10
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f"  ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH A PAGE LIMIT
        # Only max_pages is passed to this strategy, so the messages below
        # describe a page limit and ordering, not a score cut-off.
        print("\n EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS")
        print("  Limit to 7 pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=7,          # Limit to 7 pages total
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f"  ✅ Crawled {len(results)} pages in best-first order")
        if results:  # avoid dividing by zero when the crawl produced nothing
            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
            print(f"  ✅ Average score: {avg_score:.2f}")
            print("   Note: BestFirstCrawlingStrategy visited highest-scoring pages first")

# 6️⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 6: Wrap-up — one crawl combining filters, a scorer, and streaming.

    Builds a filter chain (domain, URL pattern, content type) and a keyword
    scorer, runs a depth-1 best-first crawl of the Crawl4AI docs as a stream,
    and prints every result followed by a summary: page count, elapsed time,
    average score (when at least one page was crawled), and a per-depth count.

    Nothing is returned.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Keyword scorer used by the best-first strategy to rank URLs
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # An empty crawl would otherwise divide by zero when averaging.
    if results:
        average_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
        print(f"✅ Average score: {average_score:.2f}")
    else:
        print("No pages were crawled; skipping average score")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")


async def run_tutorial():
    """
    Run every tutorial part in order, framed by intro and outro banners.

    Each part is an argument-less coroutine function defined in this module;
    parts are awaited strictly one after another.
    """
    intro_lines = (
        "\n CRAWL4AI DEEP CRAWLING TUTORIAL ",
        "======================================",
        "This tutorial will walk you through deep crawling techniques,",
        "from basic to advanced, using the Crawl4AI library.",
    )
    for line in intro_lines:
        print(line)

    # Ordered pipeline of sections; trim this tuple to run a subset while
    # developing.
    pipeline = (
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        max_pages_and_thresholds,
        advanced_filters,
        wrap_up,
    )
    for part in pipeline:
        await part()

    outro_lines = (
        "\n TUTORIAL COMPLETE! ",
        "You now have a comprehensive understanding of deep crawling with Crawl4AI.",
        "For more information, check out https://docs.crawl4ai.com",
    )
    for line in outro_lines:
        print(line)

# Execute the tutorial when run directly
if __name__ == "__main__":
    # Script entry point: run the whole tutorial coroutine to completion.
    # Skipped when this module is imported rather than executed.
    asyncio.run(run_tutorial())
| ⏱: 0.08s
[COMPLETE] ● https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.24s
[FETCH]... ↓ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 2.56s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 2.59s
[FETCH]... ↓ https://docs.crawl4ai.com/core/examples
|| ⏱: 2.89s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/examples
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/examples
|| ⏱: 2.93s
[FETCH]... ↓ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.57s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.62s
[FETCH]... ↓ https://docs.crawl4ai.com/core/installation
|| ⏱: 2.31s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/installation
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/installation
|| ⏱: 2.33s
[FETCH]... ↓ https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.39s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/link-media
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.43s
[FETCH]... ↓ https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.45s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/local-files
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.47s
[FETCH]... ↓ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 2.09s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 2.11s
[FETCH]... ↓ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.72s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.75s
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 2.76s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 2.80s
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 2.42s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 2.45s
[FETCH]... ↓ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 3.15s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 3.19s
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 2.89s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 2.95s
[ERROR]... × https://old.docs.crawl4ai.com                      | Error: Unexpected 
error in _crawl_web at line 744 in _crawl_web
(D:\anaconda3\envs\crawl4ai-python311\Lib\site-packages\crawl4ai\async_crawler_strat
egy.py):
Error: Failed on navigating ACS-GOTO:
Page.goto: net::ERR_CONNECTION_CLOSED at https://old.docs.crawl4ai.com/
Call log:
  - navigating to "https://old.docs.crawl4ai.com/", waiting until "domcontentloaded"


Code context:
 739                       response = await page.goto(
 740                           url, wait_until=config.wait_until,
timeout=config.page_timeout
 741                       )
 742                       redirected_url = page.url
 743                   except Error as e:
 744 →                     raise RuntimeError(f"Failed on navigating
ACS-GOTO:\n{str(e)}")
 745
 746                   await self.execute_hook(
 747                       "after_goto", page, context=context, url=url,
response=response, config=config
 748                   )
 749
[FETCH]... ↓ https://docs.crawl4ai.com/blog/releases/0.4.0
|| ⏱: 1.14s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog/releases/0.4.0
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog/releases/0.4.0
|| ⏱: 1.16s
[FETCH]... ↓ https://docs.crawl4ai.com/blog/releases/0.4.2
|| ⏱: 1.46s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog/releases/0.4.2
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog/releases/0.4.2
|| ⏱: 1.48s
[FETCH]... ↓ https://docs.crawl4ai.com/deploy/docker/README.md
|| ⏱: 1.47s
[SCRAPE].. ◆ https://docs.crawl4ai.com/deploy/docker/README.md
|| ⏱: 0.00s
[COMPLETE] ● https://docs.crawl4ai.com/deploy/docker/README.md
|| ⏱: 1.48s
[FETCH]... ↓ https://docs.crawl4ai.com/blog/releases/0.6.0
|| ⏱: 2.05s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog/releases/0.6.0
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog/releases/0.6.0
|| ⏱: 2.07s
[FETCH]... ↓ https://docs.crawl4ai.com/blog/releases/0.5.0
|| ⏱: 2.32s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog/releases/0.5.0
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/blog/releases/0.5.0
|| ⏱: 2.36s
[FETCH]... ↓ https://docs.crawl4ai.com/blog/releases/0.4.1
|| ⏱: 2.41s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog/releases/0.4.1
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog/releases/0.4.1
|| ⏱: 2.43s
✅ Crawled 49 pages total

Depth 0: 1 pages
  → https://docs.crawl4ai.com

Depth 1: 42 pages
  → https://docs.crawl4ai.com
  → https://docs.crawl4ai.com/advanced/ssl-certificate
  → https://docs.crawl4ai.com/api/arun
  ... and 39 more

Depth 2: 6 pages
  → https://docs.crawl4ai.com/blog/releases/0.4.0
  → https://docs.crawl4ai.com/blog/releases/0.4.2
  → https://docs.crawl4ai.com/deploy/docker/README.md
  ... and 3 more

✅ Performance: 49 pages in 22.85 seconds

===== STREAM VS. NON-STREAM EXECUTION =====
[INIT].... → Crawl4AI 0.6.3 

 NON-STREAMING MODE:
  In this mode, all results are collected before being returned.
  ✅ Received all 43 results at once
  ✅ Total duration: 17.67 seconds

 STREAMING MODE:
  In this mode, results are processed as they become available.
  ✅ First result received after 6.64 seconds: https://docs.crawl4ai.com
  → Result #5: https://docs.crawl4ai.com/api/arun_many
  → Result #10: https://docs.crawl4ai.com/api/crawl-result
  → Result #15: https://docs.crawl4ai.com/advanced/lazy-loading
  → Result #20: https://docs.crawl4ai.com/advanced/multi-url-crawling
  → Result #25: https://docs.crawl4ai.com/core/examples
  → Result #30: https://docs.crawl4ai.com/core/markdown-generation
  → Result #35: https://docs.crawl4ai.com/core/llmtxt
  → Result #40: https://docs.crawl4ai.com/extraction/no-llm-strategies
  ✅ Total: 42 results
  ✅ First result: 6.64 seconds
  ✅ All results: 18.32 seconds

 Key Takeaway: Streaming allows processing results immediately

===== FILTERS AND SCORERS =====
[INIT].... → Crawl4AI 0.6.3 

 EXAMPLE 1: SINGLE URL PATTERN FILTER
  Only crawl pages containing 'core' in the URL
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 6.39s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 6.41s
[FETCH]... ↓ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.86s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.87s
[FETCH]... ↓ https://docs.crawl4ai.com/core/examples
|| ⏱: 1.29s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/examples
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/examples
|| ⏱: 1.34s
[FETCH]... ↓ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 1.55s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 1.58s
[FETCH]... ↓ https://docs.crawl4ai.com/core/cli
|| ⏱: 2.07s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cli
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/cli
|| ⏱: 2.11s
[FETCH]... ↓ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.40s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 0.07s
[COMPLETE] ● https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.48s
[FETCH]... ↓ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 2.64s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/core/content-selection
|| ⏱: 2.69s
[FETCH]... ↓ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 2.81s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 2.86s
[FETCH]... ↓ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 3.05s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 3.10s
[FETCH]... ↓ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 3.16s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 3.22s
[FETCH]... ↓ https://docs.crawl4ai.com/core/installation
|| ⏱: 2.72s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/installation
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/installation
|| ⏱: 2.75s
[FETCH]... ↓ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 2.81s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 2.84s
[FETCH]... ↓ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 2.94s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 2.98s
[FETCH]... ↓ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 3.00s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 3.02s
[FETCH]... ↓ https://docs.crawl4ai.com/core/link-media
|| ⏱: 3.10s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/link-media
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/link-media
|| ⏱: 3.14s
[FETCH]... ↓ https://docs.crawl4ai.com/core/local-files
|| ⏱: 3.15s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/local-files
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/local-files
|| ⏱: 3.18s
[FETCH]... ↓ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 1.95s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 1.99s
[FETCH]... ↓ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 2.27s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 2.29s
[FETCH]... ↓ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.66s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.70s
  ✅ Crawled 19 pages matching '*core*'
  → https://docs.crawl4ai.com
  → https://docs.crawl4ai.com/core/ask-ai
  → https://docs.crawl4ai.com/core/examples
  ... and 16 more

 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN
  Only crawl pages that:
  1. Contain '2024' in the URL
  2. Are from 'techcrunch.com'
  3. Are of text/html or application/javascript content type
[FETCH]... ↓ https://techcrunch.com
|| ⏱: 2.07s
[SCRAPE].. ◆ https://techcrunch.com
|| ⏱: 0.14s
[COMPLETE] ● https://techcrunch.com
|| ⏱: 2.22s
[FETCH]... ↓ https://techcrunch.com/2025/02/28/tech-layoffs-2024-list
|| ⏱: 1.50s
[SCRAPE].. ◆ https://techcrunch.com/2025/02/28/tech-layoffs-2024-list
|| ⏱: 0.07s
[COMPLETE] ● https://techcrunch.com/2025/02/28/tech-layoffs-2024-list
|| ⏱: 1.58s
  ✅ Crawled 2 pages after applying all filters
  → https://techcrunch.com
  → https://techcrunch.com/2025/02/28/tech-layoffs-2024-list

 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER
Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 7.39s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 7.41s
  → Score: 0.00 | https://docs.crawl4ai.com
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 1.26s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 1.29s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/advanced-features
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 1.18s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 1.20s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/lazy-loading
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 1.63s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 1.65s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/crawl-dispatcher
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 1.77s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 1.81s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/multi-url-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 2.20s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 2.22s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/proxy-security
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 2.23s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 2.26s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/network-console-capture
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 2.38s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 2.41s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/session-management
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 2.56s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 2.58s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/file-downloading
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 2.58s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 2.61s
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 2.63s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 2.66s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/identity-based-crawling        
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/hooks-auth
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 1.35s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 1.37s
  → Score: 0.17 | https://docs.crawl4ai.com/advanced/ssl-certificate
[FETCH]... ↓ https://docs.crawl4ai.com/blog
|| ⏱: 1.82s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog
|| ⏱: 1.85s
  → Score: 0.17 | https://docs.crawl4ai.com/blog
[FETCH]... ↓ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 1.91s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 1.94s
  → Score: 0.17 | https://docs.crawl4ai.com/core/cache-modes
[FETCH]... ↓ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 2.03s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 2.05s
  → Score: 0.17 | https://docs.crawl4ai.com/core/ask-ai
[FETCH]... ↓ https://docs.crawl4ai.com/api/strategies
|| ⏱: 2.06s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/strategies
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/api/strategies
|| ⏱: 2.10s
  → Score: 0.17 | https://docs.crawl4ai.com/api/strategies
[FETCH]... ↓ https://docs.crawl4ai.com/api/arun_many
|| ⏱: 2.58s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/arun_many
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/api/arun_many
|| ⏱: 2.61s
[FETCH]... ↓ https://docs.crawl4ai.com/api/arun
|| ⏱: 2.62s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/arun
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/api/arun
|| ⏱: 2.66s
  → Score: 0.17 | https://docs.crawl4ai.com/api/arun_many
  → Score: 0.17 | https://docs.crawl4ai.com/api/arun
[FETCH]... ↓ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 2.80s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 2.84s
  → Score: 0.17 | https://docs.crawl4ai.com/core/browser-crawler-config
[FETCH]... ↓ https://docs.crawl4ai.com/api/crawl-result
|| ⏱: 3.03s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/crawl-result
|| ⏱: 0.06s
[COMPLETE] ● https://docs.crawl4ai.com/api/crawl-result
|| ⏱: 3.10s
[FETCH]... ↓ https://docs.crawl4ai.com/api/parameters
|| ⏱: 3.10s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/parameters
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/api/parameters
|| ⏱: 3.15s
  → Score: 0.17 | https://docs.crawl4ai.com/api/crawl-result
  → Score: 0.17 | https://docs.crawl4ai.com/api/parameters
[FETCH]... ↓ https://docs.crawl4ai.com/core/cli
|| ⏱: 1.98s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cli
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/cli
|| ⏱: 2.01s
  → Score: 0.17 | https://docs.crawl4ai.com/core/cli
[FETCH]... ↓ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 1.76s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 1.80s
  → Score: 0.17 | https://docs.crawl4ai.com/core/fit-markdown
[FETCH]... ↓ https://docs.crawl4ai.com/core/installation
|| ⏱: 1.94s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/installation
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/installation
|| ⏱: 1.97s
  → Score: 0.17 | https://docs.crawl4ai.com/core/installation
[FETCH]... ↓ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 1.96s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 2.00s
[FETCH]... ↓ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 2.01s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 2.02s
  → Score: 0.17 | https://docs.crawl4ai.com/core/llmtxt
  → Score: 0.17 | https://docs.crawl4ai.com/core/crawler-result
[FETCH]... ↓ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 2.32s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/content-selection
|| ⏱: 2.36s
  → Score: 0.17 | https://docs.crawl4ai.com/core/content-selection
[FETCH]... ↓ https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.36s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/link-media
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.40s
  → Score: 0.17 | https://docs.crawl4ai.com/core/link-media
[FETCH]... ↓ https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.59s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/local-files
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.61s
  → Score: 0.17 | https://docs.crawl4ai.com/core/local-files
[FETCH]... ↓ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 2.85s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 2.90s
  → Score: 0.17 | https://docs.crawl4ai.com/core/deep-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 3.24s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 0.06s
[COMPLETE] ● https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 3.30s
  → Score: 0.17 | https://docs.crawl4ai.com/core/docker-deployment
[FETCH]... ↓ https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 1.21s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 1.24s
  → Score: 0.33 | https://docs.crawl4ai.com/api/async-webcrawler
[ERROR]... × https://old.docs.crawl4ai.com                      | Error: Unexpected error in _crawl_web at line 744 in _crawl_web (D:\anaconda3\envs\crawl4ai-python311\Lib\site-packages\crawl4ai\async_crawler_strategy.py):
Error: Failed on navigating ACS-GOTO:
Page.goto: net::ERR_CONNECTION_CLOSED at https://old.docs.crawl4ai.com/
Call log:
  - navigating to "https://old.docs.crawl4ai.com/", waiting until "domcontentloaded"


Code context:
 739                       response = await page.goto(
 740                           url, wait_until=config.wait_until, timeout=config.page_timeout
 741                       )
 742                       redirected_url = page.url
 743                   except Error as e:
 744 →                     raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
 745
 746                   await self.execute_hook(
 747                       "after_goto", page, context=context, url=url, response=response, config=config
 748                   )
 749
 749
  → Score: 0.17 | https://old.docs.crawl4ai.com
[FETCH]... ↓ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 1.60s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 1.62s
  → Score: 0.17 | https://docs.crawl4ai.com/core/simple-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/chunking
|| ⏱: 2.45s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/chunking
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/chunking
|| ⏱: 2.47s
  → Score: 0.17 | https://docs.crawl4ai.com/extraction/chunking
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 2.48s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/clustring-strategies
|| ⏱: 2.52s
  → Score: 0.17 | https://docs.crawl4ai.com/extraction/clustring-strategies
[FETCH]... ↓ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.69s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.73s
  → Score: 0.17 | https://docs.crawl4ai.com/core/page-interaction
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 2.84s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/llm-strategies
|| ⏱: 2.88s
  → Score: 0.17 | https://docs.crawl4ai.com/extraction/llm-strategies
[FETCH]... ↓ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.98s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/quickstart
|| ⏱: 3.02s
  → Score: 0.17 | https://docs.crawl4ai.com/core/quickstart
[FETCH]... ↓ https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 3.04s
[SCRAPE].. ◆ https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/extraction/no-llm-strategies
|| ⏱: 3.09s
  → Score: 0.17 | https://docs.crawl4ai.com/extraction/no-llm-strategies
[FETCH]... ↓ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 3.12s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 3.16s
  → Score: 0.17 | https://docs.crawl4ai.com/core/markdown-generation
[FETCH]... ↓ https://docs.crawl4ai.com/core/examples
|| ⏱: 1.28s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/examples
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/examples
|| ⏱: 1.32s
  → Score: 0.33 | https://docs.crawl4ai.com/core/examples
  ✅ Crawler prioritized 42 pages by relevance score
   Note: BestFirstCrawlingStrategy visits highest-scoring pages first

===== MAX PAGES AND SCORE THRESHOLDS =====
[INIT].... → Crawl4AI 0.6.3 

 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT
  Limit the crawler to a maximum of 5 pages
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 7.23s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 7.25s
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 0.69s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 0.72s
[FETCH]... ↓ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 1.26s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 1.30s
[FETCH]... ↓ https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 1.57s
[SCRAPE].. ◆ https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/api/async-webcrawler
|| ⏱: 1.60s
[FETCH]... ↓ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 1.65s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 1.68s
  ✅ Crawled exactly 5 pages as specified by max_pages
  → Depth: 0 | https://docs.crawl4ai.com
  → Depth: 1 | https://docs.crawl4ai.com
  → Depth: 1 | https://docs.crawl4ai.com/core/crawler-result
  → Depth: 1 | https://docs.crawl4ai.com/api/async-webcrawler
  → Depth: 1 | https://docs.crawl4ai.com/core/browser-crawler-config

 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD
  Only crawl pages with a relevance score above 0.5
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 0.53s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 0.55s
  ✅ Crawled 1 pages with scores above threshold
  → Depth: 0 | Score: 0.00 | https://docs.crawl4ai.com

 EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS
  Limit to 7 pages with scores above 0.3, prioritizing highest scores
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 10.89s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 10.92s
  → Depth: 0 | Score: 0.00 | https://docs.crawl4ai.com
[FETCH]... ↓ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.92s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.93s
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/ask-ai
[FETCH]... ↓ https://docs.crawl4ai.com/core/installation
|| ⏱: 1.09s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/installation
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/installation
|| ⏱: 1.11s
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/installation
[FETCH]... ↓ https://docs.crawl4ai.com/core/examples
|| ⏱: 1.76s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/examples
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/examples
|| ⏱: 1.80s
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/examples
[FETCH]... ↓ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 1.82s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/quickstart
|| ⏱: 1.85s
[FETCH]... ↓ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 1.87s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 1.88s
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/llmtxt
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/quickstart
[FETCH]... ↓ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.31s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 0.06s
[COMPLETE] ● https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.36s
  → Depth: 1 | Score: 0.00 | https://docs.crawl4ai.com/core/docker-deployment       
  ✅ Crawled 7 high-value pages with scores above 0.3
  ✅ Average score: 0.00
   Note: BestFirstCrawlingStrategy visited highest-scoring pages first

===== ADVANCED FILTERS =====
[INIT].... → Crawl4AI 0.6.3 

 EXAMPLE 1: SEO FILTERS
Quantitative SEO quality assessment filter based searching keywords in the head section
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 9.53s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 9.56s
  ✅ Found 1 pages with relevant keywords
  → https://docs.crawl4ai.com

 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 0.52s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 0.54s
  ✅ Found 1 pages
  → Score: 0.00 | https://docs.crawl4ai.com

===== COMPLETE CRAWLER EXAMPLE =====
Combining filters, scorers, and streaming for an optimized crawl
[INIT].... → Crawl4AI 0.6.3 
[FETCH]... ↓ https://docs.crawl4ai.com
|| ⏱: 6.00s
[SCRAPE].. ◆ https://docs.crawl4ai.com
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com
|| ⏱: 6.03s
→ Depth: 0 | Score: 0.00 | https://docs.crawl4ai.com
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 1.32s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/advanced-features
|| ⏱: 1.36s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/advanced-features
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 2.32s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/session-management
|| ⏱: 2.35s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/session-management    
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 2.76s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/network-console-capture
|| ⏱: 2.80s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/network-console-capture
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 2.91s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/lazy-loading
|| ⏱: 2.94s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/lazy-loading
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 3.00s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/identity-based-crawling
|| ⏱: 3.04s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/identity-based-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 3.19s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/file-downloading
|| ⏱: 3.21s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/file-downloading      
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 3.23s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/hooks-auth
|| ⏱: 3.26s
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 3.27s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/multi-url-crawling
|| ⏱: 3.31s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/hooks-auth
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/multi-url-crawling    
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 3.45s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/proxy-security
|| ⏱: 3.47s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/proxy-security        
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 3.94s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/crawl-dispatcher
|| ⏱: 3.96s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/crawl-dispatcher
[FETCH]... ↓ https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 1.19s
[SCRAPE].. ◆ https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/advanced/ssl-certificate
|| ⏱: 1.22s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/advanced/ssl-certificate       
[FETCH]... ↓ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 1.53s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/deep-crawling
|| ⏱: 1.57s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/deep-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 1.94s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 0.06s
[COMPLETE] ● https://docs.crawl4ai.com/core/docker-deployment
|| ⏱: 2.00s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/docker-deployment
[FETCH]... ↓ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 2.17s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/cache-modes
|| ⏱: 2.20s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/cache-modes
[FETCH]... ↓ https://docs.crawl4ai.com/blog
|| ⏱: 2.45s
[SCRAPE].. ◆ https://docs.crawl4ai.com/blog
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/blog
|| ⏱: 2.47s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/blog
[FETCH]... ↓ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 2.50s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/ask-ai
|| ⏱: 2.52s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/ask-ai
[FETCH]... ↓ https://docs.crawl4ai.com/core/cli
|| ⏱: 2.59s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/cli
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/cli
|| ⏱: 2.63s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/cli
[FETCH]... ↓ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 3.00s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/content-selection
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/content-selection
|| ⏱: 3.04s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/content-selection
[FETCH]... ↓ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 3.04s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/crawler-result
|| ⏱: 3.07s
[FETCH]... ↓ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 3.07s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/browser-crawler-config
|| ⏱: 3.11s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/browser-crawler-config
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/crawler-result
[FETCH]... ↓ https://docs.crawl4ai.com/core/examples
|| ⏱: 1.96s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/examples
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/examples
|| ⏱: 1.99s
→ Depth: 1 | Score: 0.35 | https://docs.crawl4ai.com/core/examples
[FETCH]... ↓ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 1.83s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/simple-crawling
|| ⏱: 1.86s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/simple-crawling
[FETCH]... ↓ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 1.87s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 0.01s
[COMPLETE] ● https://docs.crawl4ai.com/core/llmtxt
|| ⏱: 1.88s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/llmtxt
[FETCH]... ↓ https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.37s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/local-files
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/local-files
|| ⏱: 2.40s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/local-files
[FETCH]... ↓ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.53s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/quickstart
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/quickstart
|| ⏱: 2.58s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/quickstart
[FETCH]... ↓ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 2.63s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 0.05s
[COMPLETE] ● https://docs.crawl4ai.com/core/markdown-generation
|| ⏱: 2.69s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/markdown-generation       
[FETCH]... ↓ https://docs.crawl4ai.com/core/installation
|| ⏱: 2.71s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/installation
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/installation
|| ⏱: 2.73s
[FETCH]... ↓ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 2.74s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 0.02s
[COMPLETE] ● https://docs.crawl4ai.com/core/fit-markdown
|| ⏱: 2.77s
[FETCH]... ↓ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.78s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 0.03s
[COMPLETE] ● https://docs.crawl4ai.com/core/page-interaction
|| ⏱: 2.81s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/fit-markdown
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/page-interaction
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/installation
[FETCH]... ↓ https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.83s
[SCRAPE].. ◆ https://docs.crawl4ai.com/core/link-media
|| ⏱: 0.04s
[COMPLETE] ● https://docs.crawl4ai.com/core/link-media
|| ⏱: 2.87s
→ Depth: 1 | Score: 0.17 | https://docs.crawl4ai.com/core/link-media

✅ Crawled 31 high-value pages in 22.47 seconds
✅ Average score: 0.17

 Pages crawled by depth:
  Depth 0: 1 pages
  Depth 1: 30 pages

 TUTORIAL COMPLETE! 
You now have a comprehensive understanding of deep crawling with Crawl4AI.
For more information, check out https://docs.crawl4ai.com
PS E:\AI-lab\n8n> 

你可能感兴趣的:(网络爬虫,python,开发语言)