Switch the Apify engine to the website-content-crawler actor (markdown output)

* engines/apify_api.py: call "apify/website-content-crawler" instead of
  "apify/web-scraper"; read the "markdown" field and crawl.httpStatusCode
  from the first dataset item instead of the pageFunction's html/status_code,
  and return the result with format="markdown".
* README.md, runs/results/apify_api_quality.json: record the new Apify scores.

Review notes:
* `items[0].get("crawl") or {}` keeps a missing/None "crawl" entry from
  raising AttributeError; the broad `except` would otherwise report a failed
  scrape with content_size left at 0 even though markdown was retrieved.
* README F1 for Apify is 0.51, i.e. avg_f1 (0.508...) from the results JSON
  rounded to two decimals like the other rows.
* NOTE(review): the `status_code = 500` -/+ pair differs only in whitespace
  (presumably trailing whitespace on one side) -- confirm against the file.

diff --git a/README.md b/README.md
index d971b91..95d97ae 100644
--- a/README.md
+++ b/README.md
@@ -8,21 +8,21 @@ This framework supports APIs for Firecrawl, Apify, ScraperAPI, ScrapingBee, Zyte
 
 Below are evaluation results across different engines.
 
-| Engine          | Coverage (Success Rate) (%) | Quality (F1) |
-|-----------------|-----------------------------|--------------|
-| Firecrawl       | 80.9                        | 0.68         |
-| Exa             | 76.3                        | 0.53         |
-| Tavily          | 67.6                        | 0.50         |
-| ScraperAPI      | 63.5                        | 0.45         |
-| Zyte            | 62.9                        | 0.47         |
-| ScrapingBee     | 60.6                        | 0.45         |
-| Apify           | 60.2                        | 0.42         |
-| Crawl4ai        | 58.0                        | 0.45         |
-| Selenium        | 55.0                        | 0.40         |
-| Scrapy          | 54.0                        | 0.43         |
-| Puppeteer       | 53.7                        | 0.41         |
-| Rest (requests) | 50.6                        | 0.36         |
-| Playwright      | 39.5                        | 0.34         |
+| Engine          | Coverage (Success Rate) (%)  | Quality (F1) |
+|-----------------|------------------------------|--------------|
+| Firecrawl       | 80.9                         | 0.68         |
+| Exa             | 76.3                         | 0.53         |
+| Apify           | 75.8                         | 0.51         |
+| Tavily          | 67.6                         | 0.50         |
+| ScraperAPI      | 63.5                         | 0.45         |
+| Zyte            | 62.9                         | 0.47         |
+| ScrapingBee     | 60.6                         | 0.45         |
+| Crawl4ai        | 58.0                         | 0.45         |
+| Selenium        | 55.0                         | 0.40         |
+| Scrapy          | 54.0                         | 0.43         |
+| Puppeteer       | 53.7                         | 0.41         |
+| Rest (requests) | 50.6                         | 0.36         |
+| Playwright      | 39.5                         | 0.34         |
 
 ## Install
 
diff --git a/engines/apify_api.py b/engines/apify_api.py
index 1bcf6f9..46e95db 100644
--- a/engines/apify_api.py
+++ b/engines/apify_api.py
@@ -24,33 +24,28 @@ def __init__(self):
         if not self.api_token:
             raise RuntimeError("APIFY_API_TOKEN environment variable not set.")
         self.client = ApifyClient(self.api_token)
-        self.actor_id = "apify/web-scraper"
+        self.actor_id = "apify/website-content-crawler"
 
     def scrape(self, url: str, run_id: str) -> ScrapeResult:
         error = None
-        html = ""
+        markdown = ""
         content_size = 0
-        status_code = 500
+        status_code = 500
         try:
             # Start the actor and wait for it to finish
             actor_client = self.client.actor(self.actor_id)
             run_result = actor_client.call(
                 run_input={
                     "startUrls": [{"url": url}],
-                    "maxRequestsPerCrawl": 1,
-                    "pseudoUrls": [],
-                    "linkSelector": "",
-                    "proxyConfiguration": {"useApifyProxy": True},
-                    "crawlerType": "chrome",
-                    "pageFunction": """
-                        async function(context) {
-                            const $ = context.jQuery;
-                            return {
-                                html: $('body').html(),
-                                status_code: context.response ? context.response.status : null
-                            };
-                        }
-                    """
+                    "crawlerType": "playwright:adaptive",
+                    "maxCrawlPages": 1,
+                    "saveFiles": False,
+                    "saveHtml": False,
+                    "saveHtmlAsFile": False,
+                    "saveMarkdown": True,
+                    "saveScreenshots": False,
+                    "signHttpRequests": False,
+                    "proxyConfiguration": {"useApifyProxy": True}
                 },
                 timeout_secs=120  # Wait up to 2 minutes
             )
@@ -60,12 +55,13 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
             dataset_id = run_result["defaultDatasetId"]
             dataset_client = self.client.dataset(dataset_id)
             items = dataset_client.list_items().items
-            if items and "html" in items[0]:
-                html = items[0]["html"] or ""
-                status_code = items[0].get("status_code")
-                content_size = len(html.encode("utf-8")) if html else 0
+            if items and "markdown" in items[0]:
+                markdown = items[0]["markdown"] or ""
+                crawl_data = items[0].get("crawl") or {}  # "crawl" may be missing or None
+                status_code = crawl_data.get("httpStatusCode")
+                content_size = len(markdown.encode("utf-8")) if markdown else 0
             else:
-                error = "No HTML found in Apify dataset result."
+                error = "No markdown found in Apify dataset result."
 
         except Exception as e:
             error = str(e)
@@ -76,7 +72,7 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
             status_code=status_code or 500,
             error=error,
             content_size=content_size,
-            format="html",
+            format="markdown",
             created_at=datetime.now().isoformat(),
-            content=html,
+            content=markdown,
         )
diff --git a/runs/results/apify_api_quality.json b/runs/results/apify_api_quality.json
index 0b980c9..e01669e 100644
--- a/runs/results/apify_api_quality.json
+++ b/runs/results/apify_api_quality.json
@@ -1,6 +1,6 @@
 {
-  "avg_recall": 0.4088987780290431,
-  "avg_precision": 0.4309147557081136,
-  "avg_f1": 0.4166200898332274,
-  "success_rate": 0.6021505376344086
-}
\ No newline at end of file
+  "success_rate": 0.758,
+  "avg_recall": 0.490671096073996,
+  "avg_precision": 0.5579099299255283,
+  "avg_f1": 0.5082330459356168
+}