Commit 66175e1

Merge pull request unclecode#1590 from unclecode/fix/async-llm-extraction-arunMany
This commit resolves issue unclecode#1055, where LLM extraction was blocking the async event loop during arun_many().
2 parents 2c91815 + a30548a commit 66175e1

File tree

4 files changed: +492 -1 lines changed

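For context, the user-visible effect: with an LLMExtractionStrategy attached, arun_many() previously serialized on the synchronous LLM calls. Below is a minimal usage sketch of the now-concurrent path; the provider string, token handling, URLs, and instruction are placeholders, not part of this commit.

    import asyncio, os
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    async def main():
        strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider="openai/gpt-4o-mini",            # placeholder provider
                api_token=os.getenv("OPENAI_API_KEY"),
            ),
            instruction="Summarize each page as JSON blocks.",  # placeholder
        )
        config = CrawlerRunConfig(extraction_strategy=strategy)
        async with AsyncWebCrawler() as crawler:
            # With this fix, extraction awaits arun() instead of calling the
            # blocking run(), so these crawls can overlap on the event loop.
            results = await crawler.arun_many(
                urls=["https://example.com/a", "https://example.com/b"],
                config=config,
            )
            for r in results:
                print(r.url, (r.extracted_content or "")[:120])

    asyncio.run(main())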

crawl4ai/async_webcrawler.py

Lines changed: 11 additions & 1 deletion
@@ -617,7 +617,17 @@ async def aprocess_html(
                 else config.chunking_strategy
             )
             sections = chunking.chunk(content)
-            extracted_content = config.extraction_strategy.run(url, sections)
+            # extracted_content = config.extraction_strategy.run(url, sections)
+
+            # Use async version if available for better parallelism
+            if hasattr(config.extraction_strategy, 'arun'):
+                extracted_content = await config.extraction_strategy.arun(url, sections)
+            else:
+                # Fallback to sync version run in thread pool to avoid blocking
+                extracted_content = await asyncio.to_thread(
+                    config.extraction_strategy.run, url, sections
+                )
+
             extracted_content = json.dumps(
                 extracted_content, indent=4, default=str, ensure_ascii=False
             )
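The fallback branch relies on asyncio.to_thread, which runs a sync callable on a worker thread and awaits it, keeping the event loop responsive. A self-contained sketch of that behavior (blocking_extract is a stand-in for a sync extraction strategy, not library code):

    import asyncio
    import time

    def blocking_extract(url: str, sections: list) -> list:
        time.sleep(1)  # stands in for a slow, synchronous LLM call
        return [{"url": url, "sections": len(sections)}]

    async def main():
        start = time.perf_counter()
        # Both calls overlap because each sync call runs on a worker thread,
        # leaving the event loop free; total time is ~1s rather than ~2s.
        results = await asyncio.gather(
            asyncio.to_thread(blocking_extract, "https://a.example", ["s1"]),
            asyncio.to_thread(blocking_extract, "https://b.example", ["s2", "s3"]),
        )
        print(results, f"{time.perf_counter() - start:.1f}s")

    asyncio.run(main())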

crawl4ai/extraction_strategy.py

Lines changed: 185 additions & 0 deletions
@@ -94,6 +94,20 @@ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
             extracted_content.extend(future.result())
         return extracted_content

+    async def arun(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Async version: Process sections of text in parallel using asyncio.
+
+        Default implementation runs the sync version in a thread pool.
+        Subclasses can override this for true async processing.
+
+        :param url: The URL of the webpage.
+        :param sections: List of sections (strings) to process.
+        :return: A list of processed JSON blocks.
+        """
+        import asyncio
+        return await asyncio.to_thread(self.run, url, sections, *q, **kwargs)
+

 class NoExtractionStrategy(ExtractionStrategy):
     """
@@ -780,6 +794,177 @@ def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:

         return extracted_content

+    async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
+        """
+        Async version: Extract meaningful blocks or chunks from the given HTML using an LLM.
+
+        How it works:
+        1. Construct a prompt with variables.
+        2. Make an async request to the LLM using the prompt.
+        3. Parse the response and extract blocks or chunks.
+
+        Args:
+            url: The URL of the webpage.
+            ix: Index of the block.
+            html: The HTML content of the webpage.
+
+        Returns:
+            A list of extracted blocks or chunks.
+        """
+        from .utils import aperform_completion_with_backoff
+
+        if self.verbose:
+            print(f"[LOG] Call LLM for {url} - block index: {ix}")
+
+        variable_values = {
+            "URL": url,
+            "HTML": escape_json_string(sanitize_html(html)),
+        }
+
+        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+        if self.instruction:
+            variable_values["REQUEST"] = self.instruction
+            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
+
+        if self.extract_type == "schema" and self.schema:
+            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
+            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
+
+        if self.extract_type == "schema" and not self.schema:
+            prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
+
+        for variable in variable_values:
+            prompt_with_variables = prompt_with_variables.replace(
+                "{" + variable + "}", variable_values[variable]
+            )
+
+        try:
+            response = await aperform_completion_with_backoff(
+                self.llm_config.provider,
+                prompt_with_variables,
+                self.llm_config.api_token,
+                base_url=self.llm_config.base_url,
+                json_response=self.force_json_response,
+                extra_args=self.extra_args,
+            )
+            # Track usage
+            usage = TokenUsage(
+                completion_tokens=response.usage.completion_tokens,
+                prompt_tokens=response.usage.prompt_tokens,
+                total_tokens=response.usage.total_tokens,
+                completion_tokens_details=response.usage.completion_tokens_details.__dict__
+                if response.usage.completion_tokens_details
+                else {},
+                prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+                if response.usage.prompt_tokens_details
+                else {},
+            )
+            self.usages.append(usage)
+
+            # Update totals
+            self.total_usage.completion_tokens += usage.completion_tokens
+            self.total_usage.prompt_tokens += usage.prompt_tokens
+            self.total_usage.total_tokens += usage.total_tokens
+
+            try:
+                content = response.choices[0].message.content
+                blocks = None
+
+                if self.force_json_response:
+                    blocks = json.loads(content)
+                    if isinstance(blocks, dict):
+                        if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
+                            blocks = list(blocks.values())[0]
+                        else:
+                            blocks = [blocks]
+                    elif isinstance(blocks, list):
+                        blocks = blocks
+                else:
+                    blocks = extract_xml_data(["blocks"], content)["blocks"]
+                    blocks = json.loads(blocks)
+
+                for block in blocks:
+                    block["error"] = False
+            except Exception:
+                parsed, unparsed = split_and_parse_json_objects(
+                    response.choices[0].message.content
+                )
+                blocks = parsed
+                if unparsed:
+                    blocks.append(
+                        {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+                    )
+
+            if self.verbose:
+                print(
+                    "[LOG] Extracted",
+                    len(blocks),
+                    "blocks from URL:",
+                    url,
+                    "block index:",
+                    ix,
+                )
+            return blocks
+        except Exception as e:
+            if self.verbose:
+                print(f"[LOG] Error in LLM extraction: {e}")
+            return [
+                {
+                    "index": ix,
+                    "error": True,
+                    "tags": ["error"],
+                    "content": str(e),
+                }
+            ]
+
+    async def arun(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
+        """
+        Async version: Process sections with true parallelism using asyncio.gather.
+
+        Args:
+            url: The URL of the webpage.
+            sections: List of sections (strings) to process.
+
+        Returns:
+            A list of extracted blocks or chunks.
+        """
+        import asyncio
+
+        merged_sections = self._merge(
+            sections,
+            self.chunk_token_threshold,
+            overlap=int(self.chunk_token_threshold * self.overlap_rate),
+        )
+
+        extracted_content = []
+
+        # Create tasks for all sections to run in parallel
+        tasks = [
+            self.aextract(url, ix, sanitize_input_encode(section))
+            for ix, section in enumerate(merged_sections)
+        ]
+
+        # Execute all tasks concurrently
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Process results
+        for result in results:
+            if isinstance(result, Exception):
+                if self.verbose:
+                    print(f"Error in async extraction: {result}")
+                extracted_content.append(
+                    {
+                        "index": 0,
+                        "error": True,
+                        "tags": ["error"],
+                        "content": str(result),
+                    }
+                )
+            else:
+                extracted_content.extend(result)
+
+        return extracted_content
+
     def show_usage(self) -> None:
         """Print a detailed token usage report showing total and per-request usage."""
         print("\n=== Token Usage Summary ===")
crawl4ai/utils.py

Lines changed: 76 additions & 0 deletions
@@ -1825,6 +1825,82 @@ def perform_completion_with_backoff(
 # ]


+async def aperform_completion_with_backoff(
+    provider,
+    prompt_with_variables,
+    api_token,
+    json_response=False,
+    base_url=None,
+    **kwargs,
+):
+    """
+    Async version: Perform an API completion request with exponential backoff.
+
+    How it works:
+    1. Sends an async completion request to the API.
+    2. Retries on rate-limit errors with exponential delays (async).
+    3. Returns the API response, or raises after all retries are exhausted.
+
+    Args:
+        provider (str): The name of the API provider.
+        prompt_with_variables (str): The input prompt for the completion request.
+        api_token (str): The API token for authentication.
+        json_response (bool): Whether to request a JSON response. Defaults to False.
+        base_url (Optional[str]): The base URL for the API. Defaults to None.
+        **kwargs: Additional arguments for the API request.
+
+    Returns:
+        The API response on success; RateLimitError is re-raised once all
+        retries fail, and any other exception propagates immediately.
+    """
+    from litellm import acompletion
+    from litellm.exceptions import RateLimitError
+    import asyncio
+
+    max_attempts = 3
+    base_delay = 2  # Base delay in seconds; adjust as needed
+
+    extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
+    if json_response:
+        extra_args["response_format"] = {"type": "json_object"}
+
+    if kwargs.get("extra_args"):
+        extra_args.update(kwargs["extra_args"])
+
+    for attempt in range(max_attempts):
+        try:
+            response = await acompletion(
+                model=provider,
+                messages=[{"role": "user", "content": prompt_with_variables}],
+                **extra_args,
+            )
+            return response  # Return the successful response
+        except RateLimitError as e:
+            print("Rate limit error:", str(e))
+
+            if attempt == max_attempts - 1:
+                # Last attempt failed, re-raise the error.
+                raise
+
+            # Calculate the exponential-backoff delay and wait before retrying
+            delay = base_delay * (2**attempt)
+            print(f"Waiting for {delay} seconds before retrying...")
+            await asyncio.sleep(delay)
+        except Exception as e:
+            raise e  # Raise any other exceptions immediately
+
+
 def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None, base_url=None):
     """
     Extract content blocks from website HTML using an AI provider.
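With max_attempts = 3 and base_delay = 2, a persistently rate-limited call waits 2 s after the first failure and 4 s after the second (base_delay * 2**attempt), then re-raises on the third, so at most ~6 s of backoff precede the final error. The same retry loop in a reusable form (with_backoff is an illustrative helper, not part of crawl4ai):

    import asyncio
    from typing import Awaitable, Callable, Type, TypeVar

    T = TypeVar("T")

    async def with_backoff(
        make_call: Callable[[], Awaitable[T]],
        retry_on: Type[BaseException],
        max_attempts: int = 3,
        base_delay: float = 2.0,
    ) -> T:
        # Retry only the designated exception; delays are 2s, 4s, ... then re-raise.
        for attempt in range(max_attempts):
            try:
                return await make_call()
            except retry_on:
                if attempt == max_attempts - 1:
                    raise
                await asyncio.sleep(base_delay * (2 ** attempt))
        raise RuntimeError("unreachable")  # the loop always returns or raises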
