@@ -94,6 +94,20 @@ def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any
9494 extracted_content .extend (future .result ())
9595 return extracted_content
9696
97+ async def arun (self , url : str , sections : List [str ], * q , ** kwargs ) -> List [Dict [str , Any ]]:
98+ """
99+ Async version: Process sections of text in parallel using asyncio.
100+
101+ Default implementation runs the sync version in a thread pool.
102+ Subclasses can override this for true async processing.
103+
104+ :param url: The URL of the webpage.
105+ :param sections: List of sections (strings) to process.
106+ :return: A list of processed JSON blocks.
107+ """
108+ import asyncio
109+ return await asyncio .to_thread (self .run , url , sections , * q , ** kwargs )
110+
97111
98112class NoExtractionStrategy (ExtractionStrategy ):
99113 """
@@ -780,6 +794,177 @@ def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
780794
781795 return extracted_content
782796
797+ async def aextract (self , url : str , ix : int , html : str ) -> List [Dict [str , Any ]]:
798+ """
799+ Async version: Extract meaningful blocks or chunks from the given HTML using an LLM.
800+
801+ How it works:
802+ 1. Construct a prompt with variables.
803+ 2. Make an async request to the LLM using the prompt.
804+ 3. Parse the response and extract blocks or chunks.
805+
806+ Args:
807+ url: The URL of the webpage.
808+ ix: Index of the block.
809+ html: The HTML content of the webpage.
810+
811+ Returns:
812+ A list of extracted blocks or chunks.
813+ """
814+ from .utils import aperform_completion_with_backoff
815+
816+ if self .verbose :
817+ print (f"[LOG] Call LLM for { url } - block index: { ix } " )
818+
819+ variable_values = {
820+ "URL" : url ,
821+ "HTML" : escape_json_string (sanitize_html (html )),
822+ }
823+
824+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
825+ if self .instruction :
826+ variable_values ["REQUEST" ] = self .instruction
827+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
828+
829+ if self .extract_type == "schema" and self .schema :
830+ variable_values ["SCHEMA" ] = json .dumps (self .schema , indent = 2 )
831+ prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
832+
833+ if self .extract_type == "schema" and not self .schema :
834+ prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
835+
836+ for variable in variable_values :
837+ prompt_with_variables = prompt_with_variables .replace (
838+ "{" + variable + "}" , variable_values [variable ]
839+ )
840+
841+ try :
842+ response = await aperform_completion_with_backoff (
843+ self .llm_config .provider ,
844+ prompt_with_variables ,
845+ self .llm_config .api_token ,
846+ base_url = self .llm_config .base_url ,
847+ json_response = self .force_json_response ,
848+ extra_args = self .extra_args ,
849+ )
850+ # Track usage
851+ usage = TokenUsage (
852+ completion_tokens = response .usage .completion_tokens ,
853+ prompt_tokens = response .usage .prompt_tokens ,
854+ total_tokens = response .usage .total_tokens ,
855+ completion_tokens_details = response .usage .completion_tokens_details .__dict__
856+ if response .usage .completion_tokens_details
857+ else {},
858+ prompt_tokens_details = response .usage .prompt_tokens_details .__dict__
859+ if response .usage .prompt_tokens_details
860+ else {},
861+ )
862+ self .usages .append (usage )
863+
864+ # Update totals
865+ self .total_usage .completion_tokens += usage .completion_tokens
866+ self .total_usage .prompt_tokens += usage .prompt_tokens
867+ self .total_usage .total_tokens += usage .total_tokens
868+
869+ try :
870+ content = response .choices [0 ].message .content
871+ blocks = None
872+
873+ if self .force_json_response :
874+ blocks = json .loads (content )
875+ if isinstance (blocks , dict ):
876+ if len (blocks ) == 1 and isinstance (list (blocks .values ())[0 ], list ):
877+ blocks = list (blocks .values ())[0 ]
878+ else :
879+ blocks = [blocks ]
880+ elif isinstance (blocks , list ):
881+ blocks = blocks
882+ else :
883+ blocks = extract_xml_data (["blocks" ], content )["blocks" ]
884+ blocks = json .loads (blocks )
885+
886+ for block in blocks :
887+ block ["error" ] = False
888+ except Exception :
889+ parsed , unparsed = split_and_parse_json_objects (
890+ response .choices [0 ].message .content
891+ )
892+ blocks = parsed
893+ if unparsed :
894+ blocks .append (
895+ {"index" : 0 , "error" : True , "tags" : ["error" ], "content" : unparsed }
896+ )
897+
898+ if self .verbose :
899+ print (
900+ "[LOG] Extracted" ,
901+ len (blocks ),
902+ "blocks from URL:" ,
903+ url ,
904+ "block index:" ,
905+ ix ,
906+ )
907+ return blocks
908+ except Exception as e :
909+ if self .verbose :
910+ print (f"[LOG] Error in LLM extraction: { e } " )
911+ return [
912+ {
913+ "index" : ix ,
914+ "error" : True ,
915+ "tags" : ["error" ],
916+ "content" : str (e ),
917+ }
918+ ]
919+
920+ async def arun (self , url : str , sections : List [str ]) -> List [Dict [str , Any ]]:
921+ """
922+ Async version: Process sections with true parallelism using asyncio.gather.
923+
924+ Args:
925+ url: The URL of the webpage.
926+ sections: List of sections (strings) to process.
927+
928+ Returns:
929+ A list of extracted blocks or chunks.
930+ """
931+ import asyncio
932+
933+ merged_sections = self ._merge (
934+ sections ,
935+ self .chunk_token_threshold ,
936+ overlap = int (self .chunk_token_threshold * self .overlap_rate ),
937+ )
938+
939+ extracted_content = []
940+
941+ # Create tasks for all sections to run in parallel
942+ tasks = [
943+ self .aextract (url , ix , sanitize_input_encode (section ))
944+ for ix , section in enumerate (merged_sections )
945+ ]
946+
947+ # Execute all tasks concurrently
948+ results = await asyncio .gather (* tasks , return_exceptions = True )
949+
950+ # Process results
951+ for result in results :
952+ if isinstance (result , Exception ):
953+ if self .verbose :
954+ print (f"Error in async extraction: { result } " )
955+ extracted_content .append (
956+ {
957+ "index" : 0 ,
958+ "error" : True ,
959+ "tags" : ["error" ],
960+ "content" : str (result ),
961+ }
962+ )
963+ else :
964+ extracted_content .extend (result )
965+
966+ return extracted_content
967+
783968 def show_usage (self ) -> None :
784969 """Print a detailed token usage report showing total and per-request usage."""
785970 print ("\n === Token Usage Summary ===" )
0 commit comments