 import gc
 import os
+
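+# Silence the HF tokenizers fork-parallelism warning; must be set before tokenizers loads.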
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+
 import time
+import json
 from copy import deepcopy
 from pathlib import Path
 from typing import Optional
 from constants import VECTOR_MODELS
 
 
-def _flatten_to_text(x):
-    if x is None:
-        return ""
-    if isinstance(x, str):
-        return x
-    if isinstance(x, bytes):
-        try:
-            return x.decode("utf-8", "ignore")
-        except Exception:
-            return ""
-    if isinstance(x, (bool, int, float)):
-        return str(x)
-    if hasattr(x, '__iter__') and not isinstance(x, (str, bytes)):
-        parts = []
-        try:
-            for item in x:
-                s = _flatten_to_text(item)
-                if s:
-                    parts.append(s)
-            return " ".join(parts)
-        except Exception:
-            return str(x)
-    try:
-        return str(x)
-    except Exception:
-        return ""
-
 class BaseEmbeddingModel:
 
     def __init__(self, model_name, model_kwargs, encode_kwargs, is_query: bool = False):
@@ -70,7 +47,7 @@ def prepare_kwargs(self):
         tok_kw = hf_embed_kw.setdefault("tokenizer_kwargs", {})
         tok_kw.update({
             "trust_remote_code": True,
-            "use_fast": True,
50+ "use_fast" : False ,
7451 "model_max_length" : 512 ,
7552 })
7653
@@ -95,10 +72,10 @@ def prepare_encode_kwargs(self):
         encode_kwargs.setdefault("convert_to_tensor", False)
         encode_kwargs.setdefault("show_progress_bar", not self.is_query)
 
-        params_to_remove = ['model_max_length', 'return_token_type_ids', 'show_progress_bar',
-                            'padding', 'truncation', 'max_length']
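+        # Pad and truncate by default so mixed-length batches encode without errors.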
+        encode_kwargs.setdefault("padding", True)
+        encode_kwargs.setdefault("truncation", True)
 
-        for param in params_to_remove:
+        for param in ['model_max_length', 'return_token_type_ids', 'show_progress_bar']:
             encode_kwargs.pop(param, None)
 
         return encode_kwargs
@@ -360,29 +337,48 @@ def create_database(self, texts, embeddings):
         chunk_counters = defaultdict(int)
         skipped_chunks = 0
 
-        for idx, doc in enumerate(texts):
+        for original_idx, doc in enumerate(texts):
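+            # Predeclared so the except block below can reference them when building a preview.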
+            raw_content = None
+            clean_content = None
+
             try:
                 if hasattr(doc, 'page_content'):
                     raw_content = doc.page_content
                 else:
                     raw_content = doc
-
+
                 if raw_content is None:
+                    print(f"Skipping chunk {original_idx}: no content")
+                    skipped_chunks += 1
+                    continue
+
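+                # Skip containers outright instead of embedding their repr.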
+                if isinstance(raw_content, (list, tuple, dict)):
+                    print(f"Skipping chunk {original_idx}: invalid type {type(raw_content)}")
                     skipped_chunks += 1
                     continue
 
                 if not isinstance(raw_content, str):
-                    raw_content = str(raw_content)
+                    try:
+                        raw_content = str(raw_content)
+                    except Exception as e:
+                        print(f"Skipping chunk {original_idx}: cannot coerce {type(raw_content)} to str ({e})")
+                        skipped_chunks += 1
+                        continue
 
                 clean_content = raw_content.replace('\x00', ' ')
+                clean_content = ' '.join(clean_content.split())
 
-                words = clean_content.split()
-                clean_content = ' '.join(words)
+                if not clean_content or not clean_content.strip():
+                    skipped_chunks += 1
+                    continue
 
-                if not clean_content:
+                if isinstance(clean_content, (list, tuple, dict)):
+                    print(f"Skipping chunk {original_idx}: invalid type after clean {type(clean_content)}")
                     skipped_chunks += 1
                     continue
 
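+                # Probe UTF-8 encodability; lone surrogates raise here and are caught below.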
+                clean_content.encode('utf-8')
+
                 file_hash = doc.metadata.get('hash') if hasattr(doc, 'metadata') else None
                 chunk_counters[file_hash] += 1
                 tiledb_id = str(random.randint(0, MAX_UINT64 - 1))
@@ -391,9 +387,19 @@ def create_database(self, texts, embeddings):
                 all_metadatas.append(doc.metadata if hasattr(doc, 'metadata') else {})
                 all_ids.append(tiledb_id)
                 hash_id_mappings.append((tiledb_id, file_hash))
-
+
             except Exception as e:
-                print(f"Error processing chunk {idx}: {e}")
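+                # Build a short single-line preview of the offending chunk for the log.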
+                preview = None
+                try:
+                    preview = (clean_content if isinstance(clean_content, str) else raw_content)
+                    if isinstance(preview, str):
+                        preview = preview[:120].replace('\n', ' ')
+                    else:
+                        preview = repr(preview)[:120]
+                except Exception:
+                    preview = "<unavailable>"
+
+                print(f"Error processing chunk {original_idx}: {e} | preview: {preview}")
                 skipped_chunks += 1
                 continue
 
@@ -410,6 +416,10 @@ def create_database(self, texts, embeddings):
 
         embedding_start_time = time.time()
 
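+        # Fail fast with the exact index if a non-string slipped through cleaning.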
+        for i, t in enumerate(all_texts):
+            if not isinstance(t, str):
+                raise TypeError(f"Non-string at index {i}: {type(t)}")
+
         vectors = embeddings.embed_documents(all_texts)
 
         embedding_end_time = time.time()
@@ -486,12 +496,12 @@ def create_metadata_db(self, documents, hash_id_mappings):
             for doc in documents
         ]
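+        # OR REPLACE tolerates re-ingesting an already-indexed file (assuming unique keys on these tables).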
         cursor.executemany('''
-            INSERT INTO document_metadata (file_name, hash, file_path, page_content)
+            INSERT OR REPLACE INTO document_metadata (file_name, hash, file_path, page_content)
             VALUES (?, ?, ?, ?)
         ''', doc_rows)
 
         cursor.executemany('''
-            INSERT INTO hash_chunk_ids (tiledb_id, hash)
+            INSERT OR REPLACE INTO hash_chunk_ids (tiledb_id, hash)
             VALUES (?, ?)
         ''', hash_id_mappings)
 
@@ -727,7 +737,7 @@ def search(self, query, k: Optional[int] = None, score_threshold: Optional[float
         if search_term:
             filtered_contexts = [
                 (doc, score) for doc, score in relevant_contexts
-                if search_term in str(doc.page_content).lower()
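+                # Note: this now *excludes* contexts containing the term.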
+                if search_term not in str(doc.page_content).lower()
             ]
         else:
             filtered_contexts = relevant_contexts