@@ -383,16 +383,42 @@ def create_database(self, texts, embeddings):
383383 chunk_counters [file_hash ] += 1
384384 tiledb_id = str (random .randint (0 , MAX_UINT64 - 1 ))
385385
386- text_str = str (doc .page_content or "" ).strip ()
387- if not text_str : # silently drop zero-length chunks
386+ # CRITICAL FIX: Ensure page_content is a string and handle edge cases
387+ if hasattr (doc , 'page_content' ):
388+ if doc .page_content is None :
389+ text_str = ""
390+ elif isinstance (doc .page_content , str ):
391+ text_str = doc .page_content .strip ()
392+ elif isinstance (doc .page_content , (list , tuple )):
393+ # Handle list/tuple by joining with newlines
394+ text_str = "\n " .join (str (item ) for item in doc .page_content ).strip ()
395+ elif isinstance (doc .page_content , bytes ):
396+ # Handle bytes by decoding
397+ try :
398+ text_str = doc .page_content .decode ('utf-8' , errors = 'ignore' ).strip ()
399+ except :
400+ text_str = str (doc .page_content ).strip ()
401+ else :
402+ # Fallback for any other type
403+ text_str = str (doc .page_content ).strip ()
404+ else :
405+ # If no page_content attribute, convert the whole doc to string
406+ text_str = str (doc ).strip ()
407+
408+ if not text_str : # silently drop zero-length chunks
409+ continue
410+
411+ # Final validation - ensure it's really a string
412+ if not isinstance (text_str , str ):
413+ logging .error (f"Failed to convert to string: { type (text_str )} - { text_str [:100 ]} " )
388414 continue
389- all_texts .append (text_str )
390415
416+ all_texts .append (text_str )
391417 all_metadatas .append (doc .metadata )
392418 all_ids .append (tiledb_id )
393419 hash_id_mappings .append ((tiledb_id , file_hash ))
394420
395- # ── NEW GUARD: ensure every chunk is a string ──────────────────────
421+ # Debug check - log if we find any non-strings (this should never happen now)
396422 bad_chunks = [
397423 (idx , type (txt ), str (txt )[:60 ])
398424 for idx , txt in enumerate (all_texts )
@@ -403,11 +429,26 @@ def create_database(self, texts, embeddings):
403429 for idx , typ , preview in bad_chunks [:10 ]:
404430 print (f" #{ idx } : { typ } → { preview !r} " )
405431 raise ValueError (f"Found { len (bad_chunks )} non-string chunks — fix loaders" )
406- # ───────────────────────────────────────────────────────────────────
407432
408433 with open (self .ROOT_DIRECTORY / "config.yaml" , 'r' , encoding = 'utf-8' ) as config_file :
409434 config_data = yaml .safe_load (config_file )
410435
436+ # Additional safety: validate all_texts one more time and ensure proper format
437+ validated_texts = []
438+ for i , text in enumerate (all_texts ):
439+ if isinstance (text , str ):
440+ # Remove any null characters or other problematic characters
441+ cleaned_text = text .replace ('\x00 ' , '' ).strip ()
442+ if cleaned_text : # Only add non-empty strings
443+ validated_texts .append (cleaned_text )
444+ else :
445+ logging .warning (f"Skipping empty text at index { i } " )
446+ else :
447+ logging .error (f"Non-string found at index { i } : { type (text )} " )
448+
449+ # Replace all_texts with validated version
450+ all_texts = validated_texts
451+
411452 # precompute vectors
412453 vectors = embeddings .embed_documents (all_texts )
413454 text_embed_pairs = [
@@ -422,8 +463,8 @@ def create_database(self, texts, embeddings):
422463 TileDB .from_embeddings (
423464 text_embeddings = text_embed_pairs ,
424465 embedding = embeddings ,
425- metadatas = all_metadatas ,
426- ids = all_ids ,
466+ metadatas = all_metadatas [: len ( all_texts )], # Ensure metadata matches text count
467+ ids = all_ids [: len ( all_texts )], # Ensure IDs match text count
427468 metric = "euclidean" ,
428469 index_uri = str (self .PERSIST_DIRECTORY ),
429470 index_type = "FLAT" ,
0 commit comments