Skip to content

Commit 0374785

Browse files
authored
Add files via upload
1 parent ba3e2a8 commit 0374785

File tree

1 file changed

+48
-7
lines changed

1 file changed

+48
-7
lines changed

src/database_interactions.py

Lines changed: 48 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -383,16 +383,42 @@ def create_database(self, texts, embeddings):
383383
chunk_counters[file_hash] += 1
384384
tiledb_id = str(random.randint(0, MAX_UINT64 - 1))
385385

386-
text_str = str(doc.page_content or "").strip()
387-
if not text_str: # silently drop zero-length chunks
386+
# CRITICAL FIX: Ensure page_content is a string and handle edge cases
387+
if hasattr(doc, 'page_content'):
388+
if doc.page_content is None:
389+
text_str = ""
390+
elif isinstance(doc.page_content, str):
391+
text_str = doc.page_content.strip()
392+
elif isinstance(doc.page_content, (list, tuple)):
393+
# Handle list/tuple by joining with newlines
394+
text_str = "\n".join(str(item) for item in doc.page_content).strip()
395+
elif isinstance(doc.page_content, bytes):
396+
# Handle bytes by decoding
397+
try:
398+
text_str = doc.page_content.decode('utf-8', errors='ignore').strip()
399+
except:
400+
text_str = str(doc.page_content).strip()
401+
else:
402+
# Fallback for any other type
403+
text_str = str(doc.page_content).strip()
404+
else:
405+
# If no page_content attribute, convert the whole doc to string
406+
text_str = str(doc).strip()
407+
408+
if not text_str: # silently drop zero-length chunks
409+
continue
410+
411+
# Final validation - ensure it's really a string
412+
if not isinstance(text_str, str):
413+
logging.error(f"Failed to convert to string: {type(text_str)} - {text_str[:100]}")
388414
continue
389-
all_texts.append(text_str)
390415

416+
all_texts.append(text_str)
391417
all_metadatas.append(doc.metadata)
392418
all_ids.append(tiledb_id)
393419
hash_id_mappings.append((tiledb_id, file_hash))
394420

395-
# ── NEW GUARD: ensure every chunk is a string ──────────────────────
421+
# Debug check - log if we find any non-strings (this should never happen now)
396422
bad_chunks = [
397423
(idx, type(txt), str(txt)[:60])
398424
for idx, txt in enumerate(all_texts)
@@ -403,11 +429,26 @@ def create_database(self, texts, embeddings):
403429
for idx, typ, preview in bad_chunks[:10]:
404430
print(f" #{idx}: {typ}{preview!r}")
405431
raise ValueError(f"Found {len(bad_chunks)} non-string chunks — fix loaders")
406-
# ───────────────────────────────────────────────────────────────────
407432

408433
with open(self.ROOT_DIRECTORY / "config.yaml", 'r', encoding='utf-8') as config_file:
409434
config_data = yaml.safe_load(config_file)
410435

436+
# Additional safety: validate all_texts one more time and ensure proper format
437+
validated_texts = []
438+
for i, text in enumerate(all_texts):
439+
if isinstance(text, str):
440+
# Remove any null characters or other problematic characters
441+
cleaned_text = text.replace('\x00', '').strip()
442+
if cleaned_text: # Only add non-empty strings
443+
validated_texts.append(cleaned_text)
444+
else:
445+
logging.warning(f"Skipping empty text at index {i}")
446+
else:
447+
logging.error(f"Non-string found at index {i}: {type(text)}")
448+
449+
# Replace all_texts with validated version
450+
all_texts = validated_texts
451+
411452
# precompute vectors
412453
vectors = embeddings.embed_documents(all_texts)
413454
text_embed_pairs = [
@@ -422,8 +463,8 @@ def create_database(self, texts, embeddings):
422463
TileDB.from_embeddings(
423464
text_embeddings=text_embed_pairs,
424465
embedding=embeddings,
425-
metadatas=all_metadatas,
426-
ids=all_ids,
466+
metadatas=all_metadatas[:len(all_texts)], # Ensure metadata matches text count
467+
ids=all_ids[:len(all_texts)], # Ensure IDs match text count
427468
metric="euclidean",
428469
index_uri=str(self.PERSIST_DIRECTORY),
429470
index_type="FLAT",

0 commit comments

Comments
 (0)