
Commit d543c24

Add files via upload
1 parent c937e34 commit d543c24

13 files changed: +107, -185 lines changed

src/chat_kobold.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def connect_to_kobold(self, augmented_query):
 
         response = None
         try:
-            response = requests.post(self.api_url, json=payload, stream=True)
+            response = requests.post(self.api_url, json=payload, stream=True, timeout=20)
             response.raise_for_status()
             client = sseclient.SSEClient(response)
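For context, a rough standalone sketch of the streaming pattern this hunk hardens, assuming a KoboldCpp-style SSE endpoint and the sseclient-py package; the URL and payload below are placeholders, and only timeout=20 mirrors the commit. With stream=True, the timeout bounds the connection attempt and the gap between received chunks rather than the total generation time, so it does not cut off a long but healthy stream.

import requests
import sseclient  # sseclient-py

api_url = "http://localhost:5001/api/extras/generate/stream"  # placeholder endpoint
payload = {"prompt": "Hello", "max_length": 64}                # placeholder payload

# timeout=20 covers connect time and the wait between streamed chunks, not the whole response
response = requests.post(api_url, json=payload, stream=True, timeout=20)
response.raise_for_status()

client = sseclient.SSEClient(response)
for event in client.events():
    print(event.data)  # each SSE event carries one fragment of the generation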

src/database_interactions.py

Lines changed: 52 additions & 42 deletions
@@ -1,6 +1,10 @@
 import gc
 import os
+
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+
 import time
+import json
 from copy import deepcopy
 from pathlib import Path
 from typing import Optional
@@ -26,33 +30,6 @@
 from constants import VECTOR_MODELS
 
 
-def _flatten_to_text(x):
-    if x is None:
-        return ""
-    if isinstance(x, str):
-        return x
-    if isinstance(x, bytes):
-        try:
-            return x.decode("utf-8", "ignore")
-        except Exception:
-            return ""
-    if isinstance(x, (bool, int, float)):
-        return str(x)
-    if hasattr(x, '__iter__') and not isinstance(x, (str, bytes)):
-        parts = []
-        try:
-            for item in x:
-                s = _flatten_to_text(item)
-                if s:
-                    parts.append(s)
-            return " ".join(parts)
-        except Exception:
-            return str(x)
-    try:
-        return str(x)
-    except Exception:
-        return ""
-
 class BaseEmbeddingModel:
 
     def __init__(self, model_name, model_kwargs, encode_kwargs, is_query: bool = False):
@@ -70,7 +47,7 @@ def prepare_kwargs(self):
         tok_kw = hf_embed_kw.setdefault("tokenizer_kwargs", {})
         tok_kw.update({
             "trust_remote_code": True,
-            "use_fast": True,
+            "use_fast": False,
             "model_max_length": 512,
         })
 
@@ -95,10 +72,10 @@ def prepare_encode_kwargs(self):
         encode_kwargs.setdefault("convert_to_tensor", False)
         encode_kwargs.setdefault("show_progress_bar", not self.is_query)
 
-        params_to_remove = ['model_max_length', 'return_token_type_ids', 'show_progress_bar',
-                            'padding', 'truncation', 'max_length']
+        encode_kwargs.setdefault("padding", True)
+        encode_kwargs.setdefault("truncation", True)
 
-        for param in params_to_remove:
+        for param in ['model_max_length', 'return_token_type_ids', 'show_progress_bar']:
            encode_kwargs.pop(param, None)
 
        return encode_kwargs
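As a point of reference, this is roughly what the use_fast=False / model_max_length=512 tokenizer_kwargs select at the transformers level; the model id below is a placeholder, not something this commit names, and the exact plumbing through the embedding wrapper is not shown in the diff.

from transformers import AutoTokenizer

# use_fast=False picks the slow (pure-Python) tokenizer implementation where one exists;
# model_max_length caps the sequence length the tokenizer reports and enforces.
tok = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2",  # placeholder embedding model
    use_fast=False,
    model_max_length=512,
)
print(type(tok).__name__, tok.is_fast, tok.model_max_length)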
@@ -360,29 +337,48 @@ def create_database(self, texts, embeddings):
         chunk_counters = defaultdict(int)
         skipped_chunks = 0
 
-        for idx, doc in enumerate(texts):
+        for original_idx, doc in enumerate(texts):
+            raw_content = None
+            clean_content = None
+
             try:
                 if hasattr(doc, 'page_content'):
                     raw_content = doc.page_content
                 else:
                     raw_content = doc
-
+
                 if raw_content is None:
+                    print(f"Skipping chunk {original_idx}: no content")
+                    skipped_chunks += 1
+                    continue
+
+                if isinstance(raw_content, (list, tuple, dict)):
+                    print(f"Skipping chunk {original_idx}: invalid type {type(raw_content)}")
                     skipped_chunks += 1
                     continue
 
                 if not isinstance(raw_content, str):
-                    raw_content = str(raw_content)
+                    try:
+                        raw_content = str(raw_content)
+                    except Exception as e:
+                        print(f"Skipping chunk {original_idx}: cannot coerce {type(raw_content)} to str ({e})")
+                        skipped_chunks += 1
+                        continue
 
                 clean_content = raw_content.replace('\x00', ' ')
+                clean_content = ' '.join(clean_content.split())
 
-                words = clean_content.split()
-                clean_content = ' '.join(words)
+                if not clean_content or not clean_content.strip():
+                    skipped_chunks += 1
+                    continue
 
-                if not clean_content:
+                if isinstance(clean_content, (list, tuple, dict)):
+                    print(f"Skipping chunk {original_idx}: invalid type after clean {type(clean_content)}")
                    skipped_chunks += 1
                    continue
 
+                clean_content.encode('utf-8')
+
                 file_hash = doc.metadata.get('hash') if hasattr(doc, 'metadata') else None
                 chunk_counters[file_hash] += 1
                 tiledb_id = str(random.randint(0, MAX_UINT64 - 1))
@@ -391,9 +387,19 @@ def create_database(self, texts, embeddings):
                 all_metadatas.append(doc.metadata if hasattr(doc, 'metadata') else {})
                 all_ids.append(tiledb_id)
                 hash_id_mappings.append((tiledb_id, file_hash))
-
+
             except Exception as e:
-                print(f"Error processing chunk {idx}: {e}")
+                preview = None
+                try:
+                    preview = (clean_content if isinstance(clean_content, str) else raw_content)
+                    if isinstance(preview, str):
+                        preview = preview[:120].replace('\n', ' ')
+                    else:
+                        preview = repr(preview)[:120]
+                except Exception:
+                    preview = "<unavailable>"
+
+                print(f"Error processing chunk {original_idx}: {e} | preview: {preview}")
                 skipped_chunks += 1
                 continue
 
@@ -410,6 +416,10 @@ def create_database(self, texts, embeddings):
 
         embedding_start_time = time.time()
 
+        for i, t in enumerate(all_texts):
+            if not isinstance(t, str):
+                raise TypeError(f"Non-string at index {i}: {type(t)}")
+
         vectors = embeddings.embed_documents(all_texts)
 
         embedding_end_time = time.time()
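Taken together, the create_database hunks amount to a per-chunk sanitation pass followed by a hard type check right before embedding. A condensed standalone sketch of that pass (the sample inputs are made up; the real loop also tracks metadata, counters, and TileDB ids):

def sanitize_chunk(raw):
    # mirrors the checks in the loop above: reject missing or structured content, coerce to str,
    # strip NUL bytes, collapse whitespace, and verify the result encodes as UTF-8 text
    if raw is None or isinstance(raw, (list, tuple, dict)):
        return None
    if not isinstance(raw, str):
        raw = str(raw)
    clean = raw.replace('\x00', ' ')
    clean = ' '.join(clean.split())
    if not clean:
        return None
    clean.encode('utf-8')  # raises if the text cannot be encoded
    return clean

samples = ["  ok   text ", "", None, 123, ["a", "list"]]
all_texts = [c for c in (sanitize_chunk(s) for s in samples) if c is not None]

# the same guard the last hunk adds immediately before embed_documents()
for i, t in enumerate(all_texts):
    if not isinstance(t, str):
        raise TypeError(f"Non-string at index {i}: {type(t)}")
print(all_texts)  # ['ok text', '123']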
@@ -486,12 +496,12 @@ def create_metadata_db(self, documents, hash_id_mappings):
             for doc in documents
         ]
         cursor.executemany('''
-            INSERT INTO document_metadata (file_name, hash, file_path, page_content)
+            INSERT OR REPLACE INTO document_metadata (file_name, hash, file_path, page_content)
             VALUES (?, ?, ?, ?)
         ''', doc_rows)
 
         cursor.executemany('''
-            INSERT INTO hash_chunk_ids (tiledb_id, hash)
+            INSERT OR REPLACE INTO hash_chunk_ids (tiledb_id, hash)
             VALUES (?, ?)
         ''', hash_id_mappings)
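A minimal sqlite3 illustration of why the switch to INSERT OR REPLACE matters: the replace-on-conflict behaviour only applies when a uniqueness constraint exists, so the demo table below declares hash as UNIQUE purely for illustration (the real schema is not shown in this diff).

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE document_metadata (file_name TEXT, hash TEXT UNIQUE, file_path TEXT, page_content TEXT)")

rows = [("a.pdf", "h1", "/docs/a.pdf", "first version"),
        ("a.pdf", "h1", "/docs/a.pdf", "re-ingested version")]

# plain INSERT would raise IntegrityError on the duplicate hash; OR REPLACE swaps the row in
cur.executemany("INSERT OR REPLACE INTO document_metadata (file_name, hash, file_path, page_content) VALUES (?, ?, ?, ?)", rows)
conn.commit()

print(cur.execute("SELECT hash, page_content FROM document_metadata").fetchall())
# [('h1', 're-ingested version')]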

@@ -727,7 +737,7 @@ def search(self, query, k: Optional[int] = None, score_threshold: Optional[float
         if search_term:
             filtered_contexts = [
                 (doc, score) for doc, score in relevant_contexts
-                if search_term in str(doc.page_content).lower()
+                if search_term not in str(doc.page_content).lower()
             ]
         else:
             filtered_contexts = relevant_contexts

src/document_processor.py

Lines changed: 14 additions & 39 deletions
@@ -27,22 +27,14 @@
     BSHTMLLoader
 )
 
-from typing import Optional, Any, Iterator, Union
+from typing import Optional, Any, Iterator, Union, List
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import PyMuPDFParser
 import pymupdf
 
 from constants import DOCUMENT_LOADERS
 from extract_metadata import extract_document_metadata, add_pymupdf_page_metadata, compute_content_hash
 
-# logging.basicConfig(
-#     level=logging.ERROR,
-#     format='%(asctime)s - %(levelname)s - %(message)s',
-#     handlers=[
-#         logging.FileHandler('document_processor.log', mode='w')
-#     ]
-# )
-
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)

@@ -51,33 +43,25 @@
 INGEST_THREADS = max(2, os.cpu_count() - 12)
 
 
-from typing import List
-
 class FixedSizeTextSplitter:
-    """Splits text into equally-sized character chunks.
-
-    Parameters
-    ----------
-    chunk_size : int
-        Maximum characters per chunk. Taken from config.yaml.
-    """
 
-    def __init__(self, chunk_size: int):
+    def __init__(self, chunk_size: int, chunk_overlap: int = 0):
         self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
 
     def split_documents(self, docs: List[Document]) -> List[Document]:
         chunks: List[Document] = []
         for doc in docs:
             text = doc.page_content or ""
-            for start in range(0, len(text), self.chunk_size):
-                piece = text[start : start + self.chunk_size].strip()
-                if not piece:
-                    continue
-                # shallow-copy metadata so each chunk carries origin info
-                chunks.append(Document(page_content=piece, metadata=dict(doc.metadata)))
+            start = 0
+            while start < len(text):
+                end = start + self.chunk_size
+                piece = text[start:end].strip()
+                if piece:
+                    chunks.append(Document(page_content=piece, metadata=dict(doc.metadata)))
+                start += self.chunk_size - self.chunk_overlap
         return chunks
 
-
 class CustomPyMuPDFParser(PyMuPDFParser):
     def _lazy_parse(self, blob: Blob, text_kwargs: Optional[dict[str, Any]] = None) -> Iterator[Document]:
         with PyMuPDFParser._lock:
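A plain-string sketch of the stride arithmetic the rewritten splitter above uses (no langchain Document involved); note that the step chunk_size - chunk_overlap must stay positive, so this assumes chunk_overlap < chunk_size.

def chunk_text(text, chunk_size, chunk_overlap=0):
    # slide a window of chunk_size characters, advancing by chunk_size - chunk_overlap each time
    pieces = []
    start = 0
    step = chunk_size - chunk_overlap
    while start < len(text):
        piece = text[start:start + chunk_size].strip()
        if piece:
            pieces.append(piece)
        start += step
    return pieces

print(chunk_text("abcdefghij", chunk_size=4, chunk_overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']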
@@ -103,7 +87,6 @@ def __init__(self, file_path: Union[str, PurePath], **kwargs: Any) -> None:
             extract_images=kwargs.get('extract_images', False)
         )
 
-# map loaders
 for ext, loader_name in DOCUMENT_LOADERS.items():
     DOCUMENT_LOADERS[ext] = globals()[loader_name]
 
@@ -208,7 +191,6 @@ def load_documents(source_dir: Path) -> list:
     if doc_paths:
         n_workers = min(INGEST_THREADS, max(len(doc_paths), 1))
 
-        total_cores = os.cpu_count()
         threads_per_process = 2
 
         with ProcessPoolExecutor(n_workers) as executor:
@@ -228,24 +210,17 @@ def split_documents(documents=None, text_documents_pdf=None):
     try:
         print("\nSplitting documents into chunks.")
 
-        with open("config.yaml", "r", encoding='utf-8') as config_file:
+        config_path = Path(__file__).resolve().parent / "config.yaml"
+        with open(config_path, "r", encoding='utf-8') as config_file:
             config = yaml.safe_load(config_file)
         chunk_size = config["database"]["chunk_size"]
         chunk_overlap = config["database"]["chunk_overlap"]
 
-        # instantiate text splitter
-        text_splitter = FixedSizeTextSplitter(chunk_size=chunk_size)
-
-        # text_splitter = RecursiveCharacterTextSplitter(
-        #     chunk_size=chunk_size,
-        #     chunk_overlap=chunk_overlap,
-        #     keep_separator=False,
-        # )
+        text_splitter = FixedSizeTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
         texts = []
 
         if documents:
-            # use text splitter directly
             texts = text_splitter.split_documents(documents)
 
         if text_documents_pdf:
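The config_path change simply anchors the lookup to the module's own directory instead of the process working directory; a small sketch under the assumption that config.yaml sits next to the script:

from pathlib import Path
import yaml

# resolves relative to this file, so it works no matter where the process was launched from
config_path = Path(__file__).resolve().parent / "config.yaml"
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

chunk_size = config["database"]["chunk_size"]
chunk_overlap = config["database"]["chunk_overlap"]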
@@ -265,4 +240,4 @@ def split_documents(documents=None, text_documents_pdf=None):
     except Exception as e:
         logging.exception("Error during document splitting")
         logging.error(f"Error type: {type(e)}")
-        raise
+        raise

src/extract_metadata.py

Lines changed: 5 additions & 14 deletions
@@ -32,20 +32,15 @@ def extract_common_metadata(file_path, content_hash=None):
         "modification_date": modification_date,
         "hash": file_hash
     }
-
-    #=========================================================================
+
     clean_metadata = {}
     for k, v in metadata.items():
         if isinstance(v, (str, int, float, bool, type(None))):
             clean_metadata[k] = v
-        elif isinstance(v, enumerate):
-            print(f"❌ ENUMERATE in metadata key '{k}' - converting to string")
-            clean_metadata[k] = str(list(v))
         else:
             clean_metadata[k] = str(v)
-
+
     return clean_metadata
-    #=========================================================================
 
 def extract_image_metadata(file_path):
     metadata = extract_common_metadata(file_path)
@@ -81,21 +76,21 @@ def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str
                 end = len(clean_text)
             chunk = clean_text[start:end].strip()
 
-            page_num = None
+            page_num = 1
             for marker_pos, page in reversed(page_markers):
                 if marker_pos <= start:
                     page_num = page
                     break
 
-            if chunk and page_num is not None:
+            if chunk:
                 chunks.append((chunk, page_num))
+
             start += chunk_size - chunk_overlap
 
         return chunks
 
     chunks = split_text(doc.page_content, chunk_size, chunk_overlap)
 
-    #================================================================================================
     new_docs = []
     for chunk, page_num in chunks:
         new_metadata = {}
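A standalone sketch of the page lookup this hunk changes: page_markers is assumed to be a list of (character offset, page number) pairs, and a chunk whose start precedes every marker now falls back to page 1 instead of being dropped.

def page_for_offset(start, page_markers):
    # walk the markers from the end and take the last one at or before the chunk start
    page_num = 1
    for marker_pos, page in reversed(page_markers):
        if marker_pos <= start:
            page_num = page
            break
    return page_num

page_markers = [(0, 1), (800, 2), (1650, 3)]   # made-up offsets
print([page_for_offset(s, page_markers) for s in (0, 900, 2000)])   # [1, 2, 3]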
@@ -105,9 +100,6 @@ def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str
             key = str(k)
             if isinstance(v, (str, int, float, bool)):
                 new_metadata[key] = v
-            elif isinstance(v, enumerate):
-                print(f"❌ ENUMERATE in chunk metadata key '{key}' - converting to string")
-                new_metadata[key] = str(list(v))
             else:
                 new_metadata[key] = str(v)

@@ -118,6 +110,5 @@ def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str
             metadata=new_metadata
         )
         new_docs.append(new_doc)
-        #================================================================================================
 
     return new_docs
