Commit c69d839

v7.8.1
1 parent 69c87ea commit c69d839

File tree

3 files changed: +169 -30 lines changed

src/constants.py (+21 -23)
src/database_interactions.py (+2 -2)
src/module_process_images.py (+146 -5)

src/constants.py

Lines changed: 21 additions & 23 deletions
@@ -103,7 +103,7 @@
 "accelerate==1.7.0",
 "aiofiles==24.1.0",
 "aiohappyeyeballs==2.6.1",
-"aiohttp==3.11.18", # langchain libraries require <4
+"aiohttp==3.12.0", # langchain libraries require <4
 "aiosignal==1.3.2", # only required by aiohttp
 "anndata==0.11.4",
 "annotated-types==0.7.0",
@@ -120,8 +120,8 @@
 "cffi==1.17.1",
 "chardet==5.2.0",
 "charset-normalizer==3.4.2", # requests requires <4
-"chattts==0.2.3",
-"click==8.2.1",
+"chattts==0.2.4",
+"click==8.1.8", # gtts 2.5.4 requires <8.2, >=7.1
 "cloudpickle==3.1.1", # only required by tiledb-cloud and 3+ is only supported by tiledb-cloud 0.13+
 "colorama==0.4.6",
 "coloredlogs==15.0.1",
@@ -160,22 +160,22 @@
 "httpcore==1.0.9",
 "httpx==0.28.1",
 "httpx-sse==0.4.0",
-"huggingface-hub==0.31.4", # tokenizers 0.21.1 requires >=0.16.4,<1.0
+"huggingface-hub==0.32.0", # tokenizers 0.21.1 requires >=0.16.4,<1.0
 "humanfriendly==10.0",
 "HyperPyYAML==1.2.2",
 "idna==3.10",
 "img2pdf==0.6.1",
 "importlib_metadata==8.7.0",
 "Jinja2==3.1.6",
 "jiter==0.10.0", # required by openai newer versions
-"joblib==1.5.0",
+"joblib==1.5.1",
 "jsonpatch==1.33",
 "jsonpath-python==1.0.6",
 "jsonpointer==3.0.0",
 "kiwisolver==1.4.8",
 "langchain==0.3.25",
 "langchain-community==0.3.24",
-"langchain-core==0.3.60",
+"langchain-core==0.3.61",
 "langchain-huggingface==0.2.0",
 "langchain-text-splitters==0.3.8",
 "langdetect==1.0.9",
@@ -199,16 +199,14 @@
 "networkx==3.4.2",
 "nltk==3.9.1", # not higher; gives unexplained error
 "numba==0.61.2", # only required by openai-whisper and chattts
-# langchain 0.3.23 requires <3,>=1.26.4
-# langchain-community 0.3.21 requires >=1.26.2,<3
-# langchain-core 0.3.51 requires numpy >=1.24.0, <2.0.0 if python is less than 3.12 and <3,>=1.26.0 if 3.12+
-# numba 0.61.0 requires >=1.24,<2.2
-# scipy 1.15.2 requires >=1.23.5,<2.5
-# chattts 0.2.3 says it requires <2.0.0 but Claude cays 2+ is compatible
-"numpy==1.26.4",
+# langchain requires at least 1.26.2
+# numba requires less than 2.3
+# Scipy requires less than 2.5
+# "numpy==1.26.4",
+"numpy==2.2.2",
 "ocrmypdf==16.10.1",
 "olefile==0.47",
-"openai==1.79.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
+"openai==1.82.0", # only required by chat_lm_studio.py script and whispers2t (if using openai vanilla backend)
 "openai-whisper==20240930", # only required by whisper_s2t (if using openai vanilla backend)
 "openpyxl==3.1.5",
 "optimum==1.25.3",
@@ -230,8 +228,8 @@
 "pyarrow==20.0.0",
 "pybase16384==0.3.8", # only required by chattts
 "pycparser==2.22",
-"pydantic==2.11.4", # unstructured-client==0.35.0 requires pydantic>=2.11.2
-"pydantic_core==2.33.2", # pydantic 2.11.4 requires pydantic_core==2.33.2
+"pydantic==2.11.5", # unstructured-client==0.35.0 requires pydantic>=2.11.2
+"pydantic_core==2.33.2", # pydantic 2.11.5 requires pydantic_core==2.33.2
 "pydantic-settings==2.9.1", # langchain-community==0.3.23 requires pydantic-settings>=2.4.0,<3.0.0
 "Pygments==2.19.1",
 "PyOpenGL==3.1.9",
@@ -253,7 +251,7 @@
 "requests==2.32.3",
 "requests-toolbelt==1.0.0",
 "rich==14.0.0",
-"ruamel.yaml==0.18.10",
+"ruamel.yaml==0.18.11",
 "ruamel.yaml.clib==0.2.12",
 "safetensors==0.5.3",
 "scikit-learn==1.6.1",
@@ -283,8 +281,8 @@
 "timm==1.0.15",
 "tokenizers==0.21.1",
 "tqdm==4.67.1",
-"transformers==4.52.1",
-"typing-inspect==0.9.0",
+"transformers==4.52.3",
+"typing-inspection==0.4.1", # required by pydantic and pydantic-settings
 "typing_extensions==4.13.2",
 "unstructured-client==0.35.0",
 "tzdata==2025.2",
@@ -304,7 +302,7 @@
 
 full_install_libs = [
 "PySide6==6.9.0",
-"pymupdf==1.25.5",
+"pymupdf==1.26.0",
 "unstructured==0.17.2"
 ]
 
@@ -572,7 +570,7 @@
 ],
 'infly': [
 {
-'name': 'infly-retriever-v1-1.5b',
+'name': 'inf-retriever-v1-1.5b',
 'dimensions': 1536,
 'max_sequence': 8192,
 'size_mb': 3090,
@@ -583,12 +581,12 @@
 'precision': 'bfloat16'
 },
 {
-'name': 'infly-retriever-v1-7b',
+'name': 'inf-retriever-v1-7b',
 'dimensions': 3584,
 'max_sequence': 8192,
 'size_mb': 14130,
 'repo_id': 'infly/inf-retriever-v1',
-'cache_dir': 'infly--inf-retriever-v1',
+'cache_dir': 'infly--inf-retriever-v1-7b',
 'type': 'vector',
 'parameters': '7070m',
 'precision': 'bfloat16'

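Note on the pin changes above: the numpy bump to 2.2.2 relies on the constraint notes kept as inline comments (langchain needs at least 1.26.2, numba less than 2.3, scipy less than 2.5). Below is a minimal sketch of how such pins could be sanity-checked with the packaging library; the constraint strings are copied from those comments, and the check_pin helper is hypothetical, not part of this commit.

# Hypothetical helper, not part of this commit: verifies that a pinned
# version satisfies the constraint ranges noted in the inline comments above.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

CONSTRAINTS = {
    # package being pinned: specifiers collected from the inline comments
    "numpy": [">=1.26.2", "<2.3", "<2.5"],      # langchain / numba / scipy notes
    "click": ["<8.2", ">=7.1"],                 # gtts 2.5.4 note
    "huggingface-hub": [">=0.16.4", "<1.0"],    # tokenizers 0.21.1 note
}

def check_pin(name: str, pinned: str) -> bool:
    """Return True if the pinned version satisfies every recorded specifier."""
    spec = SpecifierSet(",".join(CONSTRAINTS.get(name, [])))
    return Version(pinned) in spec

if __name__ == "__main__":
    for pkg, ver in [("numpy", "2.2.2"), ("click", "8.1.8"), ("huggingface-hub", "0.32.0")]:
        print(pkg, ver, "OK" if check_pin(pkg, ver) else "CONFLICT")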
src/database_interactions.py

Lines changed: 2 additions & 2 deletions
@@ -417,7 +417,7 @@ def create_database(self, texts, embeddings):
 
 # IMMEDIATE CLEANUP - free ~50-75% of memory
 del all_texts, vectors
-gc.collect()
+# gc.collect()
 
 TileDB.from_embeddings(
 text_embeddings=text_embed_pairs,
@@ -430,7 +430,7 @@ def create_database(self, texts, embeddings):
 allow_dangerous_deserialization=True,
 )
 
-my_cprint(f"Processed {len(all_texts)} chunks", "yellow")
+my_cprint(f"Processed all chunks", "yellow")
 
 end_time = time.time()
 elapsed_time = end_time - start_time

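The create_database change above disables the gc.collect() call and drops the len(all_texts) reference, since all_texts has already been deleted by the time the summary line runs. A standalone sketch of an alternative that keeps the exact chunk count is shown here; it is hypothetical and not what the commit does.

# Hypothetical standalone sketch, not the project's code: capture the chunk
# count before deleting the large objects so the summary stays specific.
import gc

def summarize_and_free(all_texts: list, vectors: list) -> str:
    chunk_count = len(all_texts)   # read the size while the list still exists
    del all_texts, vectors         # drop the local references (refcounting frees them)
    gc.collect()                   # optional; mainly useful if reference cycles exist
    return f"Processed {chunk_count} chunks"

print(summarize_and_free(["chunk one", "chunk two"], [[0.1], [0.2]]))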
src/module_process_images.py

Lines changed: 146 additions & 5 deletions
@@ -162,6 +162,7 @@ def process_single_image(self, raw_image):
 )
 return parsed.get('<MORE_DETAILED_CAPTION>', generated_text)
 
+
 class loader_glmv4(BaseLoader):
 def initialize_model_and_tokenizer(self):
 chosen_model = self.config['vision']['chosen_model']
@@ -225,6 +226,83 @@ def process_single_image(self, raw_image):
 return ""
 
 
+# class loader_ovis(BaseLoader):
+# def __init__(self, config):
+# super().__init__(config)
+# native = VISION_MODELS[self.config["vision"]["chosen_model"]]["precision"]
+# # Choose dtype on GPU: bfloat16 if supported, else float16; always float32 on CPU
+# if self.device == "cuda":
+# if native in ("float32", "bfloat16") and has_bfloat16_support():
+# self.dtype = torch.bfloat16
+# elif native == "float32":
+# self.dtype = torch.float16
+# else:
+# self.dtype = torch.float16
+# else:
+# self.dtype = torch.float32
+
+# def initialize_model_and_tokenizer(self):
+# chosen_model = self.config["vision"]["chosen_model"]
+# info = VISION_MODELS[chosen_model]
+
+# cache_dir = CACHE_DIR / info["cache_dir"]
+# cache_dir.mkdir(parents=True, exist_ok=True)
+
+# model = AutoModelForCausalLM.from_pretrained(
+# info["repo_id"],
+# torch_dtype=self.dtype,
+# trust_remote_code=True,
+# multimodal_max_length=8192,
+# cache_dir=cache_dir
+# ).to(self.device)
+# model.eval()
+
+# text_tokenizer = model.get_text_tokenizer()
+# visual_tokenizer = model.get_visual_tokenizer()
+
+# for module in visual_tokenizer.modules():
+# if isinstance(module, torch.nn.Linear):
+# module.to(device=self.device, dtype=self.dtype)
+
+# return model, text_tokenizer, visual_tokenizer
+
+# @torch.inference_mode()
+# def process_single_image(self, raw_image):
+# prompt = (
+# "Explain everything you see in this picture "
+# "but your response should be no more than one paragraph."
+# )
+# query = f"<image>\n{prompt}"
+
+# _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+# attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
+
+# # Batchify and move to the correct device & dtype
+# input_ids = input_ids.unsqueeze(0).to(self.device) # [1, seq_len]
+# attention_mask = attention_mask.unsqueeze(0).to(self.device) # [1, seq_len]
+# pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) # [num_patches,3,14,14]
+# pixel_values = [pixel_values] # wrap in list for generate()
+
+# gen_kwargs = {
+# "max_new_tokens": 1024,
+# "do_sample": False,
+# "pad_token_id": self.tokenizer.pad_token_id,
+# "eos_token_id": self.tokenizer.eos_token_id,
+# "use_cache": True,
+# }
+
+# # **Pass input_ids positionally** so Ovis2's generate() sees it as text_input_ids
+# output_ids = self.model.generate(
+# input_ids,
+# pixel_values=pixel_values,
+# attention_mask=attention_mask,
+# **gen_kwargs
+# )[0]
+
+# description = self.tokenizer.decode(output_ids, skip_special_tokens=True)
+# return " ".join(line.strip() for line in description.split("\n") if line.strip())
+
+
 class loader_ovis(BaseLoader):
 def __init__(self, config):
 super().__init__(config)
@@ -233,12 +311,18 @@ def __init__(self, config):
 if self.device == "cuda":
 if native in ("float32", "bfloat16") and has_bfloat16_support():
 self.dtype = torch.bfloat16
+print(f"OVIS: Selected bfloat16 precision based on native={native}")
 elif native == "float32":
 self.dtype = torch.float16
+print(f"OVIS: Selected float16 precision based on native={native}")
 else:
 self.dtype = torch.float16
+print(f"OVIS: Selected float16 precision based on native={native}")
 else:
 self.dtype = torch.float32
+print(f"OVIS: Selected float32 precision for CPU based on native={native}")
+
+print(f"OVIS: Device={self.device}, Initial dtype selection={self.dtype}")
 
 def initialize_model_and_tokenizer(self):
 chosen_model = self.config["vision"]["chosen_model"]
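The precision-selection rule in the hunk above (bfloat16 on GPUs that support it, float16 otherwise, float32 on CPU) can be read in isolation in the sketch below. The has_bfloat16_support() stand-in uses torch.cuda.is_bf16_supported() and is an assumption; the project's real helper is defined elsewhere and may differ.

# Minimal sketch of the dtype-selection rule used by loader_ovis above.
# has_bfloat16_support() here is a stand-in (assumption), not the project's helper.
import torch

def has_bfloat16_support() -> bool:
    return torch.cuda.is_available() and torch.cuda.is_bf16_supported()

def select_dtype(device: str, native: str) -> torch.dtype:
    if device == "cuda":
        if native in ("float32", "bfloat16") and has_bfloat16_support():
            return torch.bfloat16  # preferred on GPUs with native bf16 support
        return torch.float16       # fallback for older GPUs
    return torch.float32           # CPU path keeps full precision

print(select_dtype("cuda" if torch.cuda.is_available() else "cpu", "bfloat16"))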
@@ -247,22 +331,64 @@ def initialize_model_and_tokenizer(self):
 cache_dir = CACHE_DIR / info["cache_dir"]
 cache_dir.mkdir(parents=True, exist_ok=True)
 
+print(f"OVIS: Loading model with dtype={self.dtype}")
+
 model = AutoModelForCausalLM.from_pretrained(
 info["repo_id"],
 torch_dtype=self.dtype,
 trust_remote_code=True,
 multimodal_max_length=8192,
 cache_dir=cache_dir
 ).to(self.device)
+
+# Print model layers precision before eval
+print("OVIS: Model layer precisions after loading:")
+for name, module in model.named_modules():
+if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+if hasattr(module, "weight") and module.weight is not None:
+print(f" Layer {name}: {module.weight.dtype}")
+
 model.eval()
+
+# Print model layers precision after eval
+print("OVIS: Model layer precisions after eval():")
+for name, module in model.named_modules():
+if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+if hasattr(module, "weight") and module.weight is not None:
+print(f" Layer {name}: {module.weight.dtype}")
 
 text_tokenizer = model.get_text_tokenizer()
 visual_tokenizer = model.get_visual_tokenizer()
 
+# Print visual tokenizer layer info before conversion
+print("OVIS: Visual tokenizer layer precisions before conversion:")
+for name, module in visual_tokenizer.named_modules():
+if isinstance(module, torch.nn.Linear):
+if hasattr(module, "weight") and module.weight is not None:
+print(f" VT Layer {name}: {module.weight.dtype}")
+
+# Count modules before conversion
+linear_count = sum(1 for module in visual_tokenizer.modules()
+if isinstance(module, torch.nn.Linear))
+print(f"OVIS: Found {linear_count} Linear modules in visual_tokenizer")
+
 for module in visual_tokenizer.modules():
 if isinstance(module, torch.nn.Linear):
+old_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
 module.to(device=self.device, dtype=self.dtype)
+new_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
+print(f"OVIS: Converting module from {old_dtype} to {self.dtype}, result={new_dtype}")
+
+# Print visual tokenizer layer info after conversion
+print("OVIS: Visual tokenizer layer precisions after conversion:")
+for name, module in visual_tokenizer.named_modules():
+if isinstance(module, torch.nn.Linear):
+if hasattr(module, "weight") and module.weight is not None:
+print(f" VT Layer {name}: {module.weight.dtype}")
 
+# Save model for process_single_image
+self.model = model
+
 return model, text_tokenizer, visual_tokenizer
 
 @torch.inference_mode()
@@ -273,14 +399,29 @@ def process_single_image(self, raw_image):
 )
 query = f"<image>\n{prompt}"
 
+print("OVIS: Starting image processing")
 _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+print(f"OVIS: After preprocess_inputs - pixel_values dtype={pixel_values.dtype}")
+
 attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
 
 # Batchify and move to the correct device & dtype
-input_ids = input_ids.unsqueeze(0).to(self.device) # [1, seq_len]
-attention_mask = attention_mask.unsqueeze(0).to(self.device) # [1, seq_len]
-pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) # [num_patches,3,14,14]
-pixel_values = [pixel_values] # wrap in list for generate()
+input_ids = input_ids.unsqueeze(0).to(self.device)
+attention_mask = attention_mask.unsqueeze(0).to(self.device)
+
+print(f"OVIS: Before pixel_values conversion - dtype={pixel_values.dtype}")
+pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
+print(f"OVIS: After pixel_values conversion - dtype={pixel_values.dtype}")
+
+pixel_values = [pixel_values] # wrap in list for generate()
+
+# Check model precision during inference
+print("OVIS: Model layer precisions during inference:")
+for name, module in self.model.named_modules():
+if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
+if hasattr(module, "weight") and module.weight is not None:
+if name.startswith("transformer") or name.startswith("lm_head"):
+print(f" Inference layer {name}: {module.weight.dtype}")
 
 gen_kwargs = {
 "max_new_tokens": 1024,
@@ -290,7 +431,7 @@ def process_single_image(self, raw_image):
 "use_cache": True,
 }
 
-# **Pass input_ids positionally** so Ovis2’s generate() sees it as text_input_ids
+# **Pass input_ids positionally** so Ovis2's generate() sees it as text_input_ids
 output_ids = self.model.generate(
 input_ids,
 pixel_values=pixel_values,

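The diagnostics added in this file print one line per Linear/Conv2d/LayerNorm weight. A more compact, hypothetical variant that tallies weight dtypes per layer class with collections.Counter is sketched below; it is illustration only and not part of the commit.

# Hypothetical compact variant of the per-layer dtype prints added above:
# tally weight dtypes per module class instead of printing every layer.
from collections import Counter
import torch

def dtype_summary(model: torch.nn.Module) -> Counter:
    counts = Counter()
    for module in model.modules():
        if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
            if getattr(module, "weight", None) is not None:
                counts[(type(module).__name__, str(module.weight.dtype))] += 1
    return counts

# Toy example; on the Ovis model this would replace the per-layer print loops.
toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.LayerNorm(4)).to(torch.float16)
for (cls, dtype), n in dtype_summary(toy).items():
    print(f"{cls}: {dtype} x{n}")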