@@ -162,6 +162,7 @@ def process_single_image(self, raw_image):
         )
         return parsed.get('<MORE_DETAILED_CAPTION>', generated_text)
 
+
 class loader_glmv4(BaseLoader):
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
@@ -225,6 +226,83 @@ def process_single_image(self, raw_image):
         return ""
 
 
+# class loader_ovis(BaseLoader):
+#     def __init__(self, config):
+#         super().__init__(config)
+#         native = VISION_MODELS[self.config["vision"]["chosen_model"]]["precision"]
+#         # Choose dtype on GPU: bfloat16 if supported, else float16; always float32 on CPU
+#         if self.device == "cuda":
+#             if native in ("float32", "bfloat16") and has_bfloat16_support():
+#                 self.dtype = torch.bfloat16
+#             elif native == "float32":
+#                 self.dtype = torch.float16
+#             else:
+#                 self.dtype = torch.float16
+#         else:
+#             self.dtype = torch.float32
+
+#     def initialize_model_and_tokenizer(self):
+#         chosen_model = self.config["vision"]["chosen_model"]
+#         info = VISION_MODELS[chosen_model]
+
+#         cache_dir = CACHE_DIR / info["cache_dir"]
+#         cache_dir.mkdir(parents=True, exist_ok=True)
+
+#         model = AutoModelForCausalLM.from_pretrained(
+#             info["repo_id"],
+#             torch_dtype=self.dtype,
+#             trust_remote_code=True,
+#             multimodal_max_length=8192,
+#             cache_dir=cache_dir
+#         ).to(self.device)
+#         model.eval()
+
+#         text_tokenizer = model.get_text_tokenizer()
+#         visual_tokenizer = model.get_visual_tokenizer()
+
+#         for module in visual_tokenizer.modules():
+#             if isinstance(module, torch.nn.Linear):
+#                 module.to(device=self.device, dtype=self.dtype)
+
+#         return model, text_tokenizer, visual_tokenizer
+
+#     @torch.inference_mode()
+#     def process_single_image(self, raw_image):
+#         prompt = (
+#             "Explain everything you see in this picture "
+#             "but your response should be no more than one paragraph."
+#         )
+#         query = f"<image>\n{prompt}"
+
+#         _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+#         attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
+
+#         # Batchify and move to the correct device & dtype
+#         input_ids = input_ids.unsqueeze(0).to(self.device)  # [1, seq_len]
+#         attention_mask = attention_mask.unsqueeze(0).to(self.device)  # [1, seq_len]
+#         pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)  # [num_patches,3,14,14]
+#         pixel_values = [pixel_values]  # wrap in list for generate()
+
+#         gen_kwargs = {
+#             "max_new_tokens": 1024,
+#             "do_sample": False,
+#             "pad_token_id": self.tokenizer.pad_token_id,
+#             "eos_token_id": self.tokenizer.eos_token_id,
+#             "use_cache": True,
+#         }
+
+#         # **Pass input_ids positionally** so Ovis2’s generate() sees it as text_input_ids
+#         output_ids = self.model.generate(
+#             input_ids,
+#             pixel_values=pixel_values,
+#             attention_mask=attention_mask,
+#             **gen_kwargs
+#         )[0]
+
+#         description = self.tokenizer.decode(output_ids, skip_special_tokens=True)
+#         return " ".join(line.strip() for line in description.split("\n") if line.strip())
+
+
 class loader_ovis(BaseLoader):
     def __init__(self, config):
         super().__init__(config)
@@ -233,12 +311,18 @@ def __init__(self, config):
         if self.device == "cuda":
             if native in ("float32", "bfloat16") and has_bfloat16_support():
                 self.dtype = torch.bfloat16
+                print(f"OVIS: Selected bfloat16 precision based on native={native}")
             elif native == "float32":
                 self.dtype = torch.float16
+                print(f"OVIS: Selected float16 precision based on native={native}")
             else:
                 self.dtype = torch.float16
+                print(f"OVIS: Selected float16 precision based on native={native}")
         else:
             self.dtype = torch.float32
+            print(f"OVIS: Selected float32 precision for CPU based on native={native}")
+
+        print(f"OVIS: Device={self.device}, Initial dtype selection={self.dtype}")
 
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config["vision"]["chosen_model"]
@@ -247,22 +331,64 @@ def initialize_model_and_tokenizer(self):
         cache_dir = CACHE_DIR / info["cache_dir"]
         cache_dir.mkdir(parents=True, exist_ok=True)
 
+        print(f"OVIS: Loading model with dtype={self.dtype}")
+
         model = AutoModelForCausalLM.from_pretrained(
             info["repo_id"],
             torch_dtype=self.dtype,
             trust_remote_code=True,
             multimodal_max_length=8192,
             cache_dir=cache_dir
         ).to(self.device)
+
+        # Print model layers precision before eval
+        print("OVIS: Model layer precisions after loading:")
+        for name, module in model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  Layer {name}: {module.weight.dtype}")
+
         model.eval()
+
+        # Print model layers precision after eval
+        print("OVIS: Model layer precisions after eval():")
+        for name, module in model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  Layer {name}: {module.weight.dtype}")
 
         text_tokenizer = model.get_text_tokenizer()
         visual_tokenizer = model.get_visual_tokenizer()
 
+        # Print visual tokenizer layer info before conversion
+        print("OVIS: Visual tokenizer layer precisions before conversion:")
+        for name, module in visual_tokenizer.named_modules():
+            if isinstance(module, torch.nn.Linear):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  VT Layer {name}: {module.weight.dtype}")
+
+        # Count modules before conversion
+        linear_count = sum(1 for module in visual_tokenizer.modules()
+                           if isinstance(module, torch.nn.Linear))
+        print(f"OVIS: Found {linear_count} Linear modules in visual_tokenizer")
+
         for module in visual_tokenizer.modules():
             if isinstance(module, torch.nn.Linear):
+                old_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
                 module.to(device=self.device, dtype=self.dtype)
+                new_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
+                print(f"OVIS: Converting module from {old_dtype} to {self.dtype}, result={new_dtype}")
+
+        # Print visual tokenizer layer info after conversion
+        print("OVIS: Visual tokenizer layer precisions after conversion:")
+        for name, module in visual_tokenizer.named_modules():
+            if isinstance(module, torch.nn.Linear):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  VT Layer {name}: {module.weight.dtype}")
 
+        # Save model for process_single_image
+        self.model = model
+
         return model, text_tokenizer, visual_tokenizer
 
     @torch.inference_mode()
@@ -273,14 +399,29 @@ def process_single_image(self, raw_image):
         )
         query = f"<image>\n{prompt}"
 
+        print("OVIS: Starting image processing")
         _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+        print(f"OVIS: After preprocess_inputs - pixel_values dtype={pixel_values.dtype}")
+
         attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
 
         # Batchify and move to the correct device & dtype
-        input_ids = input_ids.unsqueeze(0).to(self.device)  # [1, seq_len]
-        attention_mask = attention_mask.unsqueeze(0).to(self.device)  # [1, seq_len]
-        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)  # [num_patches,3,14,14]
-        pixel_values = [pixel_values]  # wrap in list for generate()
+        input_ids = input_ids.unsqueeze(0).to(self.device)
+        attention_mask = attention_mask.unsqueeze(0).to(self.device)
+
+        print(f"OVIS: Before pixel_values conversion - dtype={pixel_values.dtype}")
+        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
+        print(f"OVIS: After pixel_values conversion - dtype={pixel_values.dtype}")
+
+        pixel_values = [pixel_values]  # wrap in list for generate()
+
+        # Check model precision during inference
+        print("OVIS: Model layer precisions during inference:")
+        for name, module in self.model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    if name.startswith("transformer") or name.startswith("lm_head"):
+                        print(f"  Inference layer {name}: {module.weight.dtype}")
 
         gen_kwargs = {
             "max_new_tokens": 1024,
@@ -290,7 +431,7 @@ def process_single_image(self, raw_image):
290431 "use_cache" : True ,
291432 }
292433
293- # **Pass input_ids positionally** so Ovis2’ s generate() sees it as text_input_ids
434+ # **Pass input_ids positionally** so Ovis2' s generate() sees it as text_input_ids
294435 output_ids = self .model .generate (
295436 input_ids ,
296437 pixel_values = pixel_values ,