 from tqdm import tqdm
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
-    LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
+    LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
 )
 
 from langchain_community.docstore.document import Document
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 logging.getLogger().setLevel(logging.WARNING)
 
+# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
+# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
+# logging.getLogger("transformers").setLevel(logging.ERROR)
+# logging.getLogger("transformers").setLevel(logging.WARNING)
+# logging.getLogger("transformers").setLevel(logging.INFO)
+# logging.getLogger("transformers").setLevel(logging.DEBUG)
+
 ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
 
+current_directory = Path(__file__).parent
+CACHE_DIR = current_directory / "models" / "vision"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
 current_directory = Path(__file__).parent
 VISION_DIR = current_directory / "models" / "vision"
 VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -65,10 +76,20 @@ def choose_image_loader():
 
     chosen_model = config["vision"]["chosen_model"]
 
-    if chosen_model == 'Moondream2':
+    if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b']:
+        loader_func = loader_llava(config).process_images
+    elif chosen_model == 'Cogvlm':
+        loader_func = loader_cogvlm(config).process_images
+    elif chosen_model == 'Moondream2':
         loader_func = loader_moondream(config).process_images
     elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
         loader_func = loader_florence2(config).process_images
+    elif chosen_model == 'Phi-3-vision-128k-instruct':
+        loader_func = loader_phi3vision(config).process_images
+    elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
+        loader_func = loader_minicpm_llama3v(config).process_images
+    elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
+        loader_func = loader_llava_next(config).process_images
     else:
         my_cprint("No valid image model specified in config.yaml", "red")
         return []
@@ -143,6 +164,125 @@ def process_images(self):
     def process_single_image(self, raw_image):
         raise NotImplementedError("Subclasses must implement process_single_image method")
 
+class loader_cogvlm(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        model_name = 'THUDM/cogvlm-chat-hf'
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)
+
+        tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=quantization_config,
+            torch_dtype=TORCH_TYPE,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Cogvlm vision model loaded into memory...", "green")
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+        inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
+        inputs = {
+            'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
+            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
+            'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
+            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
+        }
+
+        gen_kwargs = {"max_length": 2048, "do_sample": False}
+        output = self.model.generate(**inputs, **gen_kwargs)
+        output = output[:, inputs['input_ids'].shape[1]:]
+        model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
+        return model_response
+
+class loader_llava(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+
+        model = LlavaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
+        inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
+        inputs = inputs.to(torch.float32)
+
+        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
+        full_response = self.processor.decode(output[0][2:], skip_special_tokens=True, do_sample=False)
+        model_response = full_response.split("ASSISTANT: ")[-1]
+        return model_response
+
+
+class loader_llava_next(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        model = LlavaNextForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+        prompt = f"USER: <image>\n{user_prompt} ASSISTANT:"
+        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device)
+
+        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
+
+        response = self.processor.decode(output[0], skip_special_tokens=True)  # may need output[0][2:] instead if the output is unexpectedly preceded by special tokens
+        model_response = response.split("ASSISTANT:")[-1].strip()
+
+        return model_response
+
 
 class loader_moondream(BaseLoader):
     def initialize_model_and_tokenizer(self):
@@ -159,7 +299,7 @@ def initialize_model_and_tokenizer(self):
 
         my_cprint(f"Moondream2 vision model loaded into memory...", "green")
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-05-20", cache_dir=cache_dir)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-07-23", cache_dir=cache_dir)
 
         return model, tokenizer, None
 
@@ -180,11 +320,14 @@ def __init__(self, config):
 
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
-        model_id = VISION_MODELS[chosen_model]['repo_id']
-        cache_dir = VISION_DIR
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
 
-        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
 
         device_type, precision_type = self.get_device_and_precision()
 
@@ -228,3 +371,173 @@ def process_single_image(self, raw_image):
         parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height))
 
         return parsed_answer['<MORE_DETAILED_CAPTION>']
+
+
+class loader_phi3vision(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # microsoft/Phi-3-vision-128k-instruct
+        model = AutoModelForCausalLM.from_pretrained(
+            repo_id,
+            device_map="cuda",
+            trust_remote_code=True,
+            torch_dtype="auto",
+            attn_implementation='flash_attention_2',
+            quantization_config=quantization_config,
+            cache_dir=cache_dir
+        )
+
+        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+
+        my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green")
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = f"""<|user|>
+<|image_1|>
+Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|>
+<|assistant|>
+"""
+        inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device)
+
+        generation_args = {
+            "max_new_tokens": 500,
+            "temperature": None,
+            "do_sample": False,
+        }
+
+        generate_ids = self.model.generate(
+            **inputs,
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            **generation_args
+        )
+
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = self.processor.batch_decode(
+            generate_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+
+        return response
+
+
+class loader_minicpm_llama3v(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        warnings.filterwarnings("ignore", category=UserWarning)
+
+        # openbmb/MiniCPM-Llama3-V-2_5-int4
+        model = AutoModel.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            cache_dir=cache_dir
+        )
+        model.eval()
+
+        my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
+        msgs = [{'role': 'user', 'content': question}]
+
+        response = self.model.chat(
+            image=raw_image,
+            msgs=msgs,
+            context=None,
+            tokenizer=self.tokenizer,
+            sampling=False,
+            temperature=None
+        )
+
+        if isinstance(response, tuple) and len(response) == 3:
+            res, context, _ = response
+        else:
+            res = response
+
+        return res
+
+'''
+class loader_bunny(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        transformers.logging.set_verbosity_error()
+        transformers.logging.disable_progress_bar()
+        warnings.filterwarnings('ignore')
+
+        # BAAI/Bunny-v1_1-4B
+        # BAAI/Bunny-v1_1-Llama-3-8B-V
+
+        chosen_model = self.config['vision']['chosen_model']
+        model_path = VISION_MODELS[chosen_model]["model_path"]
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            device_map='auto',
+            trust_remote_code=True,
+            quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Bunny vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "Describe what this image depicts in as much detail as possible."
+        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n {prompt} ASSISTANT:"
+
+        text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
+
+        image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
+
+        output_ids = self.model.generate(
+            input_ids,
+            images=image_tensor,
+            max_length=4096,
+            use_cache=True,
+            repetition_penalty=1.0
+        )[0].to(self.device)
+
+        result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        return result
+'''
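
Note on the pattern: every backend added in this commit has the same shape -- a loader_* subclass of BaseLoader that implements initialize_model_and_tokenizer() (returning a model/tokenizer/processor triple) and process_single_image(), plus one branch in choose_image_loader() mapping the configured model name to that class. The sketch below is a minimal, self-contained illustration of that shape under stated assumptions, not code from this repository; the simplified BaseLoader body and the DummyLoader and describe_images names are hypothetical and used only for demonstration.

# Minimal, hypothetical sketch of the loader pattern the commit extends.
# BaseLoader here is a simplified stand-in: the real class also handles device
# selection, progress bars, and Document creation.

class BaseLoader:
    def __init__(self, config):
        self.config = config
        self.model, self.tokenizer, self.processor = self.initialize_model_and_tokenizer()

    def initialize_model_and_tokenizer(self):
        raise NotImplementedError("Subclasses must implement initialize_model_and_tokenizer")

    def process_single_image(self, raw_image):
        raise NotImplementedError("Subclasses must implement process_single_image")

    def process_images(self, images):
        # The real method loads image files from disk and wraps each caption in a Document.
        return [self.process_single_image(image) for image in images]


class DummyLoader(BaseLoader):
    # Stands in for loader_llava, loader_cogvlm, loader_phi3vision, etc.
    def initialize_model_and_tokenizer(self):
        return None, None, None  # a real loader returns (model, tokenizer, processor)

    def process_single_image(self, raw_image):
        return f"caption for {raw_image}"


def describe_images(config, images):
    # Mirrors choose_image_loader(): map the configured model name to a loader class.
    loaders = {"Dummy": DummyLoader}
    loader_cls = loaders.get(config["vision"]["chosen_model"])
    if loader_cls is None:
        return []
    return loader_cls(config).process_images(images)


print(describe_images({"vision": {"chosen_model": "Dummy"}}, ["example.jpg"]))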