@@ -15,7 +15,7 @@
 from tqdm import tqdm
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
-    LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
+    LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
 )

 from langchain_community.docstore.document import Document
@@ -33,19 +33,8 @@
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 logging.getLogger().setLevel(logging.WARNING)

-# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
-# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
-# logging.getLogger("transformers").setLevel(logging.ERROR)
-# logging.getLogger("transformers").setLevel(logging.WARNING)
-# logging.getLogger("transformers").setLevel(logging.INFO)
-# logging.getLogger("transformers").setLevel(logging.DEBUG)
-
 ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']

-current_directory = Path(__file__).parent
-CACHE_DIR = current_directory / "models" / "vision"
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-
 current_directory = Path(__file__).parent
 VISION_DIR = current_directory / "models" / "vision"
 VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -76,20 +65,10 @@ def choose_image_loader():

     chosen_model = config["vision"]["chosen_model"]

-    if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b', ]:
-        loader_func = loader_llava(config).process_images
-    elif chosen_model == 'Cogvlm':
-        loader_func = loader_cogvlm(config).process_images
-    elif chosen_model == 'Moondream2':
+    if chosen_model == 'Moondream2':
         loader_func = loader_moondream(config).process_images
     elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
         loader_func = loader_florence2(config).process_images
-    elif chosen_model == 'Phi-3-vision-128k-instruct':
-        loader_func = loader_phi3vision(config).process_images
-    elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
-        loader_func = loader_minicpm_llama3v(config).process_images
-    elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
-        loader_func = loader_llava_next(config).process_images
     else:
         my_cprint("No valid image model specified in config.yaml", "red")
         return []
@@ -164,125 +143,6 @@ def process_images(self):
     def process_single_image(self, raw_image):
         raise NotImplementedError("Subclasses must implement process_single_image method")

-class loader_cogvlm(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        model_name = 'THUDM/cogvlm-chat-hf'
-        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)
-
-        tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            quantization_config=quantization_config,
-            torch_dtype=TORCH_TYPE,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
-
-        my_cprint(f"Cogvlm vision model loaded into memory...", "green")
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-        prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
-        inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
-        inputs = {
-            'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
-            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
-            'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
-            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
-        }
-
-        gen_kwargs = {"max_length": 2048, "do_sample": False}
-        output = self.model.generate(**inputs, **gen_kwargs)
-        output = output[:, inputs['input_ids'].shape[1]:]
-        model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
-        return model_response
-
-class loader_llava(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-
-        model_info = VISION_MODELS[chosen_model]
-        model_id = model_info['repo_id']
-        precision = model_info['precision']
-        save_dir = model_info["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-
-        model = LlavaForConditionalGeneration.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-
-        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
-
-        processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
-        inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
-        inputs = inputs.to(torch.float32)
-
-        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
-        full_response = self.processor.decode(output[0][2:], skip_special_tokens=True, do_sample=False)
-        model_response = full_response.split("ASSISTANT: ")[-1]
-        return model_response
-
-
-class loader_llava_next(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-
-        model_info = VISION_MODELS[chosen_model]
-        model_id = model_info['repo_id']
-        precision = model_info['precision']
-        save_dir = model_info["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-
-        model = LlavaNextForConditionalGeneration.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-
-        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
-
-        processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
-        prompt = f"USER: <image>\n{user_prompt} ASSISTANT:"
-        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device)
-
-        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
-
-        response = self.processor.decode(output[0], skip_special_tokens=True)  # possibly adjust to "full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)" or similar if the output is inexplicably preceded by special tokens
-        model_response = response.split("ASSISTANT:")[-1].strip()
-
-        return model_response
-

 class loader_moondream(BaseLoader):
     def initialize_model_and_tokenizer(self):
@@ -320,14 +180,11 @@ def __init__(self, config):

     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
+        model_id = VISION_MODELS[chosen_model]['repo_id']
+        cache_dir = VISION_DIR

-        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
-        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)

         device_type, precision_type = self.get_device_and_precision()

@@ -371,173 +228,3 @@ def process_single_image(self, raw_image):
         parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height))

         return parsed_answer['<MORE_DETAILED_CAPTION>']
-
-
-class loader_phi3vision(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_quant_type="nf4"
-        )
-
-        # microsoft/Phi-3-vision-128k-instruct
-        model = AutoModelForCausalLM.from_pretrained(
-            repo_id,
-            device_map="cuda",
-            trust_remote_code=True,
-            torch_dtype="auto",
-            attn_implementation='flash_attention_2',
-            quantization_config=quantization_config,
-            cache_dir=cache_dir
-        )
-
-        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
-
-        my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green")
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = f"""<|user|>
-<|image_1|>
-Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|>
-<|assistant|>
-"""
-        inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device)
-
-        generation_args = {
-            "max_new_tokens": 500,
-            "temperature": None,
-            "do_sample": False,
-        }
-
-        generate_ids = self.model.generate(
-            **inputs,
-            eos_token_id=self.processor.tokenizer.eos_token_id,
-            **generation_args
-        )
-
-        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-        response = self.processor.batch_decode(
-            generate_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-
-        return response
-
-
-class loader_minicpm_llama3v(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        warnings.filterwarnings("ignore", category=UserWarning)
-
-        # openbmb/MiniCPM-Llama3-V-2_5-int4
-        model = AutoModel.from_pretrained(
-            repo_id,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-        tokenizer = AutoTokenizer.from_pretrained(
-            repo_id,
-            trust_remote_code=True,
-            cache_dir=cache_dir
-        )
-        model.eval()
-
-        my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")
-
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
-        msgs = [{'role': 'user', 'content': question}]
-
-        response = self.model.chat(
-            image=raw_image,
-            msgs=msgs,
-            context=None,
-            tokenizer=self.tokenizer,
-            sampling=False,
-            temperature=None
-        )
-
-        if isinstance(response, tuple) and len(response) == 3:
-            res, context, _ = response
-        else:
-            res = response
-
-        return res
-
-'''
-class loader_bunny(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        transformers.logging.set_verbosity_error()
-        transformers.logging.disable_progress_bar()
-        warnings.filterwarnings('ignore')
-
-        #BAAI/Bunny-v1_1-4B
-        # BAAI/Bunny-v1_1-Llama-3-8B-V
-
-        chosen_model = self.config['vision']['chosen_model']
-        model_path = VISION_MODELS[chosen_model]["model_path"]
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4"
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            torch_dtype=torch.float16,
-            device_map='auto',
-            trust_remote_code=True,
-            quantization_config=quantization_config
-        )
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True
-        )
-
-        my_cprint(f"Bunny vision model loaded into memory...", "green")
-
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = "Describe what this image depicts in as much detail as possible."
-        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n {prompt} ASSISTANT:"
-
-        text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
-
-        image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
-
-        output_ids = self.model.generate(
-            input_ids,
-            images=image_tensor,
-            max_length=4096,
-            use_cache=True,
-            repetition_penalty=1.0
-        )[0].to(self.device)
-
-        result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-        return result
-'''
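
After this change, choose_image_loader() only dispatches on the Moondream2 and Florence-2 entries read from config.yaml. The snippet below is an illustrative sketch, not part of the commit: the "vision"/"chosen_model" keys and the accepted model names come from the diff above, while the use of PyYAML and the config.yaml path are assumptions made for the example.

# Illustrative sketch only (not part of this commit): a pre-flight check of
# config.yaml against the models the trimmed dispatcher still accepts.
import yaml
from pathlib import Path

# Model names kept by choose_image_loader() after this commit.
SUPPORTED_VISION_MODELS = {"Moondream2", "Florence-2-large", "Florence-2-base"}

# Hypothetical location of the parsed config; adjust to the project layout.
config = yaml.safe_load(Path("config.yaml").read_text())
chosen_model = config["vision"]["chosen_model"]

if chosen_model not in SUPPORTED_VISION_MODELS:
    raise ValueError(
        f"'{chosen_model}' is no longer dispatchable; choose one of {sorted(SUPPORTED_VISION_MODELS)}"
    )
print(f"{chosen_model} will be routed by choose_image_loader()")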