Commit e84bfd9

fix moondream2

1 parent bb5a9b6 commit e84bfd9

File tree: 1 file changed (+320 -7 lines changed)

src/module_process_images.py
Lines changed: 320 additions & 7 deletions
@@ -15,7 +15,7 @@
 from tqdm import tqdm
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
-    LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
+    LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
 )
 
 from langchain_community.docstore.document import Document
@@ -33,8 +33,19 @@
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 logging.getLogger().setLevel(logging.WARNING)
 
+# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
+# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
+# logging.getLogger("transformers").setLevel(logging.ERROR)
+# logging.getLogger("transformers").setLevel(logging.WARNING)
+# logging.getLogger("transformers").setLevel(logging.INFO)
+# logging.getLogger("transformers").setLevel(logging.DEBUG)
+
 ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
 
+current_directory = Path(__file__).parent
+CACHE_DIR = current_directory / "models" / "vision"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
 current_directory = Path(__file__).parent
 VISION_DIR = current_directory / "models" / "vision"
 VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -65,10 +76,20 @@ def choose_image_loader():
 
     chosen_model = config["vision"]["chosen_model"]
 
-    if chosen_model == 'Moondream2':
+    if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b', ]:
+        loader_func = loader_llava(config).process_images
+    elif chosen_model == 'Cogvlm':
+        loader_func = loader_cogvlm(config).process_images
+    elif chosen_model == 'Moondream2':
         loader_func = loader_moondream(config).process_images
     elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
         loader_func = loader_florence2(config).process_images
+    elif chosen_model == 'Phi-3-vision-128k-instruct':
+        loader_func = loader_phi3vision(config).process_images
+    elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
+        loader_func = loader_minicpm_llama3v(config).process_images
+    elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
+        loader_func = loader_llava_next(config).process_images
     else:
         my_cprint("No valid image model specified in config.yaml", "red")
         return []
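
Note (illustrative, not part of the commit): choose_image_loader() above dispatches purely by string-matching config["vision"]["chosen_model"] against the literal model names shown. A minimal sketch of the config shape this dispatch assumes once config.yaml is loaded into a plain dict; only the vision/chosen_model key is confirmed by the diff, the surrounding structure is an assumption:

    # Hypothetical illustration; only config["vision"]["chosen_model"] appears in the diff.
    config = {
        "vision": {
            "chosen_model": "Llava 1.6 Vicuna - 7b",  # must exactly match one of the names checked in choose_image_loader()
        },
    }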
@@ -143,6 +164,125 @@ def process_images(self):
     def process_single_image(self, raw_image):
         raise NotImplementedError("Subclasses must implement process_single_image method")
 
+class loader_cogvlm(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        model_name = 'THUDM/cogvlm-chat-hf'
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)
+
+        tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=quantization_config,
+            torch_dtype=TORCH_TYPE,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Cogvlm vision model loaded into memory...", "green")
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+        inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
+        inputs = {
+            'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
+            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
+            'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
+            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
+        }
+
+        gen_kwargs = {"max_length": 2048, "do_sample": False}
+        output = self.model.generate(**inputs, **gen_kwargs)
+        output = output[:, inputs['input_ids'].shape[1]:]
+        model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
+        return model_response
+
+class loader_llava(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+
+        model = LlavaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
+        inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
+        inputs = inputs.to(torch.float32)
+
+        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
+        full_response = self.processor.decode(output[0][2:], skip_special_tokens=True, do_sample=False)
+        model_response = full_response.split("ASSISTANT: ")[-1]
+        return model_response
+
+
+class loader_llava_next(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+
+        model_info = VISION_MODELS[chosen_model]
+        model_id = model_info['repo_id']
+        precision = model_info['precision']
+        save_dir = model_info["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        model = LlavaNextForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+
+        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
+
+        processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
+        prompt = f"USER: <image>\n{user_prompt} ASSISTANT:"
+        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device)
+
+        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
+
+        response = self.processor.decode(output[0], skip_special_tokens=True)  # possibly adjust to "full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)" or similar if the output is unexpectedly preceded by special tokens
+        model_response = response.split("ASSISTANT:")[-1].strip()
+
+        return model_response
+
 
 class loader_moondream(BaseLoader):
     def initialize_model_and_tokenizer(self):
@@ -159,7 +299,7 @@ def initialize_model_and_tokenizer(self):
 
         my_cprint(f"Moondream2 vision model loaded into memory...", "green")
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-05-20", cache_dir=cache_dir)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-07-23", cache_dir=cache_dir)
 
         return model, tokenizer, None
 
@@ -180,11 +320,14 @@ def __init__(self, config):
 
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
-        model_id = VISION_MODELS[chosen_model]['repo_id']
-        cache_dir=VISION_DIR
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
 
-        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
 
         device_type, precision_type = self.get_device_and_precision()
 
@@ -228,3 +371,173 @@ def process_single_image(self, raw_image):
         parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height))
 
         return parsed_answer['<MORE_DETAILED_CAPTION>']
+
+
+class loader_phi3vision(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # microsoft/Phi-3-vision-128k-instruct
+        model = AutoModelForCausalLM.from_pretrained(
+            repo_id,
+            device_map="cuda",
+            trust_remote_code=True,
+            torch_dtype="auto",
+            attn_implementation='flash_attention_2',
+            quantization_config=quantization_config,
+            cache_dir=cache_dir
+        )
+
+        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+
+        my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green")
+
+        return model, None, processor
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = f"""<|user|>
+<|image_1|>
+Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|>
+<|assistant|>
+"""
+        inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device)
+
+        generation_args = {
+            "max_new_tokens": 500,
+            "temperature": None,
+            "do_sample": False,
+        }
+
+        generate_ids = self.model.generate(
+            **inputs,
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            **generation_args
+        )
+
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = self.processor.batch_decode(
+            generate_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+
+        return response
+
+
+class loader_minicpm_llama3v(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        chosen_model = self.config['vision']['chosen_model']
+        repo_id = VISION_MODELS[chosen_model]["repo_id"]
+        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
+        cache_dir = CACHE_DIR / save_dir
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        warnings.filterwarnings("ignore", category=UserWarning)
+
+        # openbmb/MiniCPM-Llama3-V-2_5-int4
+        model = AutoModel.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            cache_dir=cache_dir
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            repo_id,
+            trust_remote_code=True,
+            cache_dir=cache_dir
+        )
+        model.eval()
+
+        my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
+        msgs = [{'role': 'user', 'content': question}]
+
+        response = self.model.chat(
+            image=raw_image,
+            msgs=msgs,
+            context=None,
+            tokenizer=self.tokenizer,
+            sampling=False,
+            temperature=None
+        )
+
+        if isinstance(response, tuple) and len(response) == 3:
+            res, context, _ = response
+        else:
+            res = response
+
+        return res
+
+'''
+class loader_bunny(BaseLoader):
+    def initialize_model_and_tokenizer(self):
+        transformers.logging.set_verbosity_error()
+        transformers.logging.disable_progress_bar()
+        warnings.filterwarnings('ignore')
+
+        # BAAI/Bunny-v1_1-4B
+        # BAAI/Bunny-v1_1-Llama-3-8B-V
+
+        chosen_model = self.config['vision']['chosen_model']
+        model_path = VISION_MODELS[chosen_model]["model_path"]
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.float16,
+            device_map='auto',
+            trust_remote_code=True,
+            quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True
+        )
+
+        my_cprint(f"Bunny vision model loaded into memory...", "green")
+
+        return model, tokenizer, None
+
+    @torch.inference_mode()
+    def process_single_image(self, raw_image):
+        prompt = "Describe what this image depicts in as much detail as possible."
+        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
+
+        text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
+
+        image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
+
+        output_ids = self.model.generate(
+            input_ids,
+            images=image_tensor,
+            max_length=4096,
+            use_cache=True,
+            repetition_penalty=1.0
+        )[0].to(self.device)
+
+        result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        return result
+'''
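
Note (illustrative, not part of the commit): the new loaders all look up their model through the VISION_MODELS mapping, reading the repo_id, precision, and cache_dir keys (and model_path in the commented-out Bunny loader), then place downloads under CACHE_DIR / cache_dir. A minimal sketch of what one entry presumably looks like; the key names come from the reads above, while the concrete values are assumptions:

    # Hypothetical illustration of a VISION_MODELS entry as consumed by loader_llava_next;
    # the actual dictionary is defined elsewhere in the repository.
    VISION_MODELS = {
        "Llava 1.6 Vicuna - 7b": {
            "repo_id": "llava-hf/llava-v1.6-vicuna-7b-hf",  # assumed repo id; the diff does not name it
            "precision": "4-bit",                           # read by the loader but only used as metadata here
            "cache_dir": "llava-v1.6-vicuna-7b",            # subfolder created under models/vision/
        },
    }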
