Commit b81f681

fix moondream2
1 parent e84bfd9 commit b81f681

File tree

1 file changed: +6 -319 lines changed


src/module_process_images.py

Lines changed: 6 additions & 319 deletions
@@ -15,7 +15,7 @@
 from tqdm import tqdm
 from transformers import (
     AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, BlipForConditionalGeneration, BlipProcessor,
-    LlamaTokenizer, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
+    LlamaTokenizer, LlavaForConditionalGeneration, BitsAndBytesConfig
 )
 
 from langchain_community.docstore.document import Document
@@ -33,19 +33,8 @@
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 logging.getLogger().setLevel(logging.WARNING)
 
-# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
-# # logging.getLogger("transformers").setLevel(logging.CRITICAL)
-# logging.getLogger("transformers").setLevel(logging.ERROR)
-# logging.getLogger("transformers").setLevel(logging.WARNING)
-# logging.getLogger("transformers").setLevel(logging.INFO)
-# logging.getLogger("transformers").setLevel(logging.DEBUG)
-
 ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff']
 
-current_directory = Path(__file__).parent
-CACHE_DIR = current_directory / "models" / "vision"
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-
 current_directory = Path(__file__).parent
 VISION_DIR = current_directory / "models" / "vision"
 VISION_DIR.mkdir(parents=True, exist_ok=True)
@@ -76,20 +65,10 @@ def choose_image_loader():
 
     chosen_model = config["vision"]["chosen_model"]
 
-    if chosen_model in ['llava 1.5 - 7b', 'bakllava 1.5 - 7b', 'llava 1.5 - 13b', ]:
-        loader_func = loader_llava(config).process_images
-    elif chosen_model == 'Cogvlm':
-        loader_func = loader_cogvlm(config).process_images
-    elif chosen_model == 'Moondream2':
+    if chosen_model == 'Moondream2':
         loader_func = loader_moondream(config).process_images
     elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
         loader_func = loader_florence2(config).process_images
-    elif chosen_model == 'Phi-3-vision-128k-instruct':
-        loader_func = loader_phi3vision(config).process_images
-    elif chosen_model == 'MiniCPM-Llama3-V-2_5-int4':
-        loader_func = loader_minicpm_llama3v(config).process_images
-    elif chosen_model in ['Llava 1.6 Vicuna - 7b', 'Llava 1.6 Vicuna - 13b']:
-        loader_func = loader_llava_next(config).process_images
     else:
         my_cprint("No valid image model specified in config.yaml", "red")
        return []
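For reference, the model dispatch in choose_image_loader reads as follows after this commit (a minimal sketch of just the post-change branch, assuming the rest of the function body and the my_cprint helper from this module):

    chosen_model = config["vision"]["chosen_model"]

    # only the two remaining vision models are dispatched; everything else is rejected
    if chosen_model == 'Moondream2':
        loader_func = loader_moondream(config).process_images
    elif chosen_model in ["Florence-2-large", "Florence-2-base"]:
        loader_func = loader_florence2(config).process_images
    else:
        my_cprint("No valid image model specified in config.yaml", "red")
        return []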
@@ -164,125 +143,6 @@ def process_images(self):
     def process_single_image(self, raw_image):
         raise NotImplementedError("Subclasses must implement process_single_image method")
 
-class loader_cogvlm(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        model_name = 'THUDM/cogvlm-chat-hf'
-        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE)
-
-        tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            quantization_config=quantization_config,
-            torch_dtype=TORCH_TYPE,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True
-        )
-
-        my_cprint(f"Cogvlm vision model loaded into memory...", "green")
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
-        prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
-        inputs = self.model.build_conversation_input_ids(self.tokenizer, query=prompt, history=[], images=[raw_image])
-        inputs = {
-            'input_ids': inputs['input_ids'].unsqueeze(0).to(self.device),
-            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(self.device),
-            'attention_mask': inputs['attention_mask'].unsqueeze(0).to(self.device),
-            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
-        }
-
-        gen_kwargs = {"max_length": 2048, "do_sample": False}
-        output = self.model.generate(**inputs, **gen_kwargs)
-        output = output[:, inputs['input_ids'].shape[1]:]
-        model_response = self.tokenizer.decode(output[0], skip_special_tokens=True).split("ASSISTANT: ")[-1]
-        return model_response
-
-class loader_llava(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-
-        model_info = VISION_MODELS[chosen_model]
-        model_id = model_info['repo_id']
-        precision = model_info['precision']
-        save_dir = model_info["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-
-        model = LlavaForConditionalGeneration.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-
-        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
-
-        processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = "USER: <image>\nDescribe this image in as much detail as possible while still trying to be succinct and not repeat yourself.\nASSISTANT:"
-        inputs = self.processor(prompt, raw_image, return_tensors='pt').to(self.device)
-        inputs = inputs.to(torch.float32)
-
-        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
-        full_response = self.processor.decode(output[0][2:], skip_special_tokens=True, do_sample=False)
-        model_response = full_response.split("ASSISTANT: ")[-1]
-        return model_response
-
-
-class loader_llava_next(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-
-        model_info = VISION_MODELS[chosen_model]
-        model_id = model_info['repo_id']
-        precision = model_info['precision']
-        save_dir = model_info["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-        )
-
-        model = LlavaNextForConditionalGeneration.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-
-        my_cprint(f"{chosen_model} vision model loaded into memory...", "green")
-
-        processor = LlavaNextProcessor.from_pretrained(model_id, cache_dir=cache_dir)
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        user_prompt = "Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself."
-        prompt = f"USER: <image>\n{user_prompt} ASSISTANT:"
-        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(self.device)
-
-        output = self.model.generate(**inputs, max_new_tokens=512, do_sample=False)
-
-        response = self.processor.decode(output[0], skip_special_tokens=True)  # possibly adjust to "full_response = self.processor.decode(output[0][2:], skip_special_tokens=True)" or something similar if the output is inexplicably preceded by special tokens
-        model_response = response.split("ASSISTANT:")[-1].strip()
-
-        return model_response
-
 
 class loader_moondream(BaseLoader):
     def initialize_model_and_tokenizer(self):
@@ -320,14 +180,11 @@ def __init__(self, config):
 
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
+        model_id = VISION_MODELS[chosen_model]['repo_id']
+        cache_dir = VISION_DIR
 
-        model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
-        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
 
         device_type, precision_type = self.get_device_and_precision()
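Net effect of the hunk above: the loader now takes the repo id straight from VISION_MODELS and caches weights under the shared VISION_DIR created at module import, instead of building a per-model CACHE_DIR subfolder (CACHE_DIR itself is removed earlier in this commit). A minimal sketch of the resulting initialization, assuming the module-level VISION_DIR and VISION_MODELS definitions from this file:

    model_id = VISION_MODELS[chosen_model]['repo_id']
    cache_dir = VISION_DIR  # shared models/vision directory, created when the module is imported

    # both calls rely on the model repo's custom code, hence trust_remote_code=True
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_dir)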

@@ -371,173 +228,3 @@ def process_single_image(self, raw_image):
         parsed_answer = self.processor.post_process_generation(generated_text, task=prompt, image_size=(raw_image.width, raw_image.height))
 
         return parsed_answer['<MORE_DETAILED_CAPTION>']
-
-
-class loader_phi3vision(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_quant_type="nf4"
-        )
-
-        # microsoft/Phi-3-vision-128k-instruct
-        model = AutoModelForCausalLM.from_pretrained(
-            repo_id,
-            device_map="cuda",
-            trust_remote_code=True,
-            torch_dtype="auto",
-            attn_implementation='flash_attention_2',
-            quantization_config=quantization_config,
-            cache_dir=cache_dir
-        )
-
-        processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, cache_dir=cache_dir)
-
-        my_cprint(f"Microsoft-Phi-3-vision model loaded into memory...", "green")
-
-        return model, None, processor
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = f"""<|user|>
-<|image_1|>
-Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.<|end|>
-<|assistant|>
-"""
-        inputs = self.processor(prompt, [raw_image], return_tensors="pt").to(self.device)
-
-        generation_args = {
-            "max_new_tokens": 500,
-            "temperature": None,
-            "do_sample": False,
-        }
-
-        generate_ids = self.model.generate(
-            **inputs,
-            eos_token_id=self.processor.tokenizer.eos_token_id,
-            **generation_args
-        )
-
-        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-        response = self.processor.batch_decode(
-            generate_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-
-        return response
-
-
-class loader_minicpm_llama3v(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        chosen_model = self.config['vision']['chosen_model']
-        repo_id = VISION_MODELS[chosen_model]["repo_id"]
-        save_dir = VISION_MODELS[chosen_model]["cache_dir"]
-        cache_dir = CACHE_DIR / save_dir
-        cache_dir.mkdir(parents=True, exist_ok=True)
-
-        warnings.filterwarnings("ignore", category=UserWarning)
-
-        # openbmb/MiniCPM-Llama3-V-2_5-int4
-        model = AutoModel.from_pretrained(
-            repo_id,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            cache_dir=cache_dir
-        )
-        tokenizer = AutoTokenizer.from_pretrained(
-            repo_id,
-            trust_remote_code=True,
-            cache_dir=cache_dir
-        )
-        model.eval()
-
-        my_cprint(f"MiniCPM-Llama3-V vision model loaded into memory...", "green")
-
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        question = 'Describe this image in as much detail as possible while still trying to be succinct and not repeat yourself.'
-        msgs = [{'role': 'user', 'content': question}]
-
-        response = self.model.chat(
-            image=raw_image,
-            msgs=msgs,
-            context=None,
-            tokenizer=self.tokenizer,
-            sampling=False,
-            temperature=None
-        )
-
-        if isinstance(response, tuple) and len(response) == 3:
-            res, context, _ = response
-        else:
-            res = response
-
-        return res
-
-'''
-class loader_bunny(BaseLoader):
-    def initialize_model_and_tokenizer(self):
-        transformers.logging.set_verbosity_error()
-        transformers.logging.disable_progress_bar()
-        warnings.filterwarnings('ignore')
-
-        # BAAI/Bunny-v1_1-4B
-        # BAAI/Bunny-v1_1-Llama-3-8B-V
-
-        chosen_model = self.config['vision']['chosen_model']
-        model_path = VISION_MODELS[chosen_model]["model_path"]
-
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_quant_type="nf4"
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            torch_dtype=torch.float16,
-            device_map='auto',
-            trust_remote_code=True,
-            quantization_config=quantization_config
-        )
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True
-        )
-
-        my_cprint(f"Bunny vision model loaded into memory...", "green")
-
-        return model, tokenizer, None
-
-    @torch.inference_mode()
-    def process_single_image(self, raw_image):
-        prompt = "Describe what this image depicts in as much detail as possible."
-        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
-
-        text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(self.device)
-
-        image_tensor = self.model.process_images([raw_image], self.model.config).to(dtype=self.model.dtype, device=self.device)
-
-        output_ids = self.model.generate(
-            input_ids,
-            images=image_tensor,
-            max_length=4096,
-            use_cache=True,
-            repetition_penalty=1.0
-        )[0].to(self.device)
-
-        result = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-        return result
-'''
