@@ -162,6 +162,7 @@ def process_single_image(self, raw_image):
         )
         return parsed.get('<MORE_DETAILED_CAPTION>', generated_text)
 
+
 class loader_glmv4(BaseLoader):
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config['vision']['chosen_model']
@@ -225,6 +226,83 @@ def process_single_image(self, raw_image):
         return ""
 
 
+# class loader_ovis(BaseLoader):
+#     def __init__(self, config):
+#         super().__init__(config)
+#         native = VISION_MODELS[self.config["vision"]["chosen_model"]]["precision"]
+#         # Choose dtype on GPU: bfloat16 if supported, else float16; always float32 on CPU
+#         if self.device == "cuda":
+#             if native in ("float32", "bfloat16") and has_bfloat16_support():
+#                 self.dtype = torch.bfloat16
+#             elif native == "float32":
+#                 self.dtype = torch.float16
+#             else:
+#                 self.dtype = torch.float16
+#         else:
+#             self.dtype = torch.float32
+
+#     def initialize_model_and_tokenizer(self):
+#         chosen_model = self.config["vision"]["chosen_model"]
+#         info = VISION_MODELS[chosen_model]
+
+#         cache_dir = CACHE_DIR / info["cache_dir"]
+#         cache_dir.mkdir(parents=True, exist_ok=True)
+
+#         model = AutoModelForCausalLM.from_pretrained(
+#             info["repo_id"],
+#             torch_dtype=self.dtype,
+#             trust_remote_code=True,
+#             multimodal_max_length=8192,
+#             cache_dir=cache_dir
+#         ).to(self.device)
+#         model.eval()
+
+#         text_tokenizer = model.get_text_tokenizer()
+#         visual_tokenizer = model.get_visual_tokenizer()
+
+#         for module in visual_tokenizer.modules():
+#             if isinstance(module, torch.nn.Linear):
+#                 module.to(device=self.device, dtype=self.dtype)
+
+#         return model, text_tokenizer, visual_tokenizer
+
+#     @torch.inference_mode()
+#     def process_single_image(self, raw_image):
+#         prompt = (
+#             "Explain everything you see in this picture "
+#             "but your response should be no more than one paragraph."
+#         )
+#         query = f"<image>\n{prompt}"
+
+#         _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+#         attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
+
+#         # Batchify and move to the correct device & dtype
+#         input_ids = input_ids.unsqueeze(0).to(self.device)  # [1, seq_len]
+#         attention_mask = attention_mask.unsqueeze(0).to(self.device)  # [1, seq_len]
+#         pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)  # [num_patches,3,14,14]
+#         pixel_values = [pixel_values]  # wrap in list for generate()
+
+#         gen_kwargs = {
+#             "max_new_tokens": 1024,
+#             "do_sample": False,
+#             "pad_token_id": self.tokenizer.pad_token_id,
+#             "eos_token_id": self.tokenizer.eos_token_id,
+#             "use_cache": True,
+#         }
+
+#         # **Pass input_ids positionally** so Ovis2’s generate() sees it as text_input_ids
+#         output_ids = self.model.generate(
+#             input_ids,
+#             pixel_values=pixel_values,
+#             attention_mask=attention_mask,
+#             **gen_kwargs
+#         )[0]
+
+#         description = self.tokenizer.decode(output_ids, skip_special_tokens=True)
+#         return " ".join(line.strip() for line in description.split("\n") if line.strip())
+
+
 class loader_ovis(BaseLoader):
     def __init__(self, config):
         super().__init__(config)
@@ -233,12 +311,18 @@ def __init__(self, config):
         if self.device == "cuda":
             if native in ("float32", "bfloat16") and has_bfloat16_support():
                 self.dtype = torch.bfloat16
+                print(f"OVIS: Selected bfloat16 precision based on native={native}")
             elif native == "float32":
                 self.dtype = torch.float16
+                print(f"OVIS: Selected float16 precision based on native={native}")
             else:
                 self.dtype = torch.float16
+                print(f"OVIS: Selected float16 precision based on native={native}")
         else:
             self.dtype = torch.float32
+            print(f"OVIS: Selected float32 precision for CPU based on native={native}")
+
+        print(f"OVIS: Device={self.device}, Initial dtype selection={self.dtype}")
 
     def initialize_model_and_tokenizer(self):
         chosen_model = self.config["vision"]["chosen_model"]
@@ -247,22 +331,64 @@ def initialize_model_and_tokenizer(self):
         cache_dir = CACHE_DIR / info["cache_dir"]
         cache_dir.mkdir(parents=True, exist_ok=True)
 
+        print(f"OVIS: Loading model with dtype={self.dtype}")
+
         model = AutoModelForCausalLM.from_pretrained(
             info["repo_id"],
             torch_dtype=self.dtype,
             trust_remote_code=True,
             multimodal_max_length=8192,
             cache_dir=cache_dir
         ).to(self.device)
+
+        # Print model layers precision before eval
+        print("OVIS: Model layer precisions after loading:")
+        for name, module in model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  Layer {name}: {module.weight.dtype}")
+
         model.eval()
+
+        # Print model layers precision after eval
+        print("OVIS: Model layer precisions after eval():")
+        for name, module in model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d, torch.nn.LayerNorm)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  Layer {name}: {module.weight.dtype}")
 
         text_tokenizer = model.get_text_tokenizer()
         visual_tokenizer = model.get_visual_tokenizer()
 
+        # Print visual tokenizer layer info before conversion
+        print("OVIS: Visual tokenizer layer precisions before conversion:")
+        for name, module in visual_tokenizer.named_modules():
+            if isinstance(module, torch.nn.Linear):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  VT Layer {name}: {module.weight.dtype}")
+
+        # Count modules before conversion
+        linear_count = sum(1 for module in visual_tokenizer.modules()
+                           if isinstance(module, torch.nn.Linear))
+        print(f"OVIS: Found {linear_count} Linear modules in visual_tokenizer")
+
         for module in visual_tokenizer.modules():
             if isinstance(module, torch.nn.Linear):
+                old_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
                 module.to(device=self.device, dtype=self.dtype)
+                new_dtype = module.weight.dtype if hasattr(module, "weight") else "unknown"
+                print(f"OVIS: Converting module from {old_dtype} to {self.dtype}, result={new_dtype}")
+
+        # Print visual tokenizer layer info after conversion
+        print("OVIS: Visual tokenizer layer precisions after conversion:")
+        for name, module in visual_tokenizer.named_modules():
+            if isinstance(module, torch.nn.Linear):
+                if hasattr(module, "weight") and module.weight is not None:
+                    print(f"  VT Layer {name}: {module.weight.dtype}")
 
+        # Save model for process_single_image
+        self.model = model
+
         return model, text_tokenizer, visual_tokenizer
 
     @torch.inference_mode()
@@ -273,14 +399,29 @@ def process_single_image(self, raw_image):
         )
         query = f"<image>\n{prompt}"
 
+        print("OVIS: Starting image processing")
         _, input_ids, pixel_values = self.model.preprocess_inputs(query, [raw_image])
+        print(f"OVIS: After preprocess_inputs - pixel_values dtype={pixel_values.dtype}")
+
         attention_mask = torch.ne(input_ids, self.tokenizer.pad_token_id)
 
         # Batchify and move to the correct device & dtype
-        input_ids = input_ids.unsqueeze(0).to(self.device)  # [1, seq_len]
-        attention_mask = attention_mask.unsqueeze(0).to(self.device)  # [1, seq_len]
-        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)  # [num_patches,3,14,14]
-        pixel_values = [pixel_values]  # wrap in list for generate()
+        input_ids = input_ids.unsqueeze(0).to(self.device)
+        attention_mask = attention_mask.unsqueeze(0).to(self.device)
+
+        print(f"OVIS: Before pixel_values conversion - dtype={pixel_values.dtype}")
+        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
+        print(f"OVIS: After pixel_values conversion - dtype={pixel_values.dtype}")
+
+        pixel_values = [pixel_values]  # wrap in list for generate()
+
+        # Check model precision during inference
+        print("OVIS: Model layer precisions during inference:")
+        for name, module in self.model.named_modules():
+            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
+                if hasattr(module, "weight") and module.weight is not None:
+                    if name.startswith("transformer") or name.startswith("lm_head"):
+                        print(f"  Inference layer {name}: {module.weight.dtype}")
 
         gen_kwargs = {
             "max_new_tokens": 1024,
@@ -290,7 +431,7 @@ def process_single_image(self, raw_image):
290431 "use_cache" : True ,
291432 }
292433
293- # **Pass input_ids positionally** so Ovis2’ s generate() sees it as text_input_ids
434+ # **Pass input_ids positionally** so Ovis2' s generate() sees it as text_input_ids
294435 output_ids = self .model .generate (
295436 input_ids ,
296437 pixel_values = pixel_values ,