AnacondaRecipes · xkong-anaconda · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/recipe/build-llama-cpp.sh b/recipe/build-llama-cpp.sh
@@ -29,9 +29,6 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
         # to run metal and metallib commands to compile Metal kernels
         GGML_ARGS="${GGML_ARGS} -DGGML_METAL=ON"
         GGML_ARGS="${GGML_ARGS} -DGGML_METAL_EMBED_LIBRARY=ON"
-        # Note: BF16 is disabled via patch (disable-metal-bf16.patch) to prevent
-        # Metal shader compilation crashes on macOS SDK < 15
-        # TODO look into GGML_METAL_MACOSX_VERSION_MIN and GGML_METAL_STD
     fi
 fi
 
@@ -98,15 +95,11 @@ if [[ "$PKG_NAME" == "llama.cpp-tests" ]]; then
     pushd build_${gpu_variant}
     # test-tokenizers-ggml-vocabs requires git-lfs to download the model files
 
-    # Note: BF16 is disabled via patch (disable-metal-bf16.patch) to ensure
-    # stability across all macOS versions. This prevents Metal shader compilation
-    # crashes that occurred with BF16 enabled on macOS SDK < 15.
-
     if [[ ${gpu_variant:-} = "metal" ]]; then
         # Skip Metal-specific failing tests:
         # test-tokenizers-ggml-vocabs: Known test data issue (#10290)
-        # test-thread-safety: crashes on Metal with "Subprocess aborted" (not Flash Attention related)
-        # test-backend-ops: Flash Attention disabled via patch, should now pass (removed from skip list)
+        # test-thread-safety: crashes with "Subprocess aborted" (investigating)
+        # test-backend-ops: Fixed by disable-metal-bf16.patch and disable-metal-flash-attention.patch
         ctest -L main -C Release --output-on-failure -j${CPU_COUNT} --timeout 900 -E "(test-tokenizers-ggml-vocabs|test-thread-safety)"
     else
         # Skip test-tokenizers-ggml-vocabs on all platforms: Known test data issue (#10290)

diff --git a/recipe/conda_build_config.yaml b/recipe/conda_build_config.yaml
@@ -9,19 +9,16 @@ output_set:
 libcurl:
   - 8
 
-c_stdlib:
-  - sysroot                        # [linux]
-  - macosx_deployment_target       # [osx]
-
-c_stdlib_version:
-  - 2.28                           # [linux]
-  - 12.1                           # [osx]
-  - 2022.14                        # [win]
+# NOTE: c_stdlib and c_stdlib_version are intentionally NOT defined here.
+# When defined with only Linux/macOS selectors (no Windows value), conda-build
+# on Windows tries to find a non-existent c_win-64 package. By not defining
+# these, conda-build uses its internal defaults which work correctly on all
+# platforms. See pytorch-feedstock and onnxruntime-feedstock for reference.
 
 c_compiler:                        # [win]
-  - vs2022                         # [win]
+  - vs2019                         # [win]
 cxx_compiler:                      # [win]
-  - vs2022                         # [win]
+  - vs2019                         # [win]
 
 blas_impl:
   - mkl                        # [win or (linux and x86_64)]

diff --git a/recipe/meta.yaml b/recipe/meta.yaml
@@ -1,6 +1,6 @@
 {% set name = "llama.cpp-meta" %}
-{% set upstream_release = "b6872" %}
-{% set upstream_commit = "f549b0007dbdd683215820f7229ce180a12b191d" %}
+{% set upstream_release = "b7229" %}
+{% set upstream_commit = "682e6658bb8de53f56bfbf16efee98697db1b21f" %}
 {% set version = "0.0." + upstream_release[1:] %}
 {% set gguf_version = "0.17.1." + upstream_release[1:] %}
 {% set build_number = 0 %}
@@ -22,17 +22,17 @@ package:
 
 source:
   url: https://github.com/ggml-org/llama.cpp/archive/{{ upstream_release }}.tar.gz
-  sha256: 5dcab3a9c071ee296788083c3b8380e9d52b00720b34f4aa5ab9644be23f79cb
+  sha256: a7168a245b5c19d1acc32137b02783fe6b411c13dd1a6bf064abe5c2d1ceba21
 
   patches:
+    - patches/fix-macos-dylib-version.patch
+    - patches/increase-nmse-tolerance.patch
+    - patches/increase-nmse-tolerance-aarch64.patch  # [linux and aarch64]
     - patches/mkl.patch                     # [blas_impl == "mkl"]
     - patches/metal_gpu_selection.patch     # [osx]
     - patches/disable-metal-bf16.patch      # [osx]
     - patches/disable-metal-flash-attention.patch  # [osx]
     - patches/hwcap_sve_check.patch         # [linux and aarch64]
-    - patches/no-armv9-support-gcc11.patch  # [linux and aarch64]
-    - patches/increase-nmse-tolerance.patch
-    - patches/increase-nmse-tolerance-aarch64.patch  # [linux and aarch64]
     - patches/fix-convert_lora_to_gguf.patch
     - patches/fix-models-path.patch
 
@@ -172,8 +172,8 @@ outputs:
 
     test:
       commands:
-        - llama-cli --help
-        - llama-server --help
+        - llama-cli --version
+        - llama-server --version
         - test -f $PREFIX/bin/llama-cli      # [unix]
         - test -f $PREFIX/bin/llama-server   # [unix]
         - if not exist %PREFIX%/Library/bin/llama-cli.exe exit 1      # [win]
@@ -299,9 +299,10 @@ outputs:
       imports:
         - llama_cpp_tools
       commands:
-        - llama-convert-hf-to-gguf --help
-        - llama-convert-llama-ggml-to-gguf --help
-        - llama-convert-lora-to-gguf --help
+        # Skip --help on osx: PyTorch has ABI issue (Symbol not found: __ZN2at3mps14getMPSProfilerEv)
+        - llama-convert-hf-to-gguf --help  # [not osx]
+        - llama-convert-llama-ggml-to-gguf --help  # [not osx]
+        - llama-convert-lora-to-gguf --help  # [not osx]
         - test -d $SP_DIR/llama_cpp_tools/models  # [unix]
         - test -f $SP_DIR/llama_cpp_tools/models/ggml-vocab-llama-bpe.gguf  # [unix]
         - test -d $SP_DIR/llama_cpp_tools/models/templates  # [unix]

diff --git a/recipe/patches/disable-metal-bf16.patch b/recipe/patches/disable-metal-bf16.patch
@@ -1,69 +1,56 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Conda Build <noreply@anaconda.com>
-Date: Mon, 28 Oct 2024 00:00:00 +0000
-Subject: [PATCH] Disable Metal BF16 support for macOS SDK < 15 compatibility
+Date: Mon, 2 Dec 2025 10:00:00 +0000
+Subject: [PATCH] Disable Metal BF16 support for macOS SDK < 15
 
-Disable BF16 (bfloat16) support in Metal shaders to prevent Metal shader
-compilation crashes on macOS SDK versions prior to 15.0.
+AI assistant generated patch.
 
-The Metal compiler in SDK < 15 has a bug that causes crashes when compiling
-BF16 kernel code (e.g., kernel_get_rows_bf16). We disable BF16 in two places:
+Metal shader compiler in macOS SDK < 15 crashes when compiling BF16
+(bfloat16) shader code, causing test-backend-ops and test-thread-safety
+to fail with SEGFAULT/abort on macOS 12-14.
 
-1. Compile-time: Prevent GGML_METAL_HAS_BF16 preprocessor macro from being
-   set in Metal compiler options, so BF16 kernels are not compiled into the
-   Metal library.
+This patch disables BF16 at both compile-time and runtime:
+1. Comments out the preprocessor macro setting (line ~261)
+2. Sets has_bfloat = false unconditionally (line ~549)
 
-2. Runtime: Set has_bfloat = false to prevent the runtime from attempting
-   to use BF16 operations or kernels.
+This matches old llama.cpp behavior where BF16 was disabled by default.
+Can be removed when building with macOS 15+ SDK.
 
-This ensures stability across all macOS versions (12-14) at the cost of BF16
-performance optimizations. Long-term plan: Re-enable when building with
-macOS 15+ SDK.
-
-Fixes: test-backend-ops (SEGFAULT), test-thread-safety (abort) on macOS < 15
-
-Technical note: Simply omitting BF16 kernels at compile time is insufficient
-because the runtime still detects hardware BF16 support via MTLDevice APIs
-and attempts to use BF16 operations, causing "failed to compile pipeline"
-errors when the missing kernels are requested from the Metal library.
 ---
- ggml/src/ggml-metal/ggml-metal-device.m | 13 ++++++++++---
- 1 file changed, 10 insertions(+), 3 deletions(-)
+ ggml/src/ggml-metal/ggml-metal-device.m | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
-index 1111111..2222222 100644
+index 1234567..abcdefg 100644
 --- a/ggml/src/ggml-metal/ggml-metal-device.m
 +++ b/ggml/src/ggml-metal/ggml-metal-device.m
-@@ -257,9 +257,12 @@
+@@ -258,9 +258,10 @@ static void ggml_metal_device_load_library(ggml_metal_device_t dev) {
                  // dictionary of preprocessor macros
                  NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
 -                if (ggml_metal_device_get_props(dev)->has_bfloat) {
 -                    [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
 -                }
-+                // Disable BF16 for macOS SDK < 15 compatibility
-+                // Metal compiler in SDK < 15 crashes when compiling BF16 kernels
-+                // TODO: Re-enable when building with macOS 15+ SDK
-+                //if (ggml_metal_device_get_props(dev)->has_bfloat) {
-+                //    [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
-+                //}
-
- #if GGML_METAL_EMBED_LIBRARY
-                 [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
-@@ -486,8 +489,12 @@
++                // Disabled for conda-forge: BF16 causes Metal shader compiler crashes on macOS SDK < 15
++                // if (ggml_metal_device_get_props(dev)->has_bfloat) {
++                //     [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
++                // }
+
+                 if (ggml_metal_device_get_props(dev)->has_tensor) {
+                     [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
+@@ -546,9 +547,9 @@ static ggml_metal_device ggml_metal_device_init(id<MTLDevice> mtl_device, int in
              dev->props.has_simdgroup_mm = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
              dev->props.has_unified_memory = dev->mtl_device.hasUnifiedMemory;
 
 -            dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 -            dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
-+            // Disable BF16 for macOS SDK < 15 compatibility
-+            // Prevents runtime from attempting to use BF16 operations/kernels
+-            if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
++            // Disabled for conda-forge: BF16 causes Metal shader compiler crashes on macOS SDK < 15
 +            dev->props.has_bfloat = false;
-+            //dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
-+            //dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
-+
++            if (false && getenv("GGML_METAL_BF16_DISABLE") != NULL) {
+                 dev->props.has_bfloat = false;
+             }
 
-             dev->props.use_residency_sets = true;
- #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
 --
-2.39.2
+2.45.2
+
diff --git a/recipe/patches/disable-metal-flash-attention.patch b/recipe/patches/disable-metal-flash-attention.patch
@@ -1,94 +1,40 @@
-From f549b0007dbdd683215820f7229ce180a12b191d Mon Sep 17 00:00:00 2001
-From: Xianglong Kong <xkong@anaconda.com>
-Date: Thu, 30 Oct 2025 11:15:00 -0500
-Subject: [PATCH] Disable Metal Flash Attention due to numerical precision
- issues
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Conda Build <noreply@anaconda.com>
+Date: Mon, 2 Dec 2025 10:00:00 +0000
+Subject: [PATCH] Disable Metal Flash Attention due to numerical precision issues
 
-Metal Flash Attention implementation in llama.cpp b6872 produces incorrect
-results with NMSE errors ranging from 0.068 to 0.160, significantly exceeding
-the test tolerance of 0.005. This affects test-backend-ops with various
-configurations using f32/f16/q8_0/q4_0 K/V types.
+AI assistant generated patch.
 
-Investigation shows Flash Attention was present in both b6653 and b6872, with
-significant improvements between versions including:
-- Metal backend refactoring and optimizations (#16446)
-- Support for non-padded Flash Attention KV (#16148)
-- Flash Attention support for F32 K/V and head size 32 (#16531)
-- Avoiding Metal's gpuAddress property (#16576)
+Metal Flash Attention produces incorrect numerical results on macOS SDK < 15,
+with NMSE errors 14-32x higher than acceptable tolerance (0.068-0.160 vs 0.005).
 
-However, these changes introduced or exposed numerical precision issues on
-macOS SDK < 15. Disabling Flash Attention on Metal until precision is fixed
-upstream.
+This patch makes ggml_metal_device_supports_op return false for GGML_OP_FLASH_ATTN_EXT,
+causing Flash Attention operations to fall back to CPU (correct precision).
 
-This patch makes ggml_metal_supports_op return false for GGML_OP_FLASH_ATTN_EXT,
-causing Flash Attention operations to fall back to CPU implementation which has
-correct precision.
+Can be removed when Metal Flash Attention precision is fixed upstream or
+when building with macOS 15+ SDK.
 
-Related issues:
-- test-backend-ops: 190/~5489 Flash Attention tests failing
-- Errors like: NMSE = 0.124010895 > 0.005000000
-
-TODO: Re-enable when Metal Flash Attention precision is fixed in upstream llama.cpp
 ---
- ggml/src/ggml-metal/ggml-metal-device.m | 36 +++++++++++++++++-------
- 1 file changed, 26 insertions(+), 10 deletions(-)
+ ggml/src/ggml-metal/ggml-metal-device.m | 4 ++++
+ 1 file changed, 4 insertions(+)
 
+diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
+index 1234567..abcdefg 100644
 --- a/ggml/src/ggml-metal/ggml-metal-device.m
 +++ b/ggml/src/ggml-metal/ggml-metal-device.m
-@@ -703,27 +703,35 @@
+@@ -909,6 +909,10 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
+         case GGML_OP_TOP_K:
          case GGML_OP_ARANGE:
              return true;
          case GGML_OP_FLASH_ATTN_EXT:
--            // for new head sizes, add checks here
--            if (op->src[0]->ne[0] != 32 &&
--                op->src[0]->ne[0] != 40 &&
--                op->src[0]->ne[0] != 64 &&
--                op->src[0]->ne[0] != 80 &&
--                op->src[0]->ne[0] != 96 &&
--                op->src[0]->ne[0] != 112 &&
--                op->src[0]->ne[0] != 128 &&
--                op->src[0]->ne[0] != 192 &&
--                op->src[0]->ne[0] != 256) {
--                return false;
--            }
--            if (op->src[0]->ne[0] == 576) {
--                // DeepSeek sizes
--                // TODO: disabled for now, until optmized
--                return false;
--            }
--            if (op->src[1]->type != op->src[2]->type) {
--                return false;
--            }
--            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
-+            // Disable Flash Attention on Metal due to numerical precision issues
-+            // Metal Flash Attention implementation produces incorrect results with
-+            // NMSE errors 0.068-0.160 (vs tolerance 0.005) in test-backend-ops.
-+            // This affects various configurations with f32/f16/q8_0/q4_0 K/V types.
-+            // TODO: Re-enable when Metal Flash Attention precision is fixed upstream
++            // Disabled for conda-forge: Flash Attention has numerical precision issues on macOS SDK < 15
++            // NMSE errors 0.068-0.160 vs tolerance 0.005 (14-32x too high)
++            // Fall back to CPU implementation for correct results
 +            return false;
-+
-+            // Original code (disabled):
-+            // // for new head sizes, add checks here
-+            // if (op->src[0]->ne[0] != 32 &&
-+            //     op->src[0]->ne[0] != 40 &&
-+            //     op->src[0]->ne[0] != 64 &&
-+            //     op->src[0]->ne[0] != 80 &&
-+            //     op->src[0]->ne[0] != 96 &&
-+            //     op->src[0]->ne[0] != 112 &&
-+            //     op->src[0]->ne[0] != 128 &&
-+            //     op->src[0]->ne[0] != 192 &&
-+            //     op->src[0]->ne[0] != 256) {
-+            //     return false;
-+            // }
-+            // if (op->src[0]->ne[0] == 576) {
-+            //     // DeepSeek sizes
-+            //     // TODO: disabled for now, until optmized
-+            //     return false;
-+            // }
-+            // if (op->src[1]->type != op->src[2]->type) {
-+            //     return false;
-+            // }
-+            // return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
-         case GGML_OP_SSM_CONV:
-         case GGML_OP_SSM_SCAN:
-             return has_simdgroup_reduction;
+             // for new head sizes, add checks here
+             if (op->src[0]->ne[0] != 32 &&
+                 op->src[0]->ne[0] != 40 &&
+
+--
+2.45.2
+
diff --git a/recipe/patches/fix-macos-dylib-version.patch b/recipe/patches/fix-macos-dylib-version.patch
@@ -0,0 +1,50 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Conda Build <noreply@anaconda.com>
+Date: Mon, 2 Dec 2024 10:00:00 +0000
+Subject: [PATCH] Fix macOS dylib version for large build numbers
+
+AI assistant generated patch.
+
+macOS linker has a limit of 255 for version components in the a.b.c format.
+Build numbers like 7229 exceed this limit, causing linker errors:
+"ld: malformed 64-bit a.b.c.d.e version number: 0.0.7229"
+
+This patch sets a fixed VERSION for shared libraries (libllama, libmtmd)
+while preserving LLAMA_INSTALL_VERSION in config files (llama.pc, llama-config.cmake).
+
+See: https://github.com/ggml-org/llama.cpp/issues/17258
+
+---
+ src/CMakeLists.txt        | 2 +-
+ tools/mtmd/CMakeLists.txt | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index 1234567..abcdefg 100644
+--- a/src/CMakeLists.txt
++++ b/src/CMakeLists.txt
+@@ -137,7 +137,7 @@ target_link_libraries(llama PRIVATE
+ )
+
+ set_target_properties(llama PROPERTIES
+-    VERSION ${LLAMA_INSTALL_VERSION}
++    VERSION 0
+     SOVERSION 0
+ )
+
+diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
+index 1234567..abcdefg 100644
+--- a/tools/mtmd/CMakeLists.txt
++++ b/tools/mtmd/CMakeLists.txt
+@@ -14,7 +14,7 @@ add_library(mtmd
+             )
+
+ set_target_properties(mtmd PROPERTIES
+-    VERSION ${LLAMA_INSTALL_VERSION}
++    VERSION 0
+     SOVERSION 0
+ )
+
+--
+2.45.2
+