pytorch
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/aoti/aoti_backend.py‎
Lines changed: 17 additions & 27 deletions b/‎backends/aoti/aoti_backend.py‎
Lines changed: 17 additions & 27 deletions
diff --git a/‎backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm‎
Lines changed: 38 additions & 27 deletions b/‎backends/apple/coreml/runtime/delegate/ETCoreMLStrings.mm‎
Lines changed: 38 additions & 27 deletions
diff --git a/‎backends/apple/metal/metal_backend.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/apple/metal/metal_backend.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/apple/metal/runtime/shims/et_metal.h‎
Lines changed: 16 additions & 0 deletions b/‎backends/apple/metal/runtime/shims/et_metal.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎backends/apple/metal/runtime/shims/et_metal.mm‎
Lines changed: 87 additions & 0 deletions b/‎backends/apple/metal/runtime/shims/et_metal.mm‎
Lines changed: 87 additions & 0 deletions
@@ -381,7 +381,7 @@ jobs:
         elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
           setup_script_args="--target-toolchain zephyr"
           toolchain_prefix=arm-zephyr-eabi-
-          threshold="135768" # 136 KiB
+          threshold="136000" # 136 kB (decimal), i.e. about 132.8 KiB
           toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
         else
           echo "Fail unsupport OS selection ${{ matrix.os }}"
 
@@ -9,7 +9,7 @@
 import typing
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Dict, List, Set
 
 import torch
 from executorch.backends.aoti.passes.replace_view_copy_with_view import (
@@ -70,10 +70,15 @@ def get_aoti_compile_options(
 
     @classmethod
     @abstractmethod
-    def get_custom_passes(cls) -> List[typing.Any]:
+    def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]:
         """Return the list of custom passes to apply after ReplaceViewCopyWithViewPass and before decomposition."""
         pass
 
+    @classmethod
+    def get_extra_aoti_compile_context_manager(cls):
+        """Supply an additional context manager to wrap the aoti_compile stage.
+
+        This base implementation contributes nothing extra: it hands back a
+        fresh ``contextlib.nullcontext()`` each time it is called.
+        """
+        no_op_context = contextlib.nullcontext()
+        return no_op_context
+
     @classmethod
     @contextlib.contextmanager
     def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]):
@@ -91,39 +96,24 @@ def collect_unsupported_fallback_kernels(cls, missing_fallback_kernels: Set[str]
         )
 
         def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels(
-            self,
-            kernel: str,
-            args: list[str],
-            device: str,
-            *,
-            debug_args: Optional[list[str]] = None,
-            debug_handle: Optional[int] = None,
-        ):
+            self, kernel: str, *args: Any, **kwargs: Any
+        ) -> None:
             if kernel not in supported_kernels:
                 missing_fallback_kernels.add(kernel)
 
-            original_generate_c_shim_extern_kernel_call(
-                self,
-                kernel,
-                args,
-                device,
-                debug_args=debug_args,
-                debug_handle=debug_handle,
+            return original_generate_c_shim_extern_kernel_call(
+                self, kernel, *args, **kwargs
             )
 
         def generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels(
-            self,
-            op_overload,
-            raw_args,
-            output_args,
-            raw_outputs,
-        ):
+            self, op_overload: Any, *args: Any, **kwargs: Any
+        ) -> None:
             kernel_name = getattr(op_overload, "_name", str(op_overload))
             if kernel_name not in supported_kernels:
                 missing_fallback_kernels.add(kernel_name)
 
-            original_generate_fallback_kernel_with_runtime_lookup_aot(
-                self, op_overload, raw_args, output_args, raw_outputs
+            return original_generate_fallback_kernel_with_runtime_lookup_aot(
+                self, op_overload, *args, **kwargs
             )
 
         CppWrapperCpu.generate_c_shim_extern_kernel_call = (
@@ -164,7 +154,7 @@ def preprocess(
         ReplaceViewCopyWithViewPass()(device_edge_program.graph_module)
 
         # Apply custom backend-specific passes
-        custom_passes = cls.get_custom_passes()
+        custom_passes = cls.get_custom_passes(compile_specs)
         for custom_pass in custom_passes:
             custom_pass(device_edge_program.graph_module)
 
@@ -189,7 +179,7 @@ def preprocess(
         # Compile with fallback kernel collection
         with cls.collect_unsupported_fallback_kernels(
             missing_fallback_kernels
-        ), torch.no_grad():
+        ), torch.no_grad(), cls.get_extra_aoti_compile_context_manager():
             paths = torch._inductor.aot_compile(
                 edge_program_module, tuple(user_input_placeholders), options=options
             )
 
@@ -101,39 +101,50 @@ + (NSString *)debugSymbolToHandlesKeyName {
 }
 
 + (nullable NSString *)assetsDirectoryPath {
-    static dispatch_once_t onceToken;
-    static NSString *result = nil;
-    dispatch_once(&onceToken, ^{
-        NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSCachesDirectory, NSUserDomainMask, YES);
-        if (paths.count > 0) {
-            result = [paths.lastObject stringByAppendingPathComponent:self.productName];
-        }
-    });
-    
-    return result;
+    #if defined(EXECUTORCH_COREML_ASSETS_DIRECTORY_PATH)
+        return @(EXECUTORCH_COREML_ASSETS_DIRECTORY_PATH);
+    #else
+        static dispatch_once_t onceToken;
+        static NSString *result = nil;
+        dispatch_once(&onceToken, ^{
+            NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSCachesDirectory, NSUserDomainMask, YES);
+            if (paths.count > 0) {
+                result = [paths.lastObject stringByAppendingPathComponent:self.productName];
+            }
+        });
+        
+        return result;
+    #endif
 }
 
 + (nullable NSString *)trashDirectoryPath {
-    static dispatch_once_t onceToken;
-    static NSString *result = nil;
-    dispatch_once(&onceToken, ^{
-        result = [NSTemporaryDirectory() stringByAppendingPathComponent:self.productName];
-    });
-    
-    return result;
+    #if defined(EXECUTORCH_COREML_TRASH_DIRECTORY_PATH)
+        return @(EXECUTORCH_COREML_TRASH_DIRECTORY_PATH);
+    #else
+        static dispatch_once_t onceToken;
+        static NSString *result = nil;
+        dispatch_once(&onceToken, ^{
+            result = [NSTemporaryDirectory() stringByAppendingPathComponent:self.productName];
+        });
+        
+        return result;
+    #endif
 }
 
 + (nullable NSString *)databaseDirectoryPath {
-    static dispatch_once_t onceToken;
-    static NSString *result = nil;
-    dispatch_once(&onceToken, ^{
-        NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, YES);
-        if (paths.count > 0) {
-            result = [paths.lastObject stringByAppendingPathComponent:self.productName];
-        }
-    });
-    
-    return result;
+    #if defined(EXECUTORCH_COREML_DATABASE_DIRECTORY_PATH)
+        return @(EXECUTORCH_COREML_DATABASE_DIRECTORY_PATH);
+    #else
+        static dispatch_once_t onceToken;
+        static NSString *result = nil;
+        dispatch_once(&onceToken, ^{
+            NSArray<NSString *> *paths = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, YES);
+            if (paths.count > 0) {
+                result = [paths.lastObject stringByAppendingPathComponent:self.productName];
+            }
+        });
+        return result;
+    #endif
 }
 
 
 
@@ -42,7 +42,7 @@ def get_decomposition_table(cls) -> Dict[Any, Any]:
         return {}
 
     @classmethod
-    def get_custom_passes(cls) -> List[typing.Any]:
+    def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]:
         """Return Metal-specific passes (currently none)"""
         return []
 
 
@@ -181,6 +181,13 @@ class ETMetalKernelFunction {
   void startEncoding();
   void setArg(unsigned idx, const executorch::runtime::etensor::Tensor& tensor);
   void setArg(unsigned idx, int64_t val);
+  void setArg(unsigned idx, uint32_t val);
+  void setArg(unsigned idx, float val);
+  void setArg(unsigned idx, bool val);
+  void setArg(unsigned idx, const void* data, size_t size);
+
+  // Helper for Metal uint3 struct
+  void setArgUint3(unsigned idx, uint32_t x, uint32_t y, uint32_t z);
 
   void dispatchSingle(uint64_t length);
   void dispatchSingleWithGroupSize(uint64_t length, uint64_t group_size);
@@ -191,6 +198,15 @@ class ETMetalKernelFunction {
       const uint64_t* group_size,
       size_t group_size_size);
 
+  // Dispatch with explicit threadgroup count (not thread count)
+  void dispatchThreadgroups(
+      uint64_t gridX,
+      uint64_t gridY,
+      uint64_t gridZ,
+      uint64_t threadsX,
+      uint64_t threadsY,
+      uint64_t threadsZ);
+
   void runCommandBlock(std::function<void(void)> f);
 
  private:
 
@@ -10,6 +10,7 @@
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
 #import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
 #import <Foundation/Foundation.h>
+#include <simd/simd.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/backends/apple/metal/runtime/shims/et_metal.h>
@@ -377,6 +378,58 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
     ET_LOG(Debug, "ETMetalKernelFunction::setArg: Set int64_t value %lld at index %u", val, idx);
 }
 
+void ETMetalKernelFunction::setArg(unsigned idx, uint32_t val) {
+    // Binds a single uint32_t kernel argument at slot `idx` by handing its
+    // bytes to the active encoder. Without an encoder, only an error is logged.
+    if (encoder_) {
+        const uint32_t* payload = &val;
+        [encoder_ setBytes:payload length:sizeof(val) atIndex:idx];
+        ET_LOG(Debug, "ETMetalKernelFunction::setArg: Set uint32_t value %u at index %u", val, idx);
+        return;
+    }
+
+    ET_LOG(Error, "ETMetalKernelFunction::setArg: No active encoder");
+}
+
+void ETMetalKernelFunction::setArg(unsigned idx, float val) {
+    // Binds a single float kernel argument at slot `idx`.
+    // Requires an active encoder; otherwise logs an error and returns.
+    const bool hasEncoder = encoder_ ? true : false;
+    if (!hasEncoder) {
+        ET_LOG(Error, "ETMetalKernelFunction::setArg: No active encoder");
+        return;
+    }
+
+    const size_t byteCount = sizeof(val);
+    [encoder_ setBytes:&val length:byteCount atIndex:idx];
+    ET_LOG(Debug, "ETMetalKernelFunction::setArg: Set float value %f at index %u", val, idx);
+}
+
+void ETMetalKernelFunction::setArg(unsigned idx, bool val) {
+    // Binds a single bool kernel argument at slot `idx` (sizeof(bool) bytes).
+    // Without an active encoder, only an error is logged.
+    if (encoder_) {
+        [encoder_ setBytes:&val length:sizeof(val) atIndex:idx];
+
+        // Render the flag as text for the debug log.
+        const char* shown = val ? "true" : "false";
+        ET_LOG(Debug, "ETMetalKernelFunction::setArg: Set bool value %s at index %u", shown, idx);
+        return;
+    }
+
+    ET_LOG(Error, "ETMetalKernelFunction::setArg: No active encoder");
+}
+
+void ETMetalKernelFunction::setArg(unsigned idx, const void* data, size_t size) {
+    // Binds `size` raw bytes starting at `data` as the kernel argument at slot
+    // `idx`, via the encoder's setBytes:length:atIndex:.
+    //
+    // Logs an error and binds nothing when there is no active encoder, when
+    // `data` is null, or when `size` is zero.
+    if (!encoder_) {
+        ET_LOG(Error, "ETMetalKernelFunction::setArg: No active encoder");
+        return;
+    }
+
+    // Never hand the encoder a null pointer or an empty range: there is nothing
+    // valid to copy, and the call would otherwise fail inside Metal rather than
+    // with a diagnosable message here.
+    if (data == nullptr || size == 0) {
+        ET_LOG(Error, "ETMetalKernelFunction::setArg: Invalid bytes argument at index %u (data: %p, size: %zu)", idx, data, size);
+        return;
+    }
+
+    // NOTE(review): Apple documents setBytes as intended for small (< 4 KB)
+    // payloads; larger data presumably belongs in a buffer - confirm with callers.
+    [encoder_ setBytes:data length:size atIndex:idx];
+    ET_LOG(Debug, "ETMetalKernelFunction::setArg: Set bytes at index %u (size: %zu)", idx, size);
+}
+
+void ETMetalKernelFunction::setArgUint3(unsigned idx, uint32_t x, uint32_t y, uint32_t z) {
+    // Binds three uint32_t components as one uint3 kernel argument at slot `idx`.
+    // Without an active encoder, only an error is logged.
+    if (encoder_) {
+        // simd_uint3 is used so the bytes handed to the encoder follow the
+        // layout of the Metal shader's uint3 type.
+        const simd_uint3 packed = {x, y, z};
+        [encoder_ setBytes:&packed length:sizeof(packed) atIndex:idx];
+        ET_LOG(Debug, "ETMetalKernelFunction::setArgUint3: Set uint3{%u, %u, %u} at index %u", x, y, z, idx);
+        return;
+    }
+
+    ET_LOG(Error, "ETMetalKernelFunction::setArgUint3: No active encoder");
+}
+
 void ETMetalKernelFunction::dispatchSingle(uint64_t length) {
     if (!encoder_) {
         ET_LOG(Error, "ETMetalKernelFunction::dispatchSingle: No active encoder");
@@ -502,6 +555,40 @@ int metal_copy_memory(void* dst, const void* src, size_t nbytes, bool src_is_dev
 
 }
 
+void ETMetalKernelFunction::dispatchThreadgroups(uint64_t gridX, uint64_t gridY, uint64_t gridZ,
+                                                  uint64_t threadsX, uint64_t threadsY, uint64_t threadsZ) {
+    // Encodes a dispatch of gridX x gridY x gridZ threadgroups, each made of
+    // threadsX x threadsY x threadsZ threads. The grid arguments count
+    // threadgroups, not threads.
+    //
+    // Nothing is encoded, and an error is logged, when:
+    //   - there is no active encoder or no compute pipeline state,
+    //   - any grid or threadgroup dimension is zero,
+    //   - the threads per threadgroup exceed the pipeline's
+    //     maxTotalThreadsPerThreadgroup.
+    if (!encoder_) {
+        ET_LOG(Error, "ETMetalKernelFunction::dispatchThreadgroups: No active encoder");
+        return;
+    }
+
+    if (!cps_) {
+        ET_LOG(Error, "ETMetalKernelFunction::dispatchThreadgroups: No compute pipeline state");
+        return;
+    }
+
+    // An empty dimension makes the dispatch invalid, so reject it here with a
+    // readable message instead of passing it on to the encoder.
+    if (gridX == 0 || gridY == 0 || gridZ == 0 ||
+        threadsX == 0 || threadsY == 0 || threadsZ == 0) {
+        ET_LOG(Error, "ETMetalKernelFunction::dispatchThreadgroups: Grid [%llu, %llu, %llu] and threadgroup [%llu, %llu, %llu] dimensions must all be non-zero",
+               (unsigned long long)gridX, (unsigned long long)gridY, (unsigned long long)gridZ,
+               (unsigned long long)threadsX, (unsigned long long)threadsY, (unsigned long long)threadsZ);
+        return;
+    }
+
+    const auto maxThreadsPerGroup = static_cast<uint64_t>([cps_ maxTotalThreadsPerThreadgroup]);
+
+    // Validate threadsX * threadsY * threadsZ <= maxThreadsPerGroup without
+    // ever forming a product that could wrap around uint64_t. Each division is
+    // by a value already known to be non-zero, and short-circuiting guarantees
+    // threadsX * threadsY is only computed once it is known to fit the limit.
+    const bool exceedsLimit =
+        threadsX > maxThreadsPerGroup ||
+        threadsY > maxThreadsPerGroup / threadsX ||
+        threadsZ > maxThreadsPerGroup / (threadsX * threadsY);
+    if (exceedsLimit) {
+        // Report the dimensions rather than their product, which may not be
+        // representable when the request is far out of range.
+        ET_LOG(Error, "ETMetalKernelFunction::dispatchThreadgroups: Requested threadgroup [%llu, %llu, %llu] exceeds device maximum of %llu total threads per threadgroup",
+               (unsigned long long)threadsX, (unsigned long long)threadsY, (unsigned long long)threadsZ,
+               (unsigned long long)maxThreadsPerGroup);
+        return;
+    }
+
+    MTLSize threadgroupsPerGrid = MTLSizeMake(gridX, gridY, gridZ);
+    MTLSize threadsPerThreadgroup = MTLSizeMake(threadsX, threadsY, threadsZ);
+
+    [encoder_ dispatchThreadgroups:threadgroupsPerGrid threadsPerThreadgroup:threadsPerThreadgroup];
+
+    ET_LOG(Debug, "ETMetalKernelFunction::dispatchThreadgroups: Dispatched grid [%llu, %llu, %llu] with threadgroup [%llu, %llu, %llu]",
+           (unsigned long long)gridX, (unsigned long long)gridY, (unsigned long long)gridZ,
+           (unsigned long long)threadsX, (unsigned long long)threadsY, (unsigned long long)threadsZ);
+}
+
 void ETMetalKernelFunction::runCommandBlock(std::function<void(void)> f) {
     // Use dispatch_sync with the stream's serial queue for thread safety and synchronization
     // This matches PyTorch's approach: dispatch_sync_with_rethrow(getCurrentMPSStream()->queue(), ...)