diff --git a/src/Engines/CpuEngine.cs b/src/Engines/CpuEngine.cs
index fc0745a0e..111eb7500 100644
--- a/src/Engines/CpuEngine.cs
+++ b/src/Engines/CpuEngine.cs
@@ -729,6 +729,76 @@ public Tensor<T> BatchMatMul<T>(Tensor<T> a, Tensor<T> b)
         return result;
     }
 
+    /// <inheritdoc/>
+    public Tensor<T> TensorMatMul<T>(Tensor<T> a, Tensor<T> b)
+    {
+        // Reject missing operands first, then anything that is not rank 2.
+        if (a == null) throw new ArgumentNullException(nameof(a));
+        if (b == null) throw new ArgumentNullException(nameof(b));
+        if (a.Rank != 2 || b.Rank != 2)
+        {
+            throw new ArgumentException(
+                $"TensorMatMul requires 2D tensors. Got ranks {a.Rank} and {b.Rank}.");
+        }
+
+        // a is [rowCount, inner]; b is [innerB, colCount]. The two inner extents must agree.
+        int rowCount = a.Shape[0], inner = a.Shape[1];
+        int innerB = b.Shape[0], colCount = b.Shape[1];
+        if (inner != innerB)
+        {
+            throw new ArgumentException(
+                $"Matrix dimensions incompatible for multiplication. " +
+                $"First tensor has shape [{rowCount}, {inner}], second has shape [{innerB}, {colCount}]. " +
+                $"Inner dimensions must match ({inner} != {innerB}).");
+        }
+
+        var ops = MathHelper.GetNumericOperations<T>();
+        var product = new Tensor<T>(new[] { rowCount, colCount });
+
+        // Each output cell is the dot product of one row of a with one column of b.
+        for (int row = 0; row < rowCount; row++)
+        {
+            for (int col = 0; col < colCount; col++)
+            {
+                T dot = ops.Zero;
+                for (int idx = 0; idx < inner; idx++)
+                {
+                    T term = ops.Multiply(a[row, idx], b[idx, col]);
+                    dot = ops.Add(dot, term);
+                }
+                product[row, col] = dot;
+            }
+        }
+
+        return product;
+    }
+
+    /// <inheritdoc/>
+    public Tensor<T> TensorTranspose<T>(Tensor<T> tensor)
+    {
+        if (tensor == null) throw new ArgumentNullException(nameof(tensor));
+        if (tensor.Rank != 2)
+        {
+            throw new ArgumentException(
+                $"TensorTranspose requires a 2D tensor. Got rank {tensor.Rank}.");
+        }
+
+        // The output swaps the two extents: [height, width] becomes [width, height].
+        int height = tensor.Shape[0], width = tensor.Shape[1];
+        var flipped = new Tensor<T>(new[] { width, height });
+
+        // Visit the source row by row and write each element to its mirrored position.
+        for (int srcRow = 0; srcRow < height; srcRow++)
+        {
+            for (int srcCol = 0; srcCol < width; srcCol++)
+            {
+                flipped[srcCol, srcRow] = tensor[srcRow, srcCol];
+            }
+        }
+
+        return flipped;
+    }
+
     /// <inheritdoc/>
     public Tensor<T> TensorAdd<T>(Tensor<T> a, Tensor<T> b)
     {
diff --git a/src/Engines/GpuEngine.cs b/src/Engines/GpuEngine.cs
index 5def66643..11c659dce 100644
--- a/src/Engines/GpuEngine.cs
+++ b/src/Engines/GpuEngine.cs
@@ -3965,6 +3965,104 @@ private Tensor<double> BatchMatMulGpuDouble(Tensor<double> a, Tensor<double> b)
         }
     }
 
+    /// <inheritdoc/>
+    public Tensor<T> TensorMatMul<T>(Tensor<T> a, Tensor<T> b)
+    {
+        // Validate first so null or non-2D input raises an argument exception instead of failing on the Shape reads below.
+        if (a == null) throw new ArgumentNullException(nameof(a));
+        if (b == null) throw new ArgumentNullException(nameof(b));
+        if (a.Rank != 2 || b.Rank != 2)
+        {
+            throw new ArgumentException($"TensorMatMul requires 2D tensors. Got ranks {a.Rank} and {b.Rank}.");
+        }
+
+        // Adaptive execution (Phase B: US-GPU-004): inputs under the matrix-multiply threshold run on the CPU.
+        if (Math.Max(a.Shape[0], a.Shape[1]) < _thresholds.MatrixMultiply)
+        {
+            return _cpuFallback.TensorMatMul(a, b);
+        }
+
+        // Health and type check (Phase B: US-GPU-006): float and double are routed through MatrixMultiply.
+        if (SupportsGpu && _gpuHealthy)
+        {
+            if (typeof(T) == typeof(float))
+            {
+                var product = MatrixMultiply(ToMatrix((Tensor<float>)(object)a), ToMatrix((Tensor<float>)(object)b));
+                return (Tensor<T>)(object)ToTensor(product);
+            }
+            if (typeof(T) == typeof(double))
+            {
+                var product = MatrixMultiply(ToMatrix((Tensor<double>)(object)a), ToMatrix((Tensor<double>)(object)b));
+                return (Tensor<T>)(object)ToTensor(product);
+            }
+        }
+        // Fallback to CPU for unsupported types or unhealthy GPU
+        return _cpuFallback.TensorMatMul(a, b);
+    }
+
+    /// <inheritdoc/>
+    public Tensor<T> TensorTranspose<T>(Tensor<T> tensor)
+    {
+        // Validate first so null or non-2D input raises an argument exception instead of failing on the Shape reads below.
+        if (tensor == null) throw new ArgumentNullException(nameof(tensor));
+        if (tensor.Rank != 2)
+        {
+            throw new ArgumentException($"TensorTranspose requires a 2D tensor. Got rank {tensor.Rank}.");
+        }
+
+        // Adaptive execution (Phase B: US-GPU-004): the MatrixMultiply threshold doubles as the transpose cutoff.
+        if (Math.Max(tensor.Shape[0], tensor.Shape[1]) < _thresholds.MatrixMultiply)
+        {
+            return _cpuFallback.TensorTranspose(tensor);
+        }
+
+        // Health and type check (Phase B: US-GPU-006): float and double are routed through MatrixTranspose.
+        if (SupportsGpu && _gpuHealthy)
+        {
+            if (typeof(T) == typeof(float))
+            {
+                return (Tensor<T>)(object)ToTensor(MatrixTranspose(ToMatrix((Tensor<float>)(object)tensor)));
+            }
+            if (typeof(T) == typeof(double))
+            {
+                return (Tensor<T>)(object)ToTensor(MatrixTranspose(ToMatrix((Tensor<double>)(object)tensor)));
+            }
+        }
+
+        // Fallback to CPU for unsupported types or unhealthy GPU
+        return _cpuFallback.TensorTranspose(tensor);
+    }
+
+    // Helper methods to convert between Matrix and Tensor for 2D operations
+    private static Matrix<T> ToMatrix<T>(Tensor<T> tensor)
+    {
+        // Element-wise copy of a rank-2 tensor into a new Matrix with the same row/column extents.
+        if (tensor.Rank != 2) throw new ArgumentException("Tensor must be 2D to convert to Matrix");
+
+        var copy = new Matrix<T>(tensor.Shape[0], tensor.Shape[1]);
+        for (int r = 0, rowCount = tensor.Shape[0]; r < rowCount; r++)
+        {
+            for (int c = 0, colCount = tensor.Shape[1]; c < colCount; c++)
+            {
+                copy[r, c] = tensor[r, c];
+            }
+        }
+        return copy;
+    }
+
+    private static Tensor<T> ToTensor<T>(Matrix<T> matrix)
+    {
+        var copy = new Tensor<T>(new[] { matrix.Rows, matrix.Columns }); // new rank-2 tensor shaped [Rows, Columns]
+        for (int r = 0, rowCount = matrix.Rows; r < rowCount; r++)
+        {
+            for (int c = 0, colCount = matrix.Columns; c < colCount; c++)
+            {
+                copy[r, c] = matrix[r, c]; // element-wise copy, same position
+            }
+        }
+        return copy;
+    }
+
     /// <inheritdoc/>
     public Tensor<T> TensorAdd<T>(Tensor<T> a, Tensor<T> b)
     {
diff --git a/src/Engines/IEngine.cs b/src/Engines/IEngine.cs
index b67cc69b2..4504c8c8b 100644
--- a/src/Engines/IEngine.cs
+++ b/src/Engines/IEngine.cs
@@ -691,6 +691,45 @@ public interface IEngine
     /// </remarks>
     Tensor<T> BatchMatMul<T>(Tensor<T> a, Tensor<T> b);
 
+    /// <summary>
+    /// Performs matrix multiplication on two 2D tensors.
+    /// </summary>
+    /// <typeparam name="T">The numeric type of tensor elements.</typeparam>
+    /// <param name="a">The left operand, shape [M, K]; must be 2D.</param>
+    /// <param name="b">The right operand, shape [K, N]; must be 2D.</param>
+    /// <returns>A new tensor of shape [M, N] holding the product.</returns>
+    /// <exception cref="ArgumentException">Thrown when tensors are not 2D or dimensions are incompatible.</exception>
+    /// <remarks>
+    /// <para>
+    /// Matrix multiplication for 2D tensors. This is the tensor equivalent of MatrixMultiply.
+    /// Used in autodiff computation graphs where operations work with Tensor types.
+    /// </para>
+    /// <para>
+    /// For 2D tensors: result[i,j] is the sum over k of a[i,k] * b[k,j], with k ranging over the shared dimension.
+    /// Engines may compute this on the CPU or offload it to a GPU depending on input size.
+    /// </para>
+    /// </remarks>
+    Tensor<T> TensorMatMul<T>(Tensor<T> a, Tensor<T> b);
+
+    /// <summary>
+    /// Transposes a 2D tensor (swaps rows and columns).
+    /// </summary>
+    /// <typeparam name="T">The numeric type of tensor elements.</typeparam>
+    /// <param name="tensor">The input tensor, shape [M, N]; must be 2D.</param>
+    /// <returns>A new tensor of shape [N, M] where result[j, i] equals tensor[i, j].</returns>
+    /// <exception cref="ArgumentException">Thrown when tensor is not 2D.</exception>
+    /// <remarks>
+    /// <para>
+    /// Transpose operation for 2D tensors. This is the tensor equivalent of MatrixTranspose.
+    /// Used in autodiff computation graphs where operations work with Tensor types.
+    /// </para>
+    /// <para>
+    /// Only the two dimensions are swapped; element values are copied unchanged.
+    /// Engines may compute this on the CPU or offload it to a GPU depending on input size.
+    /// </para>
+    /// </remarks>
+    Tensor<T> TensorTranspose<T>(Tensor<T> tensor);
+
     /// <summary>
     /// Adds two tensors element-wise.
     /// </summary>