diff --git a/src/Autodiff/TensorOperations.cs b/src/Autodiff/TensorOperations.cs
index ccc99f43d..60031608c 100644
--- a/src/Autodiff/TensorOperations.cs
+++ b/src/Autodiff/TensorOperations.cs
@@ -1346,6 +1346,336 @@ void BackwardFunction(Tensor<T> gradient)
                 $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
         }
     }
+
+    /// <summary>
+    /// Applies the Sparsemax activation function (sparse softmax via Euclidean projection onto the probability simplex).
+    /// </summary>
+    /// <param name="a">The input node.</param>
+    /// <param name="axis">The axis along which to compute sparsemax. Default is -1 (last axis).</param>
+    /// <returns>A new computation node containing the sparsemax result.</returns>
+    /// <remarks>
+    /// <para>
+    /// Sparsemax is a sparse alternative to softmax that can produce exactly zero probabilities.
+    /// Algorithm: sparsemax(z) = max(z - τ, 0), where τ is the threshold obtained from the Euclidean projection of z onto the probability simplex.
+    /// The backward pass uses ∂sparsemax/∂z = diag(s) - (1/|S|) * (s * s^T), where S is the support set (indices of non-zero outputs) and s is its indicator vector.
+    /// </para>
+    /// <para><b>For Beginners:</b> Sparsemax converts scores to probabilities like softmax, but it can assign exactly zero probability.
+    ///
+    /// For sparsemax:
+    /// - The forward pass projects the input onto the simplex, potentially setting some values to exactly zero
+    /// - The non-zero values sum to 1 (a probability distribution)
+    /// - The backward pass only propagates gradients through non-zero outputs
+    ///
+    /// Sparsemax is useful for:
+    /// - Classification with many classes where only a few are relevant
+    /// - Attention mechanisms with sparse focus
+    /// - Interpretable probability distributions
+    /// </para>
+    /// </remarks>
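+    /// <example>
+    /// A minimal usage sketch (illustrative only; <c>logits</c> is an assumed ComputationNode&lt;T&gt; wrapping a [batch, classes] tensor, not part of this patch):
+    /// <code>
+    /// var probs  = Sparsemax(logits);           // same shape; each row sums to 1, many entries are exactly 0
+    /// var sparse = Sparsemax(logits, axis: -1); // explicit axis, equivalent to the default
+    /// </code>
+    /// </example>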
+    public static ComputationNode<T> Sparsemax<T>(ComputationNode<T> a, int axis = -1)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var shape = a.Value.Shape;
+
+        if (axis < 0)
+            axis = shape.Length + axis;
+
+        if (shape.Length == 2 && axis == 1)
+        {
+            int batchSize = shape[0];
+            int features = shape[1];
+            var result = new Tensor<T>(shape);
+            var supportSets = new List<bool[]>();
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                var row = new T[features];
+                for (int f = 0; f < features; f++)
+                    row[f] = a.Value[b, f];
+
+                var sorted = row.OrderByDescending(x => x).ToArray();
+
+                // Threshold search (Martins & Astudillo, 2016): the support size is the largest k
+                // with 1 + k * z_(k) > cumsum_k over the descending sort; τ = (cumsum_k - 1) / k.
+                int k = 1;
+                T cumSum = numOps.Zero;
+                T supportSum = sorted[0];
+
+                for (int i = 0; i < features; i++)
+                {
+                    cumSum = numOps.Add(cumSum, sorted[i]);
+                    T kT = numOps.FromDouble(i + 1);
+                    T lhs = numOps.Add(numOps.One, numOps.Multiply(kT, sorted[i]));
+
+                    if (numOps.GreaterThan(lhs, cumSum))
+                    {
+                        k = i + 1;
+                        supportSum = cumSum;
+                    }
+                }
+
+                T threshold = numOps.Divide(
+                    numOps.Subtract(supportSum, numOps.One),
+                    numOps.FromDouble(k)
+                );
+
+                var support = new bool[features];
+                for (int f = 0; f < features; f++)
+                {
+                    var shifted = numOps.Subtract(row[f], threshold);
+                    if (numOps.GreaterThan(shifted, numOps.Zero))
+                    {
+                        result[b, f] = shifted;
+                        support[f] = true;
+                    }
+                    else
+                    {
+                        result[b, f] = numOps.Zero;
+                        support[f] = false;
+                    }
+                }
+                supportSets.Add(support);
+            }
+
+            void BackwardFunction(Tensor<T> gradient)
+            {
+                if (a.RequiresGradient)
+                {
+                    var gradA = new Tensor<T>(shape);
+                    for (int b = 0; b < batchSize; b++)
+                    {
+                        var support = supportSets[b];
+                        int supportSize = support.Count(s => s);
+
+                        if (supportSize == 0)
+                            continue;
+
+                        T supportSizeT = numOps.FromDouble(supportSize);
+                        T sumGrad = numOps.Zero;
+
+                        for (int f = 0; f < features; f++)
+                        {
+                            if (support[f])
+                                sumGrad = numOps.Add(sumGrad, gradient[b, f]);
+                        }
+
+                        T avgGrad = numOps.Divide(sumGrad, supportSizeT);
+
+                        // Jacobian-vector product: supported entries get g - mean(g over support), others get 0.
+                        for (int f = 0; f < features; f++)
+                        {
+                            if (support[f])
+                                gradA[b, f] = numOps.Subtract(gradient[b, f], avgGrad);
+                            else
+                                gradA[b, f] = numOps.Zero;
+                        }
+                    }
+
+                    if (a.Gradient == null)
+                    {
+                        a.Gradient = gradA;
+                    }
+                    else
+                    {
+                        var existingGradient = a.Gradient;
+                        if (existingGradient != null)
+                        {
+                            a.Gradient = existingGradient.Add(gradA);
+                        }
+                    }
+                }
+            }
+
+            var node = new ComputationNode<T>(
+                value: result,
+                requiresGradient: a.RequiresGradient,
+                parents: new List<ComputationNode<T>> { a },
+                backwardFunction: BackwardFunction,
+                name: null);
+
+            var tape = GradientTape<T>.Current;
+            if (tape != null && tape.IsRecording)
+                tape.RecordOperation(node);
+
+            return node;
+        }
+        else
+        {
+            throw new NotImplementedException(
+                $"Sparsemax is currently only implemented for 2D tensors along axis=-1. " +
+                $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
+        }
+    }
+
+    /// <summary>
+    /// Applies the Spherical Softmax activation function (L2 normalization followed by softmax).
+    /// </summary>
+    /// <param name="a">The input node.</param>
+    /// <param name="axis">The axis along which to compute spherical softmax. Default is -1 (last axis).</param>
+    /// <returns>A new computation node containing the spherical softmax result.</returns>
+    /// <remarks>
+    /// <para>
+    /// Spherical Softmax normalizes the input to the unit sphere before applying softmax.
+    /// Formula: spherical_softmax(x) = softmax(x / ||x||_2).
+    /// The backward pass chains the gradients of the softmax and L2 normalization steps.
+    /// </para>
+    /// <para><b>For Beginners:</b> Spherical Softmax is softmax with an L2 normalization step for numerical stability.
+    ///
+    /// For spherical softmax:
+    /// - The input is first normalized to unit length (projected onto the unit sphere)
+    /// - Standard softmax is then applied to the normalized values
+    /// - This improves stability when input magnitudes are large
+    ///
+    /// Spherical Softmax is useful for:
+    /// - Classification with varying input magnitudes
+    /// - Attention mechanisms that should be scale invariant
+    /// - Numerically stable probability distributions
+    /// </para>
+    /// </remarks>
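+    /// <example>
+    /// A minimal usage sketch (illustrative only; <c>embeddings</c> is an assumed ComputationNode&lt;T&gt; wrapping a [batch, features] tensor, not part of this patch):
+    /// <code>
+    /// var probs = SphericalSoftmax(embeddings); // rows sum to 1, insensitive to the overall scale of each row
+    /// </code>
+    /// </example>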
+    public static ComputationNode<T> SphericalSoftmax<T>(ComputationNode<T> a, int axis = -1)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var shape = a.Value.Shape;
+
+        if (axis < 0)
+            axis = shape.Length + axis;
+
+        if (shape.Length == 2 && axis == 1)
+        {
+            int batchSize = shape[0];
+            int features = shape[1];
+            var result = new Tensor<T>(shape);
+            var norms = new T[batchSize];
+            var normalized = new Tensor<T>(shape);
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                T sumSquares = numOps.Zero;
+                for (int f = 0; f < features; f++)
+                {
+                    var val = a.Value[b, f];
+                    sumSquares = numOps.Add(sumSquares, numOps.Multiply(val, val));
+                }
+
+                T norm = numOps.Sqrt(sumSquares);
+                norms[b] = norm;
+
+                T epsilon = numOps.FromDouble(1e-8);
+                if (numOps.LessThan(norm, epsilon))
+                    norm = epsilon;
+
+                for (int f = 0; f < features; f++)
+                {
+                    normalized[b, f] = numOps.Divide(a.Value[b, f], norm);
+                }
+
+                // Max-shifted softmax over the normalized row for numerical stability.
+                var maxVal = normalized[b, 0];
+                for (int f = 1; f < features; f++)
+                {
+                    if (numOps.GreaterThan(normalized[b, f], maxVal))
+                        maxVal = normalized[b, f];
+                }
+
+                var expSum = numOps.Zero;
+                var expValues = new T[features];
+                for (int f = 0; f < features; f++)
+                {
+                    var shifted = numOps.Subtract(normalized[b, f], maxVal);
+                    expValues[f] = numOps.Exp(shifted);
+                    expSum = numOps.Add(expSum, expValues[f]);
+                }
+
+                for (int f = 0; f < features; f++)
+                {
+                    result[b, f] = numOps.Divide(expValues[f], expSum);
+                }
+            }
+
+            void BackwardFunction(Tensor<T> gradient)
+            {
+                if (a.RequiresGradient)
+                {
+                    var gradA = new Tensor<T>(shape);
+
+                    for (int b = 0; b < batchSize; b++)
+                    {
+                        var dotProduct = numOps.Zero;
+                        for (int f = 0; f < features; f++)
+                        {
+                            dotProduct = numOps.Add(dotProduct,
+                                numOps.Multiply(gradient[b, f], result[b, f]));
+                        }
+
+                        var norm = norms[b];
+                        T epsilon = numOps.FromDouble(1e-8);
+                        if (numOps.LessThan(norm, epsilon))
+                            norm = epsilon;
+
+                        // Softmax step: dL/dn_f = y_f * (g_f - sum_j g_j * y_j), where n = x / ||x||.
+                        var gradN = new T[features];
+                        for (int f = 0; f < features; f++)
+                        {
+                            gradN[f] = numOps.Multiply(result[b, f],
+                                numOps.Subtract(gradient[b, f], dotProduct));
+                        }
+
+                        // Normalization step: dL/dx_f = (dL/dn_f - (sum_j x_j * dL/dn_j) * x_f / ||x||^2) / ||x||.
+                        T innerProduct = numOps.Zero;
+                        for (int j = 0; j < features; j++)
+                        {
+                            innerProduct = numOps.Add(innerProduct,
+                                numOps.Multiply(a.Value[b, j], gradN[j]));
+                        }
+
+                        for (int f = 0; f < features; f++)
+                        {
+                            T normTerm = numOps.Divide(
+                                numOps.Multiply(innerProduct, a.Value[b, f]),
+                                numOps.Multiply(norm, norm));
+
+                            gradA[b, f] = numOps.Divide(
+                                numOps.Subtract(gradN[f], normTerm),
+                                norm
+                            );
+                        }
+                    }
+
+                    if (a.Gradient == null)
+                    {
+                        a.Gradient = gradA;
+                    }
+                    else
+                    {
+                        var existingGradient = a.Gradient;
+                        if (existingGradient != null)
+                        {
+                            a.Gradient = existingGradient.Add(gradA);
+                        }
+                    }
+                }
+            }
+
+            var node = new ComputationNode<T>(
+                value: result,
+                requiresGradient: a.RequiresGradient,
+                parents: new List<ComputationNode<T>> { a },
+                backwardFunction: BackwardFunction,
+                name: null);
+
+            var tape = GradientTape<T>.Current;
+            if (tape != null && tape.IsRecording)
+                tape.RecordOperation(node);
+
+            return node;
+        }
+        else
+        {
+            throw new NotImplementedException(
+                $"SphericalSoftmax is currently only implemented for 2D tensors along axis=-1. " +
+                $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
+        }
+    }
+
     /// <summary>
     /// Concatenates multiple computation nodes along a specified axis.
     /// </summary>
diff --git a/src/Engines/CpuEngine.cs b/src/Engines/CpuEngine.cs
index fc0745a0e..3af896966 100644
--- a/src/Engines/CpuEngine.cs
+++ b/src/Engines/CpuEngine.cs
@@ -1218,5 +1218,137 @@ public Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0)
         return new Tensor<T>(tensor.Shape, resultVector);
     }
 
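+    /// <summary>
+    /// Applies Sparsemax row-wise to a 2D tensor: each row is projected onto the probability
+    /// simplex, so values below the computed threshold become exactly zero and the remaining
+    /// values sum to 1.
+    /// </summary>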
+    public Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct
+    {
+        if (input == null)
+            throw new ArgumentNullException(nameof(input));
+
+        var shape = input.Shape;
+        if (shape.Length != 2)
+        {
+            throw new NotSupportedException(
+                $"Sparsemax is currently only implemented for 2D tensors. Got shape=[{string.Join(", ", shape)}]");
+        }
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+        int batchSize = shape[0];
+        int features = shape[1];
+        var result = new Tensor<T>(shape);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            var row = new T[features];
+            for (int f = 0; f < features; f++)
+                row[f] = input[b, f];
+
+            var sorted = row.OrderByDescending(x => x).ToArray();
+
+            // Threshold search (Martins & Astudillo, 2016): the support size is the largest k
+            // with 1 + k * z_(k) > cumsum_k over the descending sort; τ = (cumsum_k - 1) / k.
+            int k = 1;
+            T cumSum = numOps.Zero;
+            T supportSum = sorted[0];
+
+            for (int i = 0; i < features; i++)
+            {
+                cumSum = numOps.Add(cumSum, sorted[i]);
+                T kT = numOps.FromDouble(i + 1);
+                T lhs = numOps.Add(numOps.One, numOps.Multiply(kT, sorted[i]));
+
+                if (numOps.GreaterThan(lhs, cumSum))
+                {
+                    k = i + 1;
+                    supportSum = cumSum;
+                }
+            }
+
+            T threshold = numOps.Divide(
+                numOps.Subtract(supportSum, numOps.One),
+                numOps.FromDouble(k)
+            );
+
+            for (int f = 0; f < features; f++)
+            {
+                var shifted = numOps.Subtract(row[f], threshold);
+                if (numOps.GreaterThan(shifted, numOps.Zero))
+                {
+                    result[b, f] = shifted;
+                }
+                else
+                {
+                    result[b, f] = numOps.Zero;
+                }
+            }
+        }
+
+        return result;
+    }
+
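+    /// <summary>
+    /// Applies Spherical Softmax row-wise to a 2D tensor: each row is L2-normalized
+    /// (with an epsilon floor on the norm) and then passed through a max-shifted softmax.
+    /// </summary>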
+    public Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct
+    {
+        if (input == null)
+            throw new ArgumentNullException(nameof(input));
+
+        var shape = input.Shape;
+        if (shape.Length != 2)
+        {
+            throw new NotSupportedException(
+                $"SphericalSoftmax is currently only implemented for 2D tensors. Got shape=[{string.Join(", ", shape)}]");
+        }
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+        int batchSize = shape[0];
+        int features = shape[1];
+        var result = new Tensor<T>(shape);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            T sumSquares = numOps.Zero;
+            for (int f = 0; f < features; f++)
+            {
+                var val = input[b, f];
+                sumSquares = numOps.Add(sumSquares, numOps.Multiply(val, val));
+            }
+
+            T norm = numOps.Sqrt(sumSquares);
+            T epsilon = numOps.FromDouble(1e-8);
+            if (numOps.LessThan(norm, epsilon))
+                norm = epsilon;
+
+            var maxVal = numOps.Divide(input[b, 0], norm);
+            for (int f = 1; f < features; f++)
+            {
+                var normalized = numOps.Divide(input[b, f], norm);
+                if (numOps.GreaterThan(normalized, maxVal))
+                    maxVal = normalized;
+            }
+
+            var expSum = numOps.Zero;
+            var expValues = new T[features];
+            for (int f = 0; f < features; f++)
+            {
+                var normalized = numOps.Divide(input[b, f], norm);
+                var shifted = numOps.Subtract(normalized, maxVal);
+                expValues[f] = numOps.Exp(shifted);
+                expSum = numOps.Add(expSum, expValues[f]);
+            }
+
+            for (int f = 0; f < features; f++)
+            {
+                result[b, f] = numOps.Divide(expValues[f], expSum);
+            }
+        }
+
+        return result;
+    }
+
     #endregion
 }
diff --git a/src/Engines/GpuEngine.cs b/src/Engines/GpuEngine.cs
index 5def66643..60793550b 100644
--- a/src/Engines/GpuEngine.cs
+++ b/src/Engines/GpuEngine.cs
@@ -1499,6 +1499,18 @@ public Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0)
         return _cpuFallback.ELU(tensor, alpha);
     }
 
+    /// <inheritdoc/>
+    public Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct
+    {
+        return _cpuFallback.Sparsemax(input);
+    }
+
+    /// <inheritdoc/>
+    public Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct
+    {
+        return _cpuFallback.SphericalSoftmax(input);
+    }
+
     #endregion
 
     #region GPU Kernels (Float Implementation)
diff --git a/src/Engines/IEngine.cs b/src/Engines/IEngine.cs
index b67cc69b2..d6b6909ef 100644
--- a/src/Engines/IEngine.cs
+++ b/src/Engines/IEngine.cs
@@ -469,6 +469,16 @@ public interface IEngine
     /// </summary>
     Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0);
 
+    /// <summary>
+    /// Applies the Sparsemax activation function (sparse softmax via Euclidean projection onto the probability simplex).
+    /// </summary>
+    Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct;
+
+    /// <summary>
+    /// Applies the Spherical Softmax activation function (L2 normalization followed by softmax).
+    /// </summary>
+    Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct;
+
     #endregion
 
     #region Matrix Operations (Phase B: Epic 2)