diff --git a/src/Autodiff/TensorOperations.cs b/src/Autodiff/TensorOperations.cs
index ccc99f43d..60031608c 100644
--- a/src/Autodiff/TensorOperations.cs
+++ b/src/Autodiff/TensorOperations.cs
@@ -1346,6 +1346,336 @@ void BackwardFunction(Tensor<T> gradient)
                 $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
         }
     }
+
+    /// <summary>
+    /// Applies the Sparsemax activation function (sparse softmax via Euclidean projection onto the probability simplex).
+    /// </summary>
+    /// <param name="a">The input node.</param>
+    /// <param name="axis">The axis along which to compute sparsemax. Default is -1 (last axis).</param>
+    /// <returns>A new computation node containing the sparsemax result.</returns>
+    /// <remarks>
+    /// <para>
+    /// Sparsemax is a sparse alternative to softmax that can produce exactly zero probabilities.
+    /// Algorithm: sparsemax(z) = max(z - τ, 0), where τ is the threshold obtained from the Euclidean projection of z onto the probability simplex.
+    /// The backward pass uses ∂sparsemax/∂z = diag(s) - (1/|S|) * (s * s^T), where S is the support set (indices of non-zero outputs) and s is its indicator vector.
+    /// </para>
+    /// <para><b>For Beginners:</b> Sparsemax converts scores to probabilities like softmax, but it can assign exactly zero probability.
+    ///
+    /// For sparsemax:
+    /// - The forward pass projects the input onto the simplex, potentially setting some values to exactly zero
+    /// - The non-zero values sum to 1 (a probability distribution)
+    /// - The backward pass only propagates gradients through non-zero outputs
+    ///
+    /// Sparsemax is useful for:
+    /// - Classification with many classes where only a few are relevant
+    /// - Attention mechanisms with sparse focus
+    /// - Interpretable probability distributions
+    /// </para>
+    /// </remarks>
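+    /// <example>
+    /// A minimal usage sketch (illustrative only; <c>logits</c> is an assumed ComputationNode&lt;T&gt; wrapping a [batch, classes] tensor, not part of this patch):
+    /// <code>
+    /// var probs  = Sparsemax(logits);           // same shape; each row sums to 1, many entries are exactly 0
+    /// var sparse = Sparsemax(logits, axis: -1); // explicit axis, equivalent to the default
+    /// </code>
+    /// </example>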
+    public static ComputationNode<T> Sparsemax<T>(ComputationNode<T> a, int axis = -1)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var shape = a.Value.Shape;
+
+        if (axis < 0)
+            axis = shape.Length + axis;
+
+        if (shape.Length == 2 && axis == 1)
+        {
+            int batchSize = shape[0];
+            int features = shape[1];
+            var result = new Tensor<T>(shape);
+            var supportSets = new List<bool[]>();
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                var row = new T[features];
+                for (int f = 0; f < features; f++)
+                    row[f] = a.Value[b, f];
+
+                var sorted = row.OrderByDescending(x => x).ToArray();
+
+                // Threshold search (Martins & Astudillo, 2016): the support size is the largest k
+                // with 1 + k * z_(k) > cumsum_k over the descending sort; τ = (cumsum_k - 1) / k.
+                int k = 1;
+                T cumSum = numOps.Zero;
+                T supportSum = sorted[0];
+
+                for (int i = 0; i < features; i++)
+                {
+                    cumSum = numOps.Add(cumSum, sorted[i]);
+                    T kT = numOps.FromDouble(i + 1);
+                    T lhs = numOps.Add(numOps.One, numOps.Multiply(kT, sorted[i]));
+
+                    if (numOps.GreaterThan(lhs, cumSum))
+                    {
+                        k = i + 1;
+                        supportSum = cumSum;
+                    }
+                }
+
+                T threshold = numOps.Divide(
+                    numOps.Subtract(supportSum, numOps.One),
+                    numOps.FromDouble(k)
+                );
+
+                var support = new bool[features];
+                for (int f = 0; f < features; f++)
+                {
+                    var shifted = numOps.Subtract(row[f], threshold);
+                    if (numOps.GreaterThan(shifted, numOps.Zero))
+                    {
+                        result[b, f] = shifted;
+                        support[f] = true;
+                    }
+                    else
+                    {
+                        result[b, f] = numOps.Zero;
+                        support[f] = false;
+                    }
+                }
+                supportSets.Add(support);
+            }
+
+            void BackwardFunction(Tensor<T> gradient)
+            {
+                if (a.RequiresGradient)
+                {
+                    var gradA = new Tensor<T>(shape);
+                    for (int b = 0; b < batchSize; b++)
+                    {
+                        var support = supportSets[b];
+                        int supportSize = support.Count(s => s);
+
+                        if (supportSize == 0)
+                            continue;
+
+                        T supportSizeT = numOps.FromDouble(supportSize);
+                        T sumGrad = numOps.Zero;
+
+                        for (int f = 0; f < features; f++)
+                        {
+                            if (support[f])
+                                sumGrad = numOps.Add(sumGrad, gradient[b, f]);
+                        }
+
+                        T avgGrad = numOps.Divide(sumGrad, supportSizeT);
+
+                        // Jacobian-vector product: supported entries get g - mean(g over support), others get 0.
+                        for (int f = 0; f < features; f++)
+                        {
+                            if (support[f])
+                                gradA[b, f] = numOps.Subtract(gradient[b, f], avgGrad);
+                            else
+                                gradA[b, f] = numOps.Zero;
+                        }
+                    }
+
+                    if (a.Gradient == null)
+                    {
+                        a.Gradient = gradA;
+                    }
+                    else
+                    {
+                        var existingGradient = a.Gradient;
+                        if (existingGradient != null)
+                        {
+                            a.Gradient = existingGradient.Add(gradA);
+                        }
+                    }
+                }
+            }
+
+            var node = new ComputationNode<T>(
+                value: result,
+                requiresGradient: a.RequiresGradient,
+                parents: new List<ComputationNode<T>> { a },
+                backwardFunction: BackwardFunction,
+                name: null);
+
+            var tape = GradientTape<T>.Current;
+            if (tape != null && tape.IsRecording)
+                tape.RecordOperation(node);
+
+            return node;
+        }
+        else
+        {
+            throw new NotImplementedException(
+                $"Sparsemax is currently only implemented for 2D tensors along axis=-1. " +
+                $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
+        }
+    }
+
+    /// <summary>
+    /// Applies the Spherical Softmax activation function (L2 normalization followed by softmax).
+    /// </summary>
+    /// <param name="a">The input node.</param>
+    /// <param name="axis">The axis along which to compute spherical softmax. Default is -1 (last axis).</param>
+    /// <returns>A new computation node containing the spherical softmax result.</returns>
+    /// <remarks>
+    /// <para>
+    /// Spherical Softmax normalizes the input to the unit sphere before applying softmax.
+    /// Formula: spherical_softmax(x) = softmax(x / ||x||_2).
+    /// The backward pass chains the gradients of the softmax and L2 normalization steps.
+    /// </para>
+    /// <para><b>For Beginners:</b> Spherical Softmax is softmax with an L2 normalization step for numerical stability.
+    ///
+    /// For spherical softmax:
+    /// - The input is first normalized to unit length (projected onto the unit sphere)
+    /// - Standard softmax is then applied to the normalized values
+    /// - This improves stability when input magnitudes are large
+    ///
+    /// Spherical Softmax is useful for:
+    /// - Classification with varying input magnitudes
+    /// - Attention mechanisms that should be scale invariant
+    /// - Numerically stable probability distributions
+    /// </para>
+    /// </remarks>
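+    /// <example>
+    /// A minimal usage sketch (illustrative only; <c>embeddings</c> is an assumed ComputationNode&lt;T&gt; wrapping a [batch, features] tensor, not part of this patch):
+    /// <code>
+    /// var probs = SphericalSoftmax(embeddings); // rows sum to 1, insensitive to the overall scale of each row
+    /// </code>
+    /// </example>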
+    public static ComputationNode<T> SphericalSoftmax<T>(ComputationNode<T> a, int axis = -1)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var shape = a.Value.Shape;
+
+        if (axis < 0)
+            axis = shape.Length + axis;
+
+        if (shape.Length == 2 && axis == 1)
+        {
+            int batchSize = shape[0];
+            int features = shape[1];
+            var result = new Tensor<T>(shape);
+            var norms = new T[batchSize];
+            var normalized = new Tensor<T>(shape);
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                T sumSquares = numOps.Zero;
+                for (int f = 0; f < features; f++)
+                {
+                    var val = a.Value[b, f];
+                    sumSquares = numOps.Add(sumSquares, numOps.Multiply(val, val));
+                }
+
+                T norm = numOps.Sqrt(sumSquares);
+                norms[b] = norm;
+
+                T epsilon = numOps.FromDouble(1e-8);
+                if (numOps.LessThan(norm, epsilon))
+                    norm = epsilon;
+
+                for (int f = 0; f < features; f++)
+                {
+                    normalized[b, f] = numOps.Divide(a.Value[b, f], norm);
+                }
+
+                // Max-shifted softmax over the normalized row for numerical stability.
+                var maxVal = normalized[b, 0];
+                for (int f = 1; f < features; f++)
+                {
+                    if (numOps.GreaterThan(normalized[b, f], maxVal))
+                        maxVal = normalized[b, f];
+                }
+
+                var expSum = numOps.Zero;
+                var expValues = new T[features];
+                for (int f = 0; f < features; f++)
+                {
+                    var shifted = numOps.Subtract(normalized[b, f], maxVal);
+                    expValues[f] = numOps.Exp(shifted);
+                    expSum = numOps.Add(expSum, expValues[f]);
+                }
+
+                for (int f = 0; f < features; f++)
+                {
+                    result[b, f] = numOps.Divide(expValues[f], expSum);
+                }
+            }
+
+            void BackwardFunction(Tensor<T> gradient)
+            {
+                if (a.RequiresGradient)
+                {
+                    var gradA = new Tensor<T>(shape);
+
+                    for (int b = 0; b < batchSize; b++)
+                    {
+                        var dotProduct = numOps.Zero;
+                        for (int f = 0; f < features; f++)
+                        {
+                            dotProduct = numOps.Add(dotProduct,
+                                numOps.Multiply(gradient[b, f], result[b, f]));
+                        }
+
+                        var norm = norms[b];
+                        T epsilon = numOps.FromDouble(1e-8);
+                        if (numOps.LessThan(norm, epsilon))
+                            norm = epsilon;
+
+                        // Softmax step: dL/dn_f = y_f * (g_f - sum_j g_j * y_j), where n = x / ||x||.
+                        var gradN = new T[features];
+                        for (int f = 0; f < features; f++)
+                        {
+                            gradN[f] = numOps.Multiply(result[b, f],
+                                numOps.Subtract(gradient[b, f], dotProduct));
+                        }
+
+                        // Normalization step: dL/dx_f = (dL/dn_f - (sum_j x_j * dL/dn_j) * x_f / ||x||^2) / ||x||.
+                        T innerProduct = numOps.Zero;
+                        for (int j = 0; j < features; j++)
+                        {
+                            innerProduct = numOps.Add(innerProduct,
+                                numOps.Multiply(a.Value[b, j], gradN[j]));
+                        }
+
+                        for (int f = 0; f < features; f++)
+                        {
+                            T normTerm = numOps.Divide(
+                                numOps.Multiply(innerProduct, a.Value[b, f]),
+                                numOps.Multiply(norm, norm));
+
+                            gradA[b, f] = numOps.Divide(
+                                numOps.Subtract(gradN[f], normTerm),
+                                norm
+                            );
+                        }
+                    }
+
+                    if (a.Gradient == null)
+                    {
+                        a.Gradient = gradA;
+                    }
+                    else
+                    {
+                        var existingGradient = a.Gradient;
+                        if (existingGradient != null)
+                        {
+                            a.Gradient = existingGradient.Add(gradA);
+                        }
+                    }
+                }
+            }
+
+            var node = new ComputationNode<T>(
+                value: result,
+                requiresGradient: a.RequiresGradient,
+                parents: new List<ComputationNode<T>> { a },
+                backwardFunction: BackwardFunction,
+                name: null);
+
+            var tape = GradientTape<T>.Current;
+            if (tape != null && tape.IsRecording)
+                tape.RecordOperation(node);
+
+            return node;
+        }
+        else
+        {
+            throw new NotImplementedException(
+                $"SphericalSoftmax is currently only implemented for 2D tensors along axis=-1. " +
+                $"Got shape=[{string.Join(", ", shape)}], axis={axis}");
+        }
+    }
+
     /// <summary>
     /// Concatenates multiple computation nodes along a specified axis.
     /// </summary>
diff --git a/src/Engines/CpuEngine.cs b/src/Engines/CpuEngine.cs
index fc0745a0e..3af896966 100644
--- a/src/Engines/CpuEngine.cs
+++ b/src/Engines/CpuEngine.cs
@@ -1218,5 +1218,137 @@ public Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0)
         return new Tensor<T>(tensor.Shape, resultVector);
     }
 
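+    /// <summary>
+    /// Applies Sparsemax row-wise to a 2D tensor: each row is projected onto the probability
+    /// simplex, so values below the computed threshold become exactly zero and the remaining
+    /// values sum to 1.
+    /// </summary>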
+    public Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct
+    {
+        if (input == null)
+            throw new ArgumentNullException(nameof(input));
+
+        var shape = input.Shape;
+        if (shape.Length != 2)
+        {
+            throw new NotSupportedException(
+                $"Sparsemax is currently only implemented for 2D tensors. Got shape=[{string.Join(", ", shape)}]");
+        }
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+        int batchSize = shape[0];
+        int features = shape[1];
+        var result = new Tensor<T>(shape);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            var row = new T[features];
+            for (int f = 0; f < features; f++)
+                row[f] = input[b, f];
+
+            var sorted = row.OrderByDescending(x => x).ToArray();
+
+            // Threshold search (Martins & Astudillo, 2016): the support size is the largest k
+            // with 1 + k * z_(k) > cumsum_k over the descending sort; τ = (cumsum_k - 1) / k.
+            int k = 1;
+            T cumSum = numOps.Zero;
+            T supportSum = sorted[0];
+
+            for (int i = 0; i < features; i++)
+            {
+                cumSum = numOps.Add(cumSum, sorted[i]);
+                T kT = numOps.FromDouble(i + 1);
+                T lhs = numOps.Add(numOps.One, numOps.Multiply(kT, sorted[i]));
+
+                if (numOps.GreaterThan(lhs, cumSum))
+                {
+                    k = i + 1;
+                    supportSum = cumSum;
+                }
+            }
+
+            T threshold = numOps.Divide(
+                numOps.Subtract(supportSum, numOps.One),
+                numOps.FromDouble(k)
+            );
+
+            for (int f = 0; f < features; f++)
+            {
+                var shifted = numOps.Subtract(row[f], threshold);
+                if (numOps.GreaterThan(shifted, numOps.Zero))
+                {
+                    result[b, f] = shifted;
+                }
+                else
+                {
+                    result[b, f] = numOps.Zero;
+                }
+            }
+        }
+
+        return result;
+    }
+
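+    /// <summary>
+    /// Applies Spherical Softmax row-wise to a 2D tensor: each row is L2-normalized
+    /// (with an epsilon floor on the norm) and then passed through a max-shifted softmax.
+    /// </summary>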
+    public Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct
+    {
+        if (input == null)
+            throw new ArgumentNullException(nameof(input));
+
+        var shape = input.Shape;
+        if (shape.Length != 2)
+        {
+            throw new NotSupportedException(
+                $"SphericalSoftmax is currently only implemented for 2D tensors. Got shape=[{string.Join(", ", shape)}]");
+        }
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+        int batchSize = shape[0];
+        int features = shape[1];
+        var result = new Tensor<T>(shape);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            T sumSquares = numOps.Zero;
+            for (int f = 0; f < features; f++)
+            {
+                var val = input[b, f];
+                sumSquares = numOps.Add(sumSquares, numOps.Multiply(val, val));
+            }
+
+            T norm = numOps.Sqrt(sumSquares);
+            T epsilon = numOps.FromDouble(1e-8);
+            if (numOps.LessThan(norm, epsilon))
+                norm = epsilon;
+
+            var maxVal = numOps.Divide(input[b, 0], norm);
+            for (int f = 1; f < features; f++)
+            {
+                var normalized = numOps.Divide(input[b, f], norm);
+                if (numOps.GreaterThan(normalized, maxVal))
+                    maxVal = normalized;
+            }
+
+            var expSum = numOps.Zero;
+            var expValues = new T[features];
+            for (int f = 0; f < features; f++)
+            {
+                var normalized = numOps.Divide(input[b, f], norm);
+                var shifted = numOps.Subtract(normalized, maxVal);
+                expValues[f] = numOps.Exp(shifted);
+                expSum = numOps.Add(expSum, expValues[f]);
+            }
+
+            for (int f = 0; f < features; f++)
+            {
+                result[b, f] = numOps.Divide(expValues[f], expSum);
+            }
+        }
+
+        return result;
+    }
+
     #endregion
 }
diff --git a/src/Engines/GpuEngine.cs b/src/Engines/GpuEngine.cs
index 5def66643..60793550b 100644
--- a/src/Engines/GpuEngine.cs
+++ b/src/Engines/GpuEngine.cs
@@ -1499,6 +1499,18 @@ public Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0)
         return _cpuFallback.ELU(tensor, alpha);
     }
 
+    /// <inheritdoc/>
+    public Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct
+    {
+        return _cpuFallback.Sparsemax(input);
+    }
+
+    /// <inheritdoc/>
+    public Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct
+    {
+        return _cpuFallback.SphericalSoftmax(input);
+    }
+
     #endregion
 
     #region GPU Kernels (Float Implementation)
diff --git a/src/Engines/IEngine.cs b/src/Engines/IEngine.cs
index b67cc69b2..d6b6909ef 100644
--- a/src/Engines/IEngine.cs
+++ b/src/Engines/IEngine.cs
@@ -469,6 +469,16 @@ public interface IEngine
     /// </summary>
     Tensor<T> ELU<T>(Tensor<T> tensor, double alpha = 1.0);
 
+    /// <summary>
+    /// Applies the Sparsemax activation function (sparse softmax via Euclidean projection onto the probability simplex).
+    /// </summary>
+    Tensor<T> Sparsemax<T>(Tensor<T> input) where T : struct;
+
+    /// <summary>
+    /// Applies the Spherical Softmax activation function (L2 normalization followed by softmax).
+    /// </summary>
+    Tensor<T> SphericalSoftmax<T>(Tensor<T> input) where T : struct;
+
     #endregion
 
     #region Matrix Operations (Phase B: Epic 2)