
Commit 4edc798

franklinic and claude committed
fix: add gradient operations and fix Softplus numerical stability
- Add GradHardSigmoid with proper masking for -3 < x < 3
- Add GradHardTanh with proper masking for minVal < x < maxVal
- Add GradSoftPlus with numerically stable implementation
- Fix Softplus forward pass: use max(0,x) + log(1+exp(-|x|)) formula
- Add comprehensive TensorMatMul/TensorTranspose tests (20 tests)

Addresses PR review comments for #499, #500, #503, #504, #508, #509

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent e9f76b7 commit 4edc798

3 files changed: +512, -5 lines


src/Autodiff/TensorOperations.cs

Lines changed: 11 additions & 5 deletions
@@ -1665,13 +1665,19 @@ public static ComputationNode<T> SoftPlus(ComputationNode<T> a)
         var engine = AiDotNetEngine.Current;
         var numOps = MathHelper.GetNumericOperations<T>();
 
-        // Forward pass: ln(1 + e^x)
-        // Using numerically stable version: max(0, x) + ln(1 + exp(-|x|))
+        // Forward pass: numerically stable softplus
+        // softplus(x) = max(0, x) + ln(1 + exp(-|x|))
+        // For large positive x, this avoids exp(x) overflow
+        // For large negative x, exp(-|x|) approaches 0, so result ≈ 0
         var result = a.Value.Transform((x, idx) =>
         {
-            var expX = numOps.Exp(x);
-            var onePlusExpX = numOps.Add(numOps.One, expX);
-            return numOps.Log(onePlusExpX);
+            // Compute |x|: if x >= 0, absX = x, else absX = -x
+            var absX = numOps.GreaterThanOrEquals(x, numOps.Zero) ? x : numOps.Negate(x);
+            var negAbsX = numOps.Negate(absX);
+            var expNegAbsX = numOps.Exp(negAbsX);
+            var log1pExpNegAbsX = numOps.Log(numOps.Add(numOps.One, expNegAbsX));
+            var maxZeroX = numOps.GreaterThan(x, numOps.Zero) ? x : numOps.Zero;
+            return numOps.Add(maxZeroX, log1pExpNegAbsX);
         });
 
         void BackwardFunction(Tensor<T> gradient)
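
For reference, a minimal standalone sketch of why the rewritten formula matters, using plain `double` and `System.Math` rather than the repository's `INumericOperations<T>` abstraction (the class and method names below are illustrative only): the naive `ln(1 + e^x)` overflows as soon as `exp(x)` exceeds the double range, while `max(0, x) + ln(1 + exp(-|x|))` stays finite.

```csharp
using System;

class SoftPlusStabilityDemo
{
    // Naive formulation: ln(1 + e^x). Overflows to +Infinity for large positive x.
    static double NaiveSoftPlus(double x) => Math.Log(1.0 + Math.Exp(x));

    // Stable formulation used in the patch: max(0, x) + ln(1 + exp(-|x|)).
    static double StableSoftPlus(double x) =>
        Math.Max(0.0, x) + Math.Log(1.0 + Math.Exp(-Math.Abs(x)));

    static void Main()
    {
        foreach (var x in new[] { -1000.0, -1.0, 0.0, 1.0, 1000.0 })
        {
            Console.WriteLine($"x={x,8}: naive={NaiveSoftPlus(x),12}  stable={StableSoftPlus(x),12}");
        }
        // x =  1000: naive prints Infinity (exp(1000) overflows), stable prints 1000.
        // x = -1000: both print 0, since softplus(-1000) underflows to 0 in double precision.
    }
}
```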

src/JitCompiler/CodeGen/GradientOps.cs

Lines changed: 96 additions & 0 deletions
@@ -200,6 +200,102 @@ public static Tensor<T> GradSoftmax<T>(Tensor<T> gradOutput, Tensor<T> forwardOu
         return Tensor<T>.ElementwiseMultiply(forwardOutput, diff);
     }
 
+    /// <summary>
+    /// Gradient of HardSigmoid operation.
+    /// Forward: y = clip((x + 3) / 6, 0, 1)
+    /// Backward: grad_x = grad_y * (1/6 if -3 &lt; x &lt; 3, else 0)
+    /// </summary>
+    public static Tensor<T> GradHardSigmoid<T>(Tensor<T> gradOutput, Tensor<T> forwardInput)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var inputData = forwardInput.ToArray();
+        var gradData = gradOutput.ToArray();
+        var resultData = new T[inputData.Length];
+
+        var negThree = numOps.FromDouble(-3.0);
+        var three = numOps.FromDouble(3.0);
+        var oneSixth = numOps.FromDouble(1.0 / 6.0);
+
+        for (int i = 0; i < inputData.Length; i++)
+        {
+            // Gradient is 1/6 only when -3 < x < 3, else 0
+            var x = inputData[i];
+            var inLinearRegion = numOps.GreaterThan(x, negThree) && numOps.LessThan(x, three);
+            var derivative = inLinearRegion ? oneSixth : numOps.Zero;
+            resultData[i] = numOps.Multiply(gradData[i], derivative);
+        }
+
+        return new Tensor<T>(gradOutput.Shape, new Vector<T>(resultData));
+    }
+
+    /// <summary>
+    /// Gradient of HardTanh operation.
+    /// Forward: y = clip(x, minVal, maxVal)
+    /// Backward: grad_x = grad_y * (1 if minVal &lt; x &lt; maxVal, else 0)
+    /// </summary>
+    public static Tensor<T> GradHardTanh<T>(Tensor<T> gradOutput, Tensor<T> forwardInput, double minVal = -1.0, double maxVal = 1.0)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var inputData = forwardInput.ToArray();
+        var gradData = gradOutput.ToArray();
+        var resultData = new T[inputData.Length];
+
+        var minT = numOps.FromDouble(minVal);
+        var maxT = numOps.FromDouble(maxVal);
+
+        for (int i = 0; i < inputData.Length; i++)
+        {
+            // Gradient is 1 only when minVal < x < maxVal, else 0
+            var x = inputData[i];
+            var inLinearRegion = numOps.GreaterThan(x, minT) && numOps.LessThan(x, maxT);
+            var derivative = inLinearRegion ? numOps.One : numOps.Zero;
+            resultData[i] = numOps.Multiply(gradData[i], derivative);
+        }
+
+        return new Tensor<T>(gradOutput.Shape, new Vector<T>(resultData));
+    }
+
+    /// <summary>
+    /// Gradient of SoftPlus operation.
+    /// Forward: y = log(1 + exp(x)) (numerically stable)
+    /// Backward: grad_x = grad_y * sigmoid(x)
+    /// </summary>
+    public static Tensor<T> GradSoftPlus<T>(Tensor<T> gradOutput, Tensor<T> forwardInput, double beta = 1.0, double threshold = 20.0)
+    {
+        var numOps = MathHelper.GetNumericOperations<T>();
+        var inputData = forwardInput.ToArray();
+        var gradData = gradOutput.ToArray();
+        var resultData = new T[inputData.Length];
+
+        var betaT = numOps.FromDouble(beta);
+        var thresholdT = numOps.FromDouble(threshold);
+
+        for (int i = 0; i < inputData.Length; i++)
+        {
+            var x = inputData[i];
+            var betaX = numOps.Multiply(betaT, x);
+
+            T derivative;
+            // For numerical stability: when beta*x > threshold, sigmoid(beta*x) ≈ 1
+            if (numOps.GreaterThan(betaX, thresholdT))
+            {
+                derivative = numOps.One;
+            }
+            else
+            {
+                // sigmoid(beta * x) = 1 / (1 + exp(-beta * x))
+                var negBetaX = numOps.Negate(betaX);
+                var expVal = numOps.Exp(negBetaX);
+                var onePlusExp = numOps.Add(numOps.One, expVal);
+                derivative = numOps.Divide(numOps.One, onePlusExp);
+            }
+
+            resultData[i] = numOps.Multiply(gradData[i], derivative);
+        }
+
+        return new Tensor<T>(gradOutput.Shape, new Vector<T>(resultData));
+    }
+
     /// <summary>
     /// Helper: Creates a mask tensor where elements > 0 are 1, else 0.
     /// </summary>
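
As a sanity check on the three new backward formulas, here is a standalone finite-difference sketch. It uses plain doubles and scalar re-implementations of the forward functions rather than the repository's `Tensor<T>` or `GradientOps` APIs, and all class and helper names in it are hypothetical; it compares each analytic derivative (HardSigmoid: 1/6 inside (-3, 3); HardTanh: 1 inside (minVal, maxVal); SoftPlus: sigmoid(beta*x)) against a central difference at points away from the clipping boundaries.

```csharp
using System;

class GradientCheckSketch
{
    // Scalar forward functions, mirroring the formulas stated in the XML docs above.
    static double HardSigmoid(double x) => Math.Clamp((x + 3.0) / 6.0, 0.0, 1.0);
    static double HardTanh(double x, double min = -1.0, double max = 1.0) => Math.Clamp(x, min, max);
    static double SoftPlus(double x, double beta = 1.0) =>
        (Math.Max(0.0, beta * x) + Math.Log(1.0 + Math.Exp(-Math.Abs(beta * x)))) / beta;

    // Analytic derivatives, matching GradHardSigmoid / GradHardTanh / GradSoftPlus with gradOutput = 1.
    static double DHardSigmoid(double x) => (x > -3.0 && x < 3.0) ? 1.0 / 6.0 : 0.0;
    static double DHardTanh(double x) => (x > -1.0 && x < 1.0) ? 1.0 : 0.0;
    static double DSoftPlus(double x, double beta = 1.0) => 1.0 / (1.0 + Math.Exp(-beta * x));

    static void Check(string name, Func<double, double> f, Func<double, double> df)
    {
        const double h = 1e-6;
        // Test points chosen to avoid the non-differentiable clipping boundaries (±1, ±3).
        foreach (var x in new[] { -5.0, -0.5, 0.4, 2.9, 5.0 })
        {
            var numeric = (f(x + h) - f(x - h)) / (2.0 * h);   // central difference
            Console.WriteLine($"{name,-12} x={x,5}: analytic={df(x):F6} numeric={numeric:F6}");
        }
    }

    static void Main()
    {
        Check("HardSigmoid", HardSigmoid, DHardSigmoid);
        Check("HardTanh",    x => HardTanh(x), DHardTanh);
        Check("SoftPlus",    x => SoftPlus(x), x => DSoftPlus(x));
    }
}
```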
