Fixes softmax LSE calculation by removing scale factor

LoserCheems · LoserCheems · commit af12e8018565 · 2025-08-12T22:26:54.000+08:00
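Removes the multiplication by softmax_scale from the log-sum-exp (LSE)
calculation when the row sum is valid (non-zero and not NaN), so the LSE is
now row_max + log(sum) instead of row_max * softmax_scale + log(sum). The
invalid-sum branch (-INFINITY when Split, otherwise INFINITY) is unchanged.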

This corrects the mathematical formula to properly compute the LSE value
without the unnecessary scaling factor that was affecting numerical accuracy.
diff --git a/csrc/src/softmax.h b/csrc/src/softmax.h
@@ -209,7 +209,9 @@ struct Softmax {
         for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
             float sum = row_sum(mi);
             float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
-            lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+            lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -INFINITY : INFINITY)
+                    : (row_max(mi) + __logf(sum));
             float scale = inv_sum;
             #pragma unroll
             for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { acc_o_rowcol(mi, ni) *= scale; }