diff --git a/libm/src/math/generic/fmod.rs b/libm/src/math/generic/fmod.rs
index 29acc8a4d..3c3fd44b2 100644
--- a/libm/src/math/generic/fmod.rs
+++ b/libm/src/math/generic/fmod.rs
@@ -1,8 +1,12 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
-use crate::support::{CastFrom, Float, Int, MinInt};
+use crate::support::{CastFrom, CastInto, Float, HInt, Int, MinInt, NarrowingDiv};
 
 #[inline]
-pub fn fmod<F: Float>(x: F, y: F) -> F {
+pub fn fmod<F: Float>(x: F, y: F) -> F
+where
+    F::Int: HInt,
+    <F::Int as HInt>::D: NarrowingDiv,
+{
     let _1 = F::Int::ONE;
     let sx = x.to_bits() & F::SIGN_MASK;
     let ux = x.to_bits() & !F::SIGN_MASK;
@@ -29,7 +33,7 @@ pub fn fmod<F: Float>(x: F, y: F) -> F {
 
     // To compute `(num << ex) % (div << ey)`, first
    // evaluate `rem = (num << (ex - ey)) % div` ...
-    let rem = reduction(num, ex - ey, div);
+    let rem = reduction::<F>(num, ex - ey, div);
     // ... so the result will be `rem << ey`
 
     if rem.is_zero() {
@@ -58,11 +62,55 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
 }
 
 /// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
-fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
-    x %= y;
-    for _ in 0..e {
-        x <<= 1;
-        x = x.checked_sub(y).unwrap_or(x);
+fn reduction<F>(mut x: F::Int, e: u32, y: F::Int) -> F::Int
+where
+    F: Float,
+    F::Int: HInt,
+    <<F as Float>::Int as HInt>::D: NarrowingDiv,
+{
+    // `f16` only has 5 exponent bits, so even `f16::MAX = 65504.0` is only
+    // a 40-bit integer multiple of the smallest subnormal.
+    if F::BITS == 16 {
+        debug_assert!(F::EXP_MAX - F::EXP_MIN == 29);
+        debug_assert!(e <= 29);
+        let u: u16 = x.cast();
+        let v: u16 = y.cast();
+        let u = (u as u64) << e;
+        let v = v as u64;
+        return F::Int::cast_from((u % v) as u16);
     }
-    x
+
+    // Ensure `x < 2y` for later steps
+    if x >= (y << 1) {
+        // This case is only reached with subnormal divisors,
+        // but it might be better to just normalize all significands
+        // to make this unnecessary. The further calls could potentially
+        // benefit from assuming a specific fixed leading bit position.
+        x %= y;
+    }
+
+    // The simple implementation seems to be fastest for a short reduction
+    // at this size. The limit here was chosen empirically on an Intel Nehalem.
+    // Newer CPUs with a faster `u64 * u64 -> u128` multiply might not benefit,
+    // and 32-bit systems or architectures without hardware multipliers might
+    // want to do this in more cases.
+    if F::BITS == 64 && e < 32 {
+        // Assumes `x < 2y`
+        for _ in 0..e {
+            x = x.checked_sub(y).unwrap_or(x);
+            x <<= 1;
+        }
+        return x.checked_sub(y).unwrap_or(x);
+    }
+
+    // Fast path for short reductions
+    if e < F::BITS {
+        let w = x.widen() << e;
+        if let Some((_, r)) = w.checked_narrowing_div_rem(y) {
+            return r;
+        }
+    }
+
+    // Assumes `x < 2y`
+    crate::support::linear_mul_reduction(x, e, y)
 }
diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs
index f35b9de3e..15ab010dc 100644
--- a/libm/src/math/support/mod.rs
+++ b/libm/src/math/support/mod.rs
@@ -29,9 +29,7 @@ pub use hex_float::hf16;
 pub use hex_float::hf128;
 #[allow(unused_imports)]
 pub use hex_float::{hf32, hf64};
-#[allow(unused_imports)]
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
-#[allow(unused_imports)]
 pub use modular::linear_mul_reduction;
 
 /// Hint to the compiler that the current path is cold.
diff --git a/libm/src/math/support/modular.rs b/libm/src/math/support/modular.rs
index dbf1f0513..cc0edf2f2 100644
--- a/libm/src/math/support/modular.rs
+++ b/libm/src/math/support/modular.rs
@@ -1,5 +1,9 @@
 /* SPDX-License-Identifier: MIT OR Apache-2.0 */
 
+//! This module provides accelerated modular multiplication by large powers
+//! of two, which is needed for computing floating point remainders in `fmod`
+//! and similar functions.
+//!
 //! To keep the equations somewhat concise, the following conventions are used:
 //! - all integer operations are in the mathematical sense, without overflow
 //! - concatenation means multiplication: `2xq = 2 * x * q`
@@ -10,7 +14,6 @@ use crate::support::{DInt, HInt, Int};
 
 /// Compute the remainder `(x << e) % y` with unbounded integers.
 /// Requires `x < 2y` and `y.leading_zeros() >= 2`
-#[allow(dead_code)]
 pub fn linear_mul_reduction<U>(x: U, mut e: u32, mut y: U) -> U
 where
     U: HInt + Int,
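As a minimal standalone sketch (not part of the patch), the shift-and-subtract loop used by the `F::BITS == 64 && e < 32` branch above can be written over plain `u64` instead of the generic `F::Int`. The name `shift_sub_reduction` and the `main` driver are illustrative only, and the overflow caveat assumes `y` stays well below the top bit, as the significands in `fmod` do.

fn shift_sub_reduction(mut x: u64, e: u32, y: u64) -> u64 {
    // Requires x < 2*y; each iteration preserves x mod y and the bound x < 2*y.
    debug_assert!(x < 2 * y);
    for _ in 0..e {
        // Reduce below y with a conditional subtraction, then double.
        x = x.checked_sub(y).unwrap_or(x);
        x <<= 1;
    }
    // One last conditional subtraction brings the value below y.
    x.checked_sub(y).unwrap_or(x)
}

fn main() {
    // (3 << 5) % 7 == 96 % 7 == 5
    assert_eq!(shift_sub_reduction(3, 5, 7), (3u64 << 5) % 7);
}

Once `e` grows past the loop's cutoff, the patch instead uses a single widening shift plus `checked_narrowing_div_rem`, and finally `crate::support::linear_mul_reduction`, which handle long reductions without iterating one bit at a time.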