From 401101f24df8677d9ef124e69f957521fc33a25e Mon Sep 17 00:00:00 2001
From: Moulins <arthur.heuillard@orange.fr>
Date: Thu, 27 Nov 2025 14:43:08 +0100
Subject: [PATCH 1/4] tests: Move `known_failure` logic inside of `TestRunner`

This'll make it easier to tweak it in the future.
---
 tests/framework/src/runner.rs           | 174 ++++++++++++++----------
 tests/tests/external_interface/tests.rs |  22 ++-
 tests/tests/regression_tests.rs         |  33 +----
 tests/tests/shared_object/mod.rs        |  18 +--
 4 files changed, 121 insertions(+), 126 deletions(-)
diff --git a/tests/framework/src/runner.rs b/tests/framework/src/runner.rs
index 85c0c463f400..97edfc3f8c48 100644
--- a/tests/framework/src/runner.rs
+++ b/tests/framework/src/runner.rs
@@ -154,8 +154,64 @@ impl TestRunner {
         self.remaining_iterations == 1
     }
 
-    /// Tick this test forward, running any actionscript and progressing the timeline by one.
-    pub fn tick(&mut self) {
+    pub fn is_preloaded(&self) -> bool {
+        self.preloaded
+    }
+
+    /// Ticks this test forward: runs actionscript, progresses the timeline by one,
+    /// executes custom FsCommands and performs scheduled tests.
+    pub fn tick(&mut self) -> Result<TestStatus> {
+        use std::panic::{AssertUnwindSafe, catch_unwind, resume_unwind};
+
+        let unwind_result = catch_unwind(AssertUnwindSafe(|| self.tick_inner()));
+    
+        match (unwind_result, self.options.known_failure) {
+            (Ok(ret), _) => ret,
+            // known_failure tests may pass by panicking.
+            (Err(_), true) =>  Ok(TestStatus::Finished),
+            (Err(panic), false) => resume_unwind(panic),
+        }
+    }
+    
+    fn tick_inner(&mut self) -> Result<TestStatus> {
+        self.do_tick();
+
+        match (self.test(), self.options.known_failure) {
+            (Ok(()), _) => (),
+            (Err(_), true) => return Ok(TestStatus::Finished),
+            (Err(err), false) => return Err(err),
+        }
+
+        match self.remaining_iterations {
+            0 => match (self.last_test(), self.options.known_failure) {
+                (Ok(()), true) => Err(anyhow!(
+                    "Test was known to be failing, but now passes successfully. Please update it and remove `known_failure = true`!",
+                )),
+                (Ok(()), false) => Ok(TestStatus::Finished),
+                (Err(_), true) => Ok(TestStatus::Finished),
+                (Err(err), false) => Err(err),
+            },
+            _ if self.options.sleep_to_meet_frame_rate => {
+                // If requested, ensure that the 'expected' amount of
+                // time actually elapses between frames. This is useful for
+                // tests that call 'flash.utils.getTimer()' and use
+                // 'setInterval'/'flash.utils.Timer'
+                //
+                // Note that when Ruffle actually runs frames, we can
+                // execute frames faster than this in order to 'catch up'
+                // if we've fallen behind. However, in order to make regression
+                // tests deterministic, we always call 'update_timers' with
+                // an elapsed time of 'frame_time'. By sleeping for 'frame_time_duration',
+                // we ensure that the result of 'flash.utils.getTimer()' is consistent
+                // with timer execution (timers will see an elapsed time of *at least*
+                // the requested timer interval).
+                Ok(TestStatus::Sleep(self.frame_time_duration))
+            }
+            _ => Ok(TestStatus::Continue),
+        }
+    }
+
+    fn do_tick(&mut self) {
         if !self
             .player
             .lock()
@@ -179,14 +235,10 @@ impl TestRunner {
         self.executor.run();
     }
 
-    pub fn is_preloaded(&self) -> bool {
-        self.preloaded
-    }
-
     /// After a tick, run any custom fdcommands that were queued up and perform any scheduled tests.
-    pub fn test(&mut self) -> Result<TestStatus> {
+    fn test(&mut self) -> Result<()> {
         if !self.preloaded {
-            return Ok(TestStatus::Continue);
+            return Ok(());
         }
         for command in self.fs_commands.try_iter() {
             match command {
@@ -225,86 +277,58 @@ impl TestRunner {
         // Rendering has side-effects (such as processing 'DisplayObject.scrollRect' updates)
         self.player.lock().unwrap().render();
 
-        if let Some(name) = self
-            .images
-            .iter()
-            .find(|(_k, v)| v.trigger == ImageTrigger::SpecificIteration(self.current_iteration))
-            .map(|(k, _v)| k.to_owned())
-        {
-            let image_comparison = self
-                .images
-                .remove(&name)
-                .expect("Name was just retrieved from map, should not be missing!");
+        let trigger = ImageTrigger::SpecificIteration(self.current_iteration);
+        if let Some((name, comp)) = self.take_image_comparison_by_trigger(trigger) {
             capture_and_compare_image(
                 &self.root_path,
                 &self.player,
                 &name,
-                image_comparison,
+                comp,
                 self.options.known_failure,
                 self.render_interface.as_deref(),
             )?;
         }
 
-        if self.remaining_iterations == 0 {
-            // Last iteration, let's check everything went well
-
-            if let Some(name) = self
-                .images
-                .iter()
-                .find(|(_k, v)| v.trigger == ImageTrigger::LastFrame)
-                .map(|(k, _v)| k.to_owned())
-            {
-                let image_comparison = self
-                    .images
-                    .remove(&name)
-                    .expect("Name was just retrieved from map, should not be missing!");
-
-                capture_and_compare_image(
-                    &self.root_path,
-                    &self.player,
-                    &name,
-                    image_comparison,
-                    self.options.known_failure,
-                    self.render_interface.as_deref(),
-                )?;
-            }
+        Ok(())
+    }
 
-            if !self.images.is_empty() {
-                return Err(anyhow!(
-                    "Image comparisons didn't trigger: {:?}",
-                    self.images.keys()
-                ));
-            }
+    fn last_test(&mut self) -> Result<()> {
+        // Last iteration, let's check everything went well
 
-            self.executor.run();
+        let trigger = ImageTrigger::LastFrame;
+        if let Some((name, comp)) = self.take_image_comparison_by_trigger(trigger) {
+            capture_and_compare_image(
+                &self.root_path,
+                &self.player,
+                &name,
+                comp,
+                self.options.known_failure,
+                self.render_interface.as_deref(),
+            )?;
+        }
 
-            let trace = self.log.trace_output();
-            // Null bytes are invisible, and interfere with constructing
-            // the expected output.txt file. Any tests dealing with null
-            // bytes should explicitly test for them in ActionScript.
-            let normalized_trace = trace.replace('\0', "");
-            compare_trace_output(&self.output_path, &self.options, &normalized_trace)?;
+        if !self.images.is_empty() {
+            return Err(anyhow!(
+                "Image comparisons didn't trigger: {:?}",
+                self.images.keys()
+            ));
         }
 
-        Ok(match self.remaining_iterations {
-            0 => TestStatus::Finished,
-            _ if self.options.sleep_to_meet_frame_rate => {
-                // If requested, ensure that the 'expected' amount of
-                // time actually elapses between frames. This is useful for
-                // tests that call 'flash.utils.getTimer()' and use
-                // 'setInterval'/'flash.utils.Timer'
-                //
-                // Note that when Ruffle actually runs frames, we can
-                // execute frames faster than this in order to 'catch up'
-                // if we've fallen behind. However, in order to make regression
-                // tests deterministic, we always call 'update_timers' with
-                // an elapsed time of 'frame_time'. By sleeping for 'frame_time_duration',
-                // we ensure that the result of 'flash.utils.getTimer()' is consistent
-                // with timer execution (timers will see an elapsed time of *at least*
-                // the requested timer interval).
-                TestStatus::Sleep(self.frame_time_duration)
-            }
-            _ => TestStatus::Continue,
-        })
+        self.executor.run();
+
+        let trace = self.log.trace_output();
+        // Null bytes are invisible, and interfere with constructing
+        // the expected output.txt file. Any tests dealing with null
+        // bytes should explicitly test for them in ActionScript.
+        let normalized_trace = trace.replace('\0', "");
+        compare_trace_output(&self.output_path, &self.options, &normalized_trace)?;
+        Ok(())
+    }
+
+    fn take_image_comparison_by_trigger(
+        &mut self,
+        trigger: ImageTrigger,
+    ) -> Option<(String, ImageComparison)> {
+        self.images.extract_if(|_k, v| v.trigger == trigger).next()
     }
 }
diff --git a/tests/tests/external_interface/tests.rs b/tests/tests/external_interface/tests.rs
index 608f3eb9df6b..b55dfaa4eae3 100644
--- a/tests/tests/external_interface/tests.rs
+++ b/tests/tests/external_interface/tests.rs
@@ -30,17 +30,16 @@ pub fn external_interface_avm1(
     let mut first = true;
 
     loop {
-        runner.tick();
-        if !runner.is_preloaded() {
-            continue;
-        }
-
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
         }
 
+        if !runner.is_preloaded() {
+            continue;
+        }
+
         if first {
             first = false;
             let mut player_locked = runner.player().lock().unwrap();
@@ -102,17 +101,16 @@ pub fn external_interface_avm2(
     let mut first = true;
 
     loop {
-        runner.tick();
-        if !runner.is_preloaded() {
-            continue;
-        }
-
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
         }
 
+        if !runner.is_preloaded() {
+            continue;
+        }
+
         if first {
             first = false;
             let mut player_locked = runner.player().lock().unwrap();
diff --git a/tests/tests/regression_tests.rs b/tests/tests/regression_tests.rs
index cf9157b47e33..b813bd17bde7 100644
--- a/tests/tests/regression_tests.rs
+++ b/tests/tests/regression_tests.rs
@@ -6,7 +6,6 @@ use crate::environment::NativeEnvironment;
 use crate::external_interface::tests::{external_interface_avm1, external_interface_avm2};
 use crate::shared_object::{shared_object_avm1, shared_object_avm2, shared_object_self_ref_avm1};
 use anyhow::Context;
-use anyhow::Result;
 use clap::Parser;
 use libtest_mimic::Trial;
 use ruffle_fs_tests_runner::FsTestsRunner;
@@ -15,7 +14,6 @@ use ruffle_test_framework::runner::TestStatus;
 use ruffle_test_framework::test::Test;
 use ruffle_test_framework::vfs::VfsPath;
 use std::borrow::Cow;
-use std::panic::{AssertUnwindSafe, catch_unwind, resume_unwind};
 use std::path::PathBuf;
 use std::thread::sleep;
 
@@ -116,32 +114,13 @@ fn trial_for_test(opts: &RuffleTestOpts, test: Test, list_only: bool) -> Trial {
     }
 
     let trial = Trial::test(test.name.clone(), move || {
-        let test = AssertUnwindSafe(test);
-        let unwind_result = catch_unwind(|| {
-            let mut runner = test.create_test_runner(&NativeEnvironment)?;
-
-            loop {
-                runner.tick();
-                match runner.test()? {
-                    TestStatus::Continue => {}
-                    TestStatus::Sleep(duration) => sleep(duration),
-                    TestStatus::Finished => break,
-                }
-            }
+        let mut runner = test.create_test_runner(&NativeEnvironment)?;
 
-            Result::<_>::Ok(())
-        });
-        if test.options.known_failure {
-            match unwind_result {
-                Ok(Ok(())) => Err(
-                    format!("{} was known to be failing, but now passes successfully. Please update it and remove `known_failure = true`!", test.name).into()
-                ),
-                Ok(Err(_)) | Err(_) => Ok(()),
-            }
-        } else {
-            match unwind_result {
-                Ok(r) => Ok(r?),
-                Err(e) => resume_unwind(e),
+        loop {
+            match runner.tick()? {
+                TestStatus::Continue => (),
+                TestStatus::Sleep(duration) => sleep(duration),
+                TestStatus::Finished => break Ok(()),
             }
         }
     });
diff --git a/tests/tests/shared_object/mod.rs b/tests/tests/shared_object/mod.rs
index f248a08d481e..89b76dd7012e 100644
--- a/tests/tests/shared_object/mod.rs
+++ b/tests/tests/shared_object/mod.rs
@@ -25,8 +25,7 @@ pub fn shared_object_avm1(environment: &impl Environment) -> Result<(), libtest_
     let mut runner = test1.create_test_runner(environment)?;
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
@@ -67,8 +66,7 @@ pub fn shared_object_avm1(environment: &impl Environment) -> Result<(), libtest_
     }
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
@@ -99,8 +97,7 @@ pub fn shared_object_self_ref_avm1(
     let mut runner = test1.create_test_runner(environment)?;
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
@@ -140,8 +137,7 @@ pub fn shared_object_self_ref_avm1(
     }
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
@@ -170,8 +166,7 @@ pub fn shared_object_avm2(environment: &impl Environment) -> Result<(), libtest_
     let mut runner = test1.create_test_runner(environment)?;
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,
@@ -211,8 +206,7 @@ pub fn shared_object_avm2(environment: &impl Environment) -> Result<(), libtest_
     }
 
     loop {
-        runner.tick();
-        match runner.test()? {
+        match runner.tick()? {
             TestStatus::Continue => {}
             TestStatus::Sleep(duration) => sleep(duration),
             TestStatus::Finished => break,

From f50890d72681c19721b7430804a9d9f6852687a5 Mon Sep 17 00:00:00 2001
From: Moulins <arthur.heuillard@orange.fr>
Date: Thu, 27 Nov 2025 16:35:45 +0100
Subject: [PATCH 2/4] tests: distinguish between panicky and non-panicky
 known_failure tests

Tests that are expected to panic are now required to explicitly say so
in their `test.toml`, by using `known_failure.panic = "expected message"`.

Additionally, only panics raised during player ticking are caught, and not those
happening elsewhere (e.g. in rendering, or in test framework code).
---
 tests/README.md                               |  3 +
 tests/framework/src/options.rs                | 10 ++-
 tests/framework/src/options/known_failure.rs  | 53 +++++++++++++++
 tests/framework/src/runner.rs                 | 65 ++++++++++++-------
 tests/framework/src/test.rs                   |  2 +-
 tests/tests/regression_tests.rs               |  2 +-
 .../pixelDissolve/test.toml                   |  2 +-
 .../action_order/PlaceAndRemove/test.toml     |  2 +-
 .../get_frame_number_test/test.toml           |  2 +-
 .../matrix_accuracy_test1/test.toml           |  2 +-
 10 files changed, 112 insertions(+), 31 deletions(-)
 create mode 100644 tests/framework/src/options/known_failure.rs

diff --git a/tests/README.md b/tests/README.md
index 7dc223d8cc65..a5343eb14b73 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -37,6 +37,9 @@ ignore = false
 
 # If true, this test is known to fail and the test runner will expect it to fail.
 # When the test passes in the future, it'll fail and alert that it now passes.
+# This will not catch Ruffle panics; if the test is expected to panic, use
+#   `known_failure.panic = "panic message"`
+# instead.
 known_failure = false
 
 # Path (relative to the directory containing test.toml) to the expected output
diff --git a/tests/framework/src/options.rs b/tests/framework/src/options.rs
index f4fb1d7fffb2..f8fa49b33d43 100644
--- a/tests/framework/src/options.rs
+++ b/tests/framework/src/options.rs
@@ -2,12 +2,14 @@ pub mod approximations;
 pub mod expression;
 pub mod font;
 pub mod image_comparison;
+pub mod known_failure;
 pub mod player;
 
 use crate::image_trigger::ImageTrigger;
 use crate::options::approximations::Approximations;
 use crate::options::font::{DefaultFontsOptions, FontOptions, FontSortOptions};
 use crate::options::image_comparison::ImageComparison;
+use crate::options::known_failure::KnownFailure;
 use crate::options::player::PlayerOptions;
 use anyhow::{Result, bail};
 use serde::Deserialize;
@@ -68,7 +70,7 @@ pub struct TestOptions {
     pub sleep_to_meet_frame_rate: bool,
     pub image_comparisons: HashMap<String, ImageComparison>,
     pub ignore: bool,
-    pub known_failure: bool,
+    pub known_failure: KnownFailure,
     pub approximations: Option<Approximations>,
     pub player_options: PlayerOptions,
     pub log_fetch: bool,
@@ -89,7 +91,7 @@ impl Default for TestOptions {
             sleep_to_meet_frame_rate: false,
             image_comparisons: Default::default(),
             ignore: false,
-            known_failure: false,
+            known_failure: KnownFailure::None,
             approximations: None,
             player_options: PlayerOptions::default(),
             log_fetch: false,
@@ -178,6 +180,10 @@ impl TestOptions {
         Ok(())
     }
 
+    pub fn has_known_failure(&self) -> bool {
+        !matches!(self.known_failure, KnownFailure::None)
+    }
+
     pub fn output_path(&self, test_directory: &VfsPath) -> Result<VfsPath> {
         Ok(test_directory.join(&self.output_path)?)
     }
diff --git a/tests/framework/src/options/known_failure.rs b/tests/framework/src/options/known_failure.rs
new file mode 100644
index 000000000000..51a0c0847a71
--- /dev/null
+++ b/tests/framework/src/options/known_failure.rs
@@ -0,0 +1,53 @@
+use std::fmt;
+
+use serde::{
+    Deserialize, Deserializer,
+    de::{self, value::MapAccessDeserializer},
+};
+
+#[derive(Clone, Debug, Default)]
+pub enum KnownFailure {
+    #[default]
+    None,
+    AnyCheck,
+    Panic {
+        message: String,
+    },
+}
+
+impl<'de> Deserialize<'de> for KnownFailure {
+    fn deserialize<D: Deserializer<'de>>(deser: D) -> Result<Self, D::Error> {
+        deser.deserialize_any(KnownFailureVisitor)
+    }
+}
+
+struct KnownFailureVisitor;
+
+impl<'de> de::Visitor<'de> for KnownFailureVisitor {
+    type Value = KnownFailure;
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str("a boolean, or `.panic = 'message'`")
+    }
+
+    fn visit_bool<E: de::Error>(self, v: bool) -> Result<Self::Value, E> {
+        if v {
+            Ok(KnownFailure::AnyCheck)
+        } else {
+            Ok(KnownFailure::None)
+        }
+    }
+
+    fn visit_map<A: de::MapAccess<'de>>(self, map: A) -> Result<Self::Value, A::Error> {
+        #[derive(Deserialize)]
+        #[serde(deny_unknown_fields)]
+        enum Raw {
+            #[serde(rename = "panic")]
+            Panic(String),
+        }
+
+        match Raw::deserialize(MapAccessDeserializer::new(map))? {
+            Raw::Panic(message) => Ok(KnownFailure::Panic { message }),
+        }
+    }
+}
diff --git a/tests/framework/src/runner.rs b/tests/framework/src/runner.rs
index 97edfc3f8c48..88b3b0924faf 100644
--- a/tests/framework/src/runner.rs
+++ b/tests/framework/src/runner.rs
@@ -8,6 +8,7 @@ use crate::fs_commands::{FsCommand, TestFsCommandProvider};
 use crate::image_trigger::ImageTrigger;
 use crate::options::TestOptions;
 use crate::options::image_comparison::ImageComparison;
+use crate::options::known_failure::KnownFailure;
 use crate::runner::automation::perform_automated_event;
 use crate::runner::image_test::capture_and_compare_image;
 use crate::runner::trace::compare_trace_output;
@@ -20,6 +21,8 @@ use ruffle_core::{Player, PlayerBuilder};
 use ruffle_input_format::InputInjector;
 use ruffle_render::backend::{RenderBackend, ViewportDimensions};
 use ruffle_socket_format::SocketEvent;
+use std::any::Any;
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex, mpsc};
 use std::time::Duration;
@@ -163,33 +166,39 @@ impl TestRunner {
     pub fn tick(&mut self) -> Result<TestStatus> {
         use std::panic::{AssertUnwindSafe, catch_unwind, resume_unwind};
 
-        let unwind_result = catch_unwind(AssertUnwindSafe(|| self.tick_inner()));
-    
-        match (unwind_result, self.options.known_failure) {
-            (Ok(ret), _) => ret,
-            // known_failure tests may pass by panicking.
-            (Err(_), true) =>  Ok(TestStatus::Finished),
-            (Err(panic), false) => resume_unwind(panic),
+        let unwind_result = catch_unwind(AssertUnwindSafe(|| self.do_tick()));
+        match (unwind_result, &self.options.known_failure) {
+            (Ok(()), _) => (),
+            (Err(panic), KnownFailure::Panic { message }) => {
+                let actual = panic_payload_as_string(panic);
+                if actual.contains(message) {
+                    return Ok(TestStatus::Finished);
+                }
+
+                let mut actual = actual.into_owned();
+                actual.push_str("\n\nnote: expected panic message to contain: ");
+                actual.push_str(message);
+                resume_unwind(Box::new(actual))
+            }
+            (Err(panic), _) => resume_unwind(panic),
         }
-    }
-    
-    fn tick_inner(&mut self) -> Result<TestStatus> {
-        self.do_tick();
 
-        match (self.test(), self.options.known_failure) {
+        match (self.test(), &self.options.known_failure) {
             (Ok(()), _) => (),
-            (Err(_), true) => return Ok(TestStatus::Finished),
-            (Err(err), false) => return Err(err),
+            (Err(_), KnownFailure::AnyCheck) => return Ok(TestStatus::Finished),
+            (Err(err), _) => return Err(err),
         }
 
-        match self.remaining_iterations {
-            0 => match (self.last_test(), self.options.known_failure) {
-                (Ok(()), true) => Err(anyhow!(
+        match (self.remaining_iterations, &self.options.known_failure) {
+            (0, KnownFailure::None) => self.last_test().map(|_| TestStatus::Finished),
+            (0, KnownFailure::Panic { .. }) => Err(anyhow!(
+                "Test was known to be panicking, but now finishes successfully. Please update it and remove `known_failure.panic = '...'`!",
+            )),
+            (0, KnownFailure::AnyCheck) => match self.last_test() {
+                Ok(()) => Err(anyhow!(
                     "Test was known to be failing, but now passes successfully. Please update it and remove `known_failure = true`!",
                 )),
-                (Ok(()), false) => Ok(TestStatus::Finished),
-                (Err(_), true) => Ok(TestStatus::Finished),
-                (Err(err), false) => Err(err),
+                Err(_) => Ok(TestStatus::Finished),
             },
             _ if self.options.sleep_to_meet_frame_rate => {
                 // If requested, ensure that the 'expected' amount of
@@ -258,7 +267,7 @@ impl TestRunner {
                             &self.player,
                             &name,
                             image_comparison,
-                            self.options.known_failure,
+                            matches!(self.options.known_failure, KnownFailure::AnyCheck),
                             self.render_interface.as_deref(),
                         )?;
                     } else {
@@ -284,7 +293,7 @@ impl TestRunner {
                 &self.player,
                 &name,
                 comp,
-                self.options.known_failure,
+                matches!(self.options.known_failure, KnownFailure::AnyCheck),
                 self.render_interface.as_deref(),
             )?;
         }
@@ -302,7 +311,7 @@ impl TestRunner {
                 &self.player,
                 &name,
                 comp,
-                self.options.known_failure,
+                matches!(self.options.known_failure, KnownFailure::AnyCheck),
                 self.render_interface.as_deref(),
             )?;
         }
@@ -332,3 +341,13 @@ impl TestRunner {
         self.images.extract_if(|_k, v| v.trigger == trigger).next()
     }
 }
+
+fn panic_payload_as_string(panic: Box<dyn Any + Send + 'static>) -> Cow<'static, str> {
+    if let Some(s) = panic.downcast_ref::<&str>() {
+        (*s).into()
+    } else if let Ok(s) = panic.downcast::<String>() {
+        (*s).into()
+    } else {
+        "<opaque payload>".into()
+    }
+}
diff --git a/tests/framework/src/test.rs b/tests/framework/src/test.rs
index 4969006d0766..d526097addaf 100644
--- a/tests/framework/src/test.rs
+++ b/tests/framework/src/test.rs
@@ -142,7 +142,7 @@ impl Test {
         if self.options.ignore {
             return false;
         }
-        if ignore_known_failures && self.options.known_failure {
+        if ignore_known_failures && self.options.has_known_failure() {
             return false;
         }
         self.options.required_features.can_run()
diff --git a/tests/tests/regression_tests.rs b/tests/tests/regression_tests.rs
index b813bd17bde7..1caccdc53c1e 100644
--- a/tests/tests/regression_tests.rs
+++ b/tests/tests/regression_tests.rs
@@ -106,7 +106,7 @@ fn trial_for_test(opts: &RuffleTestOpts, test: Test, list_only: bool) -> Trial {
     // Put extra info into the test 'kind' instead of appending it to the test name,
     // to not break `cargo test some/test -- --exact` and `cargo test -- --list`.
     let mut test_kind = String::new();
-    if test.options.known_failure {
+    if test.options.has_known_failure() {
         test_kind.push('!');
     }
     if let Some(name) = &test.options.subtest_name {
diff --git a/tests/tests/swfs/avm1/bitmap_data_thorough/pixelDissolve/test.toml b/tests/tests/swfs/avm1/bitmap_data_thorough/pixelDissolve/test.toml
index 77f4c0e41c1f..0ba9a09e047c 100644
--- a/tests/tests/swfs/avm1/bitmap_data_thorough/pixelDissolve/test.toml
+++ b/tests/tests/swfs/avm1/bitmap_data_thorough/pixelDissolve/test.toml
@@ -1,2 +1,2 @@
 num_frames = 1
-known_failure = true
+known_failure.panic = "attempt to add with overflow"
diff --git a/tests/tests/swfs/from_gnash/misc-ming.all/action_order/PlaceAndRemove/test.toml b/tests/tests/swfs/from_gnash/misc-ming.all/action_order/PlaceAndRemove/test.toml
index a78fc2707fae..c2620d7ef9f1 100644
--- a/tests/tests/swfs/from_gnash/misc-ming.all/action_order/PlaceAndRemove/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-ming.all/action_order/PlaceAndRemove/test.toml
@@ -1,5 +1,5 @@
 num_frames = 30
-known_failure = true
+known_failure.panic = "Gotos must start from the correct tag position for frame 1"
 
 [subtests.fp9]
 output_path = "output.fp9.txt"
diff --git a/tests/tests/swfs/from_gnash/misc-ming.all/get_frame_number_test/test.toml b/tests/tests/swfs/from_gnash/misc-ming.all/get_frame_number_test/test.toml
index 11e963f5a927..f70edd57f4bf 100644
--- a/tests/tests/swfs/from_gnash/misc-ming.all/get_frame_number_test/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-ming.all/get_frame_number_test/test.toml
@@ -1,2 +1,2 @@
 num_frames = 30
-known_failure = true
+known_failure.panic = "attempt to add with overflow"
diff --git a/tests/tests/swfs/from_gnash/misc-swfc.all/matrix_accuracy_test1/test.toml b/tests/tests/swfs/from_gnash/misc-swfc.all/matrix_accuracy_test1/test.toml
index a78fc2707fae..0a8d82350586 100644
--- a/tests/tests/swfs/from_gnash/misc-swfc.all/matrix_accuracy_test1/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-swfc.all/matrix_accuracy_test1/test.toml
@@ -1,5 +1,5 @@
 num_frames = 30
-known_failure = true
+known_failure.panic = "attempt to subtract with overflow"
 
 [subtests.fp9]
 output_path = "output.fp9.txt"

From 557fe020addac106c6b60df4eab053c8816fb6b7 Mon Sep 17 00:00:00 2001
From: Moulins <arthur.heuillard@orange.fr>
Date: Fri, 28 Nov 2025 15:48:28 +0100
Subject: [PATCH 3/4] tests: make `known_failure = true` check-specific

Root-level `known_failure = true` now only expects the trace output check to
fail; if other checks (e.g. image comparisons) are expected to fail, they should
specify their own `known_failure = true`.
---
 tests/README.md                               |  7 +-
 tests/framework/src/options.rs                |  1 +
 .../framework/src/options/image_comparison.rs |  1 +
 tests/framework/src/options/known_failure.rs  |  4 +-
 tests/framework/src/runner.rs                 | 41 ++++------
 tests/framework/src/runner/image_test.rs      | 80 ++++++++++---------
 tests/framework/src/runner/trace.rs           | 37 +++++++--
 .../avm2/bitmapdata_draw_filters/test.toml    |  2 +-
 .../embed_matching/no_font_found/test.toml    |  2 +-
 .../misc-ming.all/BeginBitmapFill/test.toml   |  2 +-
 .../misc-ming.all/place_object_test/test.toml |  1 +
 .../misc-ming.all/shape_test/test.toml        |  1 +
 .../swfs/from_shumway/MaskTest-2/test.toml    |  2 +-
 .../acid-bitmap-draw_quality_low/test.toml    |  1 +
 .../acid/acid-shapes-testing/test.toml        |  1 +
 .../from_shumway/acid/acid-stroke-0/test.toml |  2 +-
 .../from_shumway/acid/acid-text-4/test.toml   |  2 +-
 .../from_shumway/acid/acid-text-5/test.toml   |  2 +-
 .../acid/acid-textfield/test.toml             |  1 +
 .../swfs/from_shumway/captions/test.toml      |  2 +-
 .../flash_text_TextField/test.toml            |  1 +
 .../flash_text_TextField2/test.toml           |  1 +
 .../from_shumway/gradientTransform/test.toml  |  2 +-
 .../swfs/from_shumway/hardwrap/test.toml      |  2 +-
 .../swfs/from_shumway/stylesheet/test.toml    |  2 +-
 .../oversize/swf_10_too_big/test.toml         |  2 +-
 .../oversize/swf_9_too_big/test.toml          |  2 +-
 .../simple_shapes/strokes/scale/test.toml     |  2 +-
 28 files changed, 118 insertions(+), 88 deletions(-)

diff --git a/tests/README.md b/tests/README.md
index a5343eb14b73..b1b5fdc910ac 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -35,7 +35,8 @@ sleep_to_meet_frame_rate = false
 # Prefer setting `known_failure = true` to ignoring the test.
 ignore = false
 
-# If true, this test is known to fail and the test runner will expect it to fail.
+# If true, this test is known to fail and the test runner will expect the check against
+# the trace output (specified `output_path`) to fail.
 # When the test passes in the future, it'll fail and alert that it now passes.
 # This will not catch Ruffle panics; if the test is expected to panic, use
 #   `known_failure.panic = "panic message"`
@@ -100,6 +101,10 @@ with_default_font = false
 # This requires a render to be setup for this test
 [image_comparisons.COMPARISON_NAME] # COMPARISON_NAME is a name of this particular image
 
+# If true, this image comparison is known to fail and the test runner will expect it to fail.
+# When the comparison passes in the future, the test will fail and alert that it now passes.
+known_failure = false
+
 # The tolerance per pixel channel to be considered "the same".
 # Increase as needed with tests that aren't pixel perfect across platforms.
 # Prefer running tests with higher sample count to make a better use of this option.
diff --git a/tests/framework/src/options.rs b/tests/framework/src/options.rs
index f8fa49b33d43..7ba749fe5119 100644
--- a/tests/framework/src/options.rs
+++ b/tests/framework/src/options.rs
@@ -182,6 +182,7 @@ impl TestOptions {
 
     pub fn has_known_failure(&self) -> bool {
         !matches!(self.known_failure, KnownFailure::None)
+            || self.image_comparisons.values().any(|cmp| cmp.known_failure)
     }
 
     pub fn output_path(&self, test_directory: &VfsPath) -> Result<VfsPath> {
diff --git a/tests/framework/src/options/image_comparison.rs b/tests/framework/src/options/image_comparison.rs
index ec9c442b8007..d65a3041e90a 100644
--- a/tests/framework/src/options/image_comparison.rs
+++ b/tests/framework/src/options/image_comparison.rs
@@ -11,6 +11,7 @@ pub struct ImageComparison {
     max_outliers: Option<usize>,
     checks: Vec<ImageComparisonCheck>,
     pub trigger: ImageTrigger,
+    pub known_failure: bool,
 }
 
 impl ImageComparison {
diff --git a/tests/framework/src/options/known_failure.rs b/tests/framework/src/options/known_failure.rs
index 51a0c0847a71..372fc5e1bb8f 100644
--- a/tests/framework/src/options/known_failure.rs
+++ b/tests/framework/src/options/known_failure.rs
@@ -9,7 +9,7 @@ use serde::{
 pub enum KnownFailure {
     #[default]
     None,
-    AnyCheck,
+    TraceOutput,
     Panic {
         message: String,
     },
@@ -32,7 +32,7 @@ impl<'de> de::Visitor<'de> for KnownFailureVisitor {
 
     fn visit_bool<E: de::Error>(self, v: bool) -> Result<Self::Value, E> {
         if v {
-            Ok(KnownFailure::AnyCheck)
+            Ok(KnownFailure::TraceOutput)
         } else {
             Ok(KnownFailure::None)
         }
diff --git a/tests/framework/src/runner.rs b/tests/framework/src/runner.rs
index 88b3b0924faf..2504b92aab6c 100644
--- a/tests/framework/src/runner.rs
+++ b/tests/framework/src/runner.rs
@@ -183,23 +183,10 @@ impl TestRunner {
             (Err(panic), _) => resume_unwind(panic),
         }
 
-        match (self.test(), &self.options.known_failure) {
-            (Ok(()), _) => (),
-            (Err(_), KnownFailure::AnyCheck) => return Ok(TestStatus::Finished),
-            (Err(err), _) => return Err(err),
-        }
+        self.test()?;
 
-        match (self.remaining_iterations, &self.options.known_failure) {
-            (0, KnownFailure::None) => self.last_test().map(|_| TestStatus::Finished),
-            (0, KnownFailure::Panic { .. }) => Err(anyhow!(
-                "Test was known to be panicking, but now finishes successfully. Please update it and remove `known_failure.panic = '...'`!",
-            )),
-            (0, KnownFailure::AnyCheck) => match self.last_test() {
-                Ok(()) => Err(anyhow!(
-                    "Test was known to be failing, but now passes successfully. Please update it and remove `known_failure = true`!",
-                )),
-                Err(_) => Ok(TestStatus::Finished),
-            },
+        match self.remaining_iterations {
+            0 => self.last_test().map(|_| TestStatus::Finished),
             _ if self.options.sleep_to_meet_frame_rate => {
                 // If requested, ensure that the 'expected' amount of
                 // time actually elapses between frames. This is useful for
@@ -267,7 +254,6 @@ impl TestRunner {
                             &self.player,
                             &name,
                             image_comparison,
-                            matches!(self.options.known_failure, KnownFailure::AnyCheck),
                             self.render_interface.as_deref(),
                         )?;
                     } else {
@@ -293,7 +279,6 @@ impl TestRunner {
                 &self.player,
                 &name,
                 comp,
-                matches!(self.options.known_failure, KnownFailure::AnyCheck),
                 self.render_interface.as_deref(),
             )?;
         }
@@ -303,6 +288,12 @@ impl TestRunner {
 
     fn last_test(&mut self) -> Result<()> {
         // Last iteration, let's check everything went well
+        if let KnownFailure::Panic { .. } = &self.options.known_failure {
+            return Err(anyhow!(
+                "Test was known to be panicking, but now finishes successfully. \
+                Please update it and remove `known_failure.panic = '...'`!",
+            ));
+        }
 
         let trigger = ImageTrigger::LastFrame;
         if let Some((name, comp)) = self.take_image_comparison_by_trigger(trigger) {
@@ -311,7 +302,6 @@ impl TestRunner {
                 &self.player,
                 &name,
                 comp,
-                matches!(self.options.known_failure, KnownFailure::AnyCheck),
                 self.render_interface.as_deref(),
             )?;
         }
@@ -325,13 +315,12 @@ impl TestRunner {
 
         self.executor.run();
 
-        let trace = self.log.trace_output();
-        // Null bytes are invisible, and interfere with constructing
-        // the expected output.txt file. Any tests dealing with null
-        // bytes should explicitly test for them in ActionScript.
-        let normalized_trace = trace.replace('\0', "");
-        compare_trace_output(&self.output_path, &self.options, &normalized_trace)?;
-        Ok(())
+        compare_trace_output(
+            &self.log,
+            &self.output_path,
+            self.options.approximations.as_ref(),
+            matches!(self.options.known_failure, KnownFailure::TraceOutput),
+        )
     }
 
     fn take_image_comparison_by_trigger(
diff --git a/tests/framework/src/runner/image_test.rs b/tests/framework/src/runner/image_test.rs
index 8374d24b5c85..90fde67c0d7d 100644
--- a/tests/framework/src/runner/image_test.rs
+++ b/tests/framework/src/runner/image_test.rs
@@ -14,49 +14,56 @@ pub fn capture_and_compare_image(
     player: &Arc<Mutex<Player>>,
     name: &String,
     image_comparison: ImageComparison,
-    known_failure: bool,
     render_interface: Option<&dyn RenderInterface>,
 ) -> anyhow::Result<()> {
     use anyhow::Context;
 
-    if let Some(render_interface) = render_interface {
+    let Some(render_interface) = render_interface else {
+        return Ok(());
+    };
+
+    let actual_image = {
         let mut player_lock = player.lock().unwrap();
         player_lock.render();
+        render_interface.capture(player_lock.renderer_mut())
+    };
 
-        let actual_image = render_interface.capture(player_lock.renderer_mut());
-
-        let expected_image_path = base_path.join(format!("{name}.expected.png"))?;
-        if expected_image_path.is_file()? {
-            let expected_image = image::load_from_memory(&read_bytes(&expected_image_path)?)
+    let expected_image = {
+        let path = base_path.join(format!("{name}.expected.png"))?;
+        if path.is_file()? {
+            image::load_from_memory(&read_bytes(&path)?)
                 .context("Failed to open expected image")?
-                .into_rgba8();
-
-            test(
-                &image_comparison,
-                name,
-                actual_image,
-                expected_image,
-                base_path,
-                render_interface.name(),
-                known_failure,
-            )?;
-        } else if known_failure {
+                .into_rgba8()
+        } else if image_comparison.known_failure {
+            // If we're expecting this to be wrong, don't save a likely wrong image
+            return Err(anyhow!("Image '{name}': No image to compare to!"));
+        } else {
+            write_image(&path, &actual_image, ImageFormat::Png)?;
             return Err(anyhow!(
-                "No image to compare to, pretending this failed since we don't know if it worked."
+                "Image '{name}': No image to compare to! Saved actual image as expected."
             ));
-        } else {
-            // If we're expecting this to be wrong, don't save a likely wrong image
-            write_image(&expected_image_path, &actual_image, ImageFormat::Png)?;
         }
-    } else if known_failure {
-        // It's possible that the trace output matched but the image might not.
-        // If we aren't checking the image, pretend the match failed (which makes it actually pass, since it's expecting failure).
-        return Err(anyhow!(
-            "Not checking images, pretending this failed since we don't know if it worked."
-        ));
-    }
+    };
 
-    Ok(())
+    let result = test(
+        &image_comparison,
+        name,
+        actual_image,
+        expected_image,
+        base_path,
+        render_interface.name(),
+        // If we're expecting failure, spamming files isn't productive.
+        !image_comparison.known_failure,
+    );
+
+    match (result, image_comparison.known_failure) {
+        (result, false) => result,
+        (Ok(()), true) => Err(anyhow!(
+            "Image '{name}': Check was known to be failing, but now passes successfully. \
+            Please update the test and remove `known_failure = true`!",
+        )),
+        (Err(_), true) => Ok(()),
+    }
 }
 
 pub fn test(
@@ -66,13 +73,12 @@ pub fn test(
     expected_image: image::RgbaImage,
     test_path: &VfsPath,
     environment_name: String,
-    known_failure: bool,
+    save_failures: bool,
 ) -> anyhow::Result<()> {
     use anyhow::Context;
 
     let save_actual_image = || {
-        if !known_failure {
-            // If we're expecting failure, spamming files isn't productive.
+        if save_failures {
             write_image(
                 &test_path.join(format!("{name}.actual-{environment_name}.png"))?,
                 &actual_image,
@@ -141,8 +147,7 @@ pub fn test(
             difference_color.extend_from_slice(&p[..3]);
         }
 
-        if !known_failure {
-            // If we're expecting failure, spamming files isn't productive.
+        if save_failures {
             let difference_image = image::RgbImage::from_raw(
                 actual_image.width(),
                 actual_image.height(),
@@ -163,8 +168,7 @@ pub fn test(
                 difference_alpha.push(p[3])
             }
 
-            if !known_failure {
-                // If we're expecting failure, spamming files isn't productive.
+            if save_failures {
                 let difference_image = image::GrayImage::from_raw(
                     actual_image.width(),
                     actual_image.height(),
diff --git a/tests/framework/src/runner/trace.rs b/tests/framework/src/runner/trace.rs
index afbad1bcdc14..900ba5fb78a1 100644
--- a/tests/framework/src/runner/trace.rs
+++ b/tests/framework/src/runner/trace.rs
@@ -1,19 +1,42 @@
-use crate::options::TestOptions;
+use crate::backends::TestLogBackend;
+use crate::options::approximations::Approximations;
 use anyhow::{Error, anyhow};
 use pretty_assertions::Comparison;
 use vfs::VfsPath;
 
 pub fn compare_trace_output(
+    log: &TestLogBackend,
     expected_path: &VfsPath,
-    options: &TestOptions,
-    actual_output: &str,
+    approximations: Option<&Approximations>,
+    known_failure: bool,
 ) -> anyhow::Result<()> {
-    let expected_output = expected_path.read_to_string()?.replace("\r\n", "\n");
+    let expected_trace = expected_path.read_to_string()?.replace("\r\n", "\n");
+
+    // Null bytes are invisible, and interfere with constructing
+    // the expected output.txt file. Any tests dealing with null
+    // bytes should explicitly test for them in ActionScript.
+    let actual_trace = log.trace_output().replace('\0', "");
 
-    if let Some(approximations) = &options.approximations {
+    let result = test(&expected_trace, approximations, &actual_trace);
+    match (result, known_failure) {
+        (res, false) => res,
+        (Ok(()), true) => Err(anyhow!(
+            "Trace output check was known to be failing, but now passes successfully. \
+            Please update the test and remove `known_failure = true`!",
+        )),
+        (Err(_), true) => Ok(()),
+    }
+}
+
+pub fn test(
+    expected_output: &str,
+    approximations: Option<&Approximations>,
+    actual_output: &str,
+) -> anyhow::Result<()> {
+    if let Some(approximations) = approximations {
         let add_comparison_to_err = |err: Error| -> Error {
             let left_pretty = PrettyString(actual_output);
-            let right_pretty = PrettyString(&expected_output);
+            let right_pretty = PrettyString(expected_output);
             let comparison = Comparison::new(&left_pretty, &right_pretty);
 
             anyhow!("{}\n\n{}\n", err, comparison)
@@ -90,7 +113,7 @@ pub fn compare_trace_output(
             }
         }
     } else {
-        assert_text_matches(actual_output, &expected_output)?;
+        assert_text_matches(actual_output, expected_output)?;
     }
 
     Ok(())
diff --git a/tests/tests/swfs/avm2/bitmapdata_draw_filters/test.toml b/tests/tests/swfs/avm2/bitmapdata_draw_filters/test.toml
index 0d8096bcf008..02f201203057 100644
--- a/tests/tests/swfs/avm2/bitmapdata_draw_filters/test.toml
+++ b/tests/tests/swfs/avm2/bitmapdata_draw_filters/test.toml
@@ -1,9 +1,9 @@
 num_ticks = 1
 
+[image_comparisons.output]
 # FIXME Ruffle does not use CAB in BitmapData.draw
 known_failure = true
 
-[image_comparisons.output]
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/fonts/embed_matching/no_font_found/test.toml b/tests/tests/swfs/fonts/embed_matching/no_font_found/test.toml
index a4c2e0d01766..25c992837b44 100644
--- a/tests/tests/swfs/fonts/embed_matching/no_font_found/test.toml
+++ b/tests/tests/swfs/fonts/embed_matching/no_font_found/test.toml
@@ -1,9 +1,9 @@
 # There are no fonts embedded in this swf. It should not render anything at all, or error.
 
 num_frames = 1
-known_failure = true # Right now we intentionally fall back, because we don't support DefineFont4 embedded fonts yet
 
 [image_comparisons.output]
+known_failure = true # Right now we intentionally fall back, because we don't support DefineFont4 embedded fonts yet
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_gnash/misc-ming.all/BeginBitmapFill/test.toml b/tests/tests/swfs/from_gnash/misc-ming.all/BeginBitmapFill/test.toml
index 97481f0c8c6f..71498f5cf11b 100644
--- a/tests/tests/swfs/from_gnash/misc-ming.all/BeginBitmapFill/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-ming.all/BeginBitmapFill/test.toml
@@ -1,7 +1,7 @@
 num_frames = 30
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 50
 max_outliers = 100
 
diff --git a/tests/tests/swfs/from_gnash/misc-ming.all/place_object_test/test.toml b/tests/tests/swfs/from_gnash/misc-ming.all/place_object_test/test.toml
index 31d18cd1733b..466e3fefdfef 100644
--- a/tests/tests/swfs/from_gnash/misc-ming.all/place_object_test/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-ming.all/place_object_test/test.toml
@@ -2,6 +2,7 @@ num_frames = 30
 known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 50
 
 [player_options]
diff --git a/tests/tests/swfs/from_gnash/misc-ming.all/shape_test/test.toml b/tests/tests/swfs/from_gnash/misc-ming.all/shape_test/test.toml
index 31d18cd1733b..466e3fefdfef 100644
--- a/tests/tests/swfs/from_gnash/misc-ming.all/shape_test/test.toml
+++ b/tests/tests/swfs/from_gnash/misc-ming.all/shape_test/test.toml
@@ -2,6 +2,7 @@ num_frames = 30
 known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 50
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/MaskTest-2/test.toml b/tests/tests/swfs/from_shumway/MaskTest-2/test.toml
index 7d0149029a09..506ed6b349a9 100644
--- a/tests/tests/swfs/from_shumway/MaskTest-2/test.toml
+++ b/tests/tests/swfs/from_shumway/MaskTest-2/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/acid/acid-bitmap-draw_quality_low/test.toml b/tests/tests/swfs/from_shumway/acid/acid-bitmap-draw_quality_low/test.toml
index 255465176c04..b206c10aeb77 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-bitmap-draw_quality_low/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-bitmap-draw_quality_low/test.toml
@@ -2,6 +2,7 @@
 
 num_frames = 1
 
+[image_comparisons.output]
 # FIXME Even on low quality, Flash uses antialiasing here
 known_failure = true
 
diff --git a/tests/tests/swfs/from_shumway/acid/acid-shapes-testing/test.toml b/tests/tests/swfs/from_shumway/acid/acid-shapes-testing/test.toml
index 1d82bc434a06..66897470776e 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-shapes-testing/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-shapes-testing/test.toml
@@ -4,6 +4,7 @@ num_frames = 1
 known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12118
 
 [image_comparisons.output]
+known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12118
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/acid/acid-stroke-0/test.toml b/tests/tests/swfs/from_shumway/acid/acid-stroke-0/test.toml
index c9a311449b17..ec6e41730500 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-stroke-0/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-stroke-0/test.toml
@@ -1,9 +1,9 @@
 # Test adapted from Shumway at https://github.com/mozilla/shumway/tree/master/test/swfs/acid
 
 num_frames = 1
-known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12119
 
 [image_comparisons.output]
+known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12119
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/acid/acid-text-4/test.toml b/tests/tests/swfs/from_shumway/acid/acid-text-4/test.toml
index 5c7ec840b477..36b9eef174ad 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-text-4/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-text-4/test.toml
@@ -1,9 +1,9 @@
 # Test adapted from Shumway at https://github.com/mozilla/shumway/tree/master/test/swfs/acid
 
 num_frames = 1
-known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12121
 
 [image_comparisons.output]
+known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12121
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/acid/acid-text-5/test.toml b/tests/tests/swfs/from_shumway/acid/acid-text-5/test.toml
index 27dfd472a6ea..820a49c1f1d2 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-text-5/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-text-5/test.toml
@@ -1,9 +1,9 @@
 # Test adapted from Shumway at https://github.com/mozilla/shumway/tree/master/test/swfs/acid
 
 num_frames = 1
-known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12122
 
 [image_comparisons.output]
+known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12122
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/acid/acid-textfield/test.toml b/tests/tests/swfs/from_shumway/acid/acid-textfield/test.toml
index fb20d52f8d84..0f96371d9850 100644
--- a/tests/tests/swfs/from_shumway/acid/acid-textfield/test.toml
+++ b/tests/tests/swfs/from_shumway/acid/acid-textfield/test.toml
@@ -4,6 +4,7 @@ num_frames = 1
 known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12123
 
 [image_comparisons.output]
+known_failure = true # https://github.com/ruffle-rs/ruffle/issues/12123
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/captions/test.toml b/tests/tests/swfs/from_shumway/captions/test.toml
index 7d0149029a09..506ed6b349a9 100644
--- a/tests/tests/swfs/from_shumway/captions/test.toml
+++ b/tests/tests/swfs/from_shumway/captions/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/flash_text_TextField/test.toml b/tests/tests/swfs/from_shumway/flash_text_TextField/test.toml
index 7d0149029a09..c90160d16054 100644
--- a/tests/tests/swfs/from_shumway/flash_text_TextField/test.toml
+++ b/tests/tests/swfs/from_shumway/flash_text_TextField/test.toml
@@ -2,6 +2,7 @@ num_frames = 1
 known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/flash_text_TextField2/test.toml b/tests/tests/swfs/from_shumway/flash_text_TextField2/test.toml
index 7d0149029a09..c90160d16054 100644
--- a/tests/tests/swfs/from_shumway/flash_text_TextField2/test.toml
+++ b/tests/tests/swfs/from_shumway/flash_text_TextField2/test.toml
@@ -2,6 +2,7 @@ num_frames = 1
 known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/gradientTransform/test.toml b/tests/tests/swfs/from_shumway/gradientTransform/test.toml
index 7d0149029a09..506ed6b349a9 100644
--- a/tests/tests/swfs/from_shumway/gradientTransform/test.toml
+++ b/tests/tests/swfs/from_shumway/gradientTransform/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/hardwrap/test.toml b/tests/tests/swfs/from_shumway/hardwrap/test.toml
index 7d0149029a09..506ed6b349a9 100644
--- a/tests/tests/swfs/from_shumway/hardwrap/test.toml
+++ b/tests/tests/swfs/from_shumway/hardwrap/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 3
 
 [player_options]
diff --git a/tests/tests/swfs/from_shumway/stylesheet/test.toml b/tests/tests/swfs/from_shumway/stylesheet/test.toml
index c323f8252b79..db13602e4647 100644
--- a/tests/tests/swfs/from_shumway/stylesheet/test.toml
+++ b/tests/tests/swfs/from_shumway/stylesheet/test.toml
@@ -1,7 +1,7 @@
 num_ticks = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 5
 
 [player_options]
diff --git a/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_10_too_big/test.toml b/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_10_too_big/test.toml
index 3e65a70aad8f..09e13a22d472 100644
--- a/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_10_too_big/test.toml
+++ b/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_10_too_big/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_9_too_big/test.toml b/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_9_too_big/test.toml
index 3e65a70aad8f..09e13a22d472 100644
--- a/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_9_too_big/test.toml
+++ b/tests/tests/swfs/visual/cache_as_bitmap/oversize/swf_9_too_big/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 0
 
 [player_options]
diff --git a/tests/tests/swfs/visual/simple_shapes/strokes/scale/test.toml b/tests/tests/swfs/visual/simple_shapes/strokes/scale/test.toml
index 6bca5ae19375..3b91203f03d9 100644
--- a/tests/tests/swfs/visual/simple_shapes/strokes/scale/test.toml
+++ b/tests/tests/swfs/visual/simple_shapes/strokes/scale/test.toml
@@ -1,7 +1,7 @@
 num_frames = 1
-known_failure = true
 
 [image_comparisons.output]
+known_failure = true
 tolerance = 0
 
 [player_options]

From dc5dd76d1d3a4fe5f204e72b372840f98e045c06 Mon Sep 17 00:00:00 2001
From: Moulins <arthur.heuillard@orange.fr>
Date: Sat, 29 Nov 2025 23:20:16 +0100
Subject: [PATCH 4/4] tests: Don't run panicky tests if debug assertions are
 disabled

---
 tests/README.md             |  3 ++-
 tests/framework/src/test.rs | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/README.md b/tests/README.md
index b1b5fdc910ac..d04b9f9e9627 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -40,7 +40,8 @@ ignore = false
 # When the test passes in the future, it'll fail and alert that it now passes.
 # This will not catch Ruffle panics; if the test is expected to panic, use
 #   `known_failure.panic = "panic message"`
-# instead.
+# instead (note that 'panicky' tests will be skipped if the test harness is run
+# with debug assertions disabled, e.g. with `--release`).
 known_failure = false
 
 # Path (relative to the directory containing test.toml) to the expected output
diff --git a/tests/framework/src/test.rs b/tests/framework/src/test.rs
index d526097addaf..2bee291a7df9 100644
--- a/tests/framework/src/test.rs
+++ b/tests/framework/src/test.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use crate::environment::Environment;
 use crate::options::TestOptions;
+use crate::options::known_failure::KnownFailure;
 use crate::runner::TestRunner;
 use crate::util::read_bytes;
 use anyhow::{Result, anyhow};
@@ -145,6 +146,15 @@ impl Test {
         if ignore_known_failures && self.options.has_known_failure() {
             return false;
         }
+
+        // Panicky tests may expect to hit a debug assertion, so don't run them
+        // if assertions are disabled.
+        if !cfg!(debug_assertions)
+            && matches!(self.options.known_failure, KnownFailure::Panic { .. })
+        {
+            return false;
+        }
+
         self.options.required_features.can_run()
             && self
                 .options