diff --git a/Project.toml b/Project.toml
index e671e0f9da..85a7d40b89 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TrixiParticles"
 uuid = "66699cd8-9c01-4e9d-a059-b96c86d16b3a"
-authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 version = "0.4.2"
+authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -51,7 +51,7 @@ FastPow = "0.1"
 FileIO = "1"
 ForwardDiff = "1"
 GPUArraysCore = "0.2"
-IntervalSets = "0.7 - 0.7.11"
+IntervalSets = "0.7.13"
 JSON = "1"
 KernelAbstractions = "0.9"
 MuladdMacro = "0.2"
diff --git a/examples/fluid/dam_break_2d.jl b/examples/fluid/dam_break_2d.jl
index 0a8a9474e3..db4c92a511 100644
--- a/examples/fluid/dam_break_2d.jl
+++ b/examples/fluid/dam_break_2d.jl
@@ -37,7 +37,7 @@ tspan = (0.0, 5.7 / sqrt(gravity / H))
 
 # Boundary geometry and initial fluid particle positions
 initial_fluid_size = (W, H)
-tank_size = (floor(5.366 * H / boundary_particle_spacing) * boundary_particle_spacing, 4.0)
+tank_size = (floor(5.366 * H / boundary_particle_spacing) * boundary_particle_spacing, 2.0+10*fluid_particle_spacing)
 
 fluid_density = 1000.0
 sound_speed = 20 * sqrt(gravity * H)
@@ -61,8 +61,8 @@ viscosity_fluid = ArtificialViscosityMonaghan(alpha=alpha, beta=0.0)
 # The density diffusion model by Molteni and Colagrossi shows unphysical effects at the
 # free surface in long-running simulations, but is significantly faster than the model
 # by Antuono. This simulation is short enough to use the faster model.
-density_diffusion = DensityDiffusionMolteniColagrossi(delta=0.1)
-# density_diffusion = DensityDiffusionAntuono(tank.fluid, delta=0.1)
+# density_diffusion = DensityDiffusionMolteniColagrossi(delta=0.1)
+density_diffusion = DensityDiffusionAntuono(tank.fluid, delta=0.1)
 
 fluid_system = WeaklyCompressibleSPHSystem(tank.fluid, fluid_density_calculator,
                                            state_equation, smoothing_kernel,
@@ -70,7 +70,8 @@ fluid_system = WeaklyCompressibleSPHSystem(tank.fluid, fluid_density_calculator,
                                            density_diffusion=density_diffusion,
                                            acceleration=(0.0, -gravity), correction=nothing,
                                            surface_tension=nothing,
-                                           reference_particle_spacing=0)
+                                           reference_particle_spacing=fluid_particle_spacing,
+                                           shifting_technique=nothing)
 
 # ==========================================================================================
 # ==== Boundary
@@ -83,7 +84,7 @@ boundary_model = BoundaryModelDummyParticles(tank.boundary.density, tank.boundar
                                              boundary_density_calculator,
                                              smoothing_kernel, smoothing_length,
                                              correction=nothing,
-                                             reference_particle_spacing=0,
+                                             reference_particle_spacing=fluid_particle_spacing,
                                              viscosity=viscosity_wall)
 
 boundary_system = WallBoundarySystem(tank.boundary, boundary_model,
@@ -93,7 +94,9 @@ boundary_system = WallBoundarySystem(tank.boundary, boundary_model,
 # ==== Simulation
 # `nothing` will automatically choose the best update strategy. This is only to be able
 # to change this with `trixi_include`.
-semi = Semidiscretization(fluid_system, boundary_system,
+extra_system = nothing
+extra_system2 = nothing
+semi = Semidiscretization(fluid_system, boundary_system, extra_system, extra_system2,
                           neighborhood_search=GridNeighborhoodSearch{2}(update_strategy=nothing),
                           parallelization_backend=PolyesterBackend())
 ode = semidiscretize(semi, tspan)
@@ -101,7 +104,7 @@ ode = semidiscretize(semi, tspan)
 info_callback = InfoCallback(interval=100)
 
 solution_prefix = ""
-saving_callback = SolutionSavingCallback(dt=0.02, prefix=solution_prefix)
+saving_callback = SolutionSavingCallback(dt=0.01, prefix=solution_prefix)
 
 # This can be overwritten with `trixi_include`
 extra_callback = nothing
@@ -117,6 +120,12 @@ callbacks = CallbackSet(info_callback, saving_callback, stepsize_callback, extra
                         extra_callback2, density_reinit_cb)
 
 time_integration_scheme = CarpenterKennedy2N54(williamson_condition=false)
-sol = solve(ode, time_integration_scheme,
-            dt=1.0, # This is overwritten by the stepsize callback
+# sol = solve(ode, time_integration_scheme,
+#             dt=1.0, # This is overwritten by the stepsize callback
+#             save_everystep=false, callback=callbacks);
+sol = solve(ode, RDPK3SpFSAL35(),
+            abstol=1e-5, # Default abstol is 1e-6 (may need to be tuned to prevent boundary penetration)
+            reltol=1e-4, # Default reltol is 1e-3 (may need to be tuned to prevent boundary penetration)
+            dtmax=1e-2, # Limit stepsize to prevent crashing
+            maxiters=1e7,
             save_everystep=false, callback=callbacks);
diff --git a/examples/fluid/dam_break_2d_gpu.jl b/examples/fluid/dam_break_2d_gpu.jl
index 10cfc734c3..04e84b7ef3 100644
--- a/examples/fluid/dam_break_2d_gpu.jl
+++ b/examples/fluid/dam_break_2d_gpu.jl
@@ -36,9 +36,11 @@ trixi_include(@__MODULE__,
               joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
               neighborhood_search=neighborhood_search,
               fluid_particle_spacing=fluid_particle_spacing,
-              tspan=tspan, smoothing_length=smoothing_length,
+              tspan=tspan,
+              smoothing_length=smoothing_length,
               density_diffusion=density_diffusion,
-              boundary_layers=boundary_layers, spacing_ratio=spacing_ratio,
+              boundary_layers=boundary_layers,
+              spacing_ratio=spacing_ratio,
               boundary_model=boundary_model,
               parallelization_backend=PolyesterBackend(),
               boundary_density_calculator=boundary_density_calculator)
diff --git a/examples/fluid/dam_break_ds.jl b/examples/fluid/dam_break_ds.jl
new file mode 100644
index 0000000000..f9fab8538f
--- /dev/null
+++ b/examples/fluid/dam_break_ds.jl
@@ -0,0 +1,38 @@
+using TrixiParticles
+using CUDA
+
+fluid_particle_spacing = 0.001
+
+# Load setup from dam break example
+trixi_include(@__MODULE__,
+              joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
+              fluid_particle_spacing=fluid_particle_spacing,
+              tank_size=(4.0, 3.0), W=1.0, H=2.0,
+              spacing_ratio=1, boundary_layers=1,
+              sol=nothing, ode=nothing)
+
+tank.fluid.coordinates .+= 0.005
+tank.boundary.coordinates .+= 0.005
+
+# Define a GPU-compatible neighborhood search
+min_corner = minimum(tank.boundary.coordinates, dims=2) .- 1
+max_corner = maximum(tank.boundary.coordinates, dims=2) .+ 1
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list,
+                                                update_strategy=ParallelUpdate())
+
+# Run the dam break simulation with this neighborhood search
+trixi_include(@__MODULE__,
+              joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
+              tank=tank,
+              smoothing_length=1.414216 * fluid_particle_spacing,
+              time_integration_scheme=SymplecticPositionVerlet(),
+              boundary_density_calculator=ContinuityDensity(),
+              fluid_particle_spacing=fluid_particle_spacing,
+              tank_size=(4.0, 3.0), W=1.0, H=2.0,
+              spacing_ratio=1, boundary_layers=1,
+              tspan=(0.0, 0.05), cfl=0.2,
+              neighborhood_search=neighborhood_search,
+              viscosity_wall=viscosity_fluid,
+              dt=1e-5, stepsize_callback=nothing, saving_callback=nothing,
+              parallelization_backend=CUDABackend())
diff --git a/examples/paper/dam_break_2d_val_600.jl b/examples/paper/dam_break_2d_val_600.jl
new file mode 100644
index 0000000000..2a26c9d418
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600.jl
@@ -0,0 +1,42 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# When using data center CPUs with large numbers of cores, especially on multi-socket
+# systems with multiple NUMA nodes, pinning threads to cores can significantly
+# improve performance, even for low resolutions.
+# using ThreadPinning
+# pinthreads(:numa)
+
+# `resolution` in this case is set relative to `H`, the initial height of the fluid.
+# Use 40, 80 or 400 for validation.
+# Note: 400 takes about 30 minutes on a large data center CPU (much longer with serial update)
+resolution = 600
+
+min_corner = (-1.5, -1.5)
+max_corner = (6.5, 5.0)
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list, update_strategy=ParallelUpdate())
+# neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)
+
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include_changeprecision(Float32, @__MODULE__,
+              joinpath(validation_dir(), "dam_break_2d",
+                       "setup_marrone_2011.jl"),
+              use_edac=false,
+              particles_per_height=resolution,
+              sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024)
+              alpha=0.01, # This is used by De Courcy et al. (2024)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)), # This is used by De Courcy et al. (2024)
+              parallelization_backend=CUDABackend(),
+              neighborhood_search=neighborhood_search)
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity.jl
new file mode 100644
index 0000000000..7fd7223ac1
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity.jl
@@ -0,0 +1,49 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# When using data center CPUs with large numbers of cores, especially on multi-socket
+# systems with multiple NUMA nodes, pinning threads to cores can significantly
+# improve performance, even for low resolutions.
+# using ThreadPinning
+# pinthreads(:numa)
+
+# `resolution` in this case is set relative to `H`, the initial height of the fluid.
+# Use 40, 80 or 400 for validation.
+# Note: 400 takes about 30 minutes on a large data center CPU (much longer with serial update)
+resolution = 600
+
+min_corner = (-1.5, -1.5)
+max_corner = (6.5, 5.0)
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list, update_strategy=ParallelUpdate())
+# neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)
+
+# switch to physical viscosity model
+viscosity_fluid = ViscosityMorris(nu = 8.9E-7)
+# viscosity_fluid = ViscosityAdami(nu = 8.9E-7)
+#  viscosity_fluid = nothing
+
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include(@__MODULE__,
+              joinpath(validation_dir(), "dam_break_2d",
+                       "setup_marrone_2011.jl"),
+              use_edac=false,
+              extra_string = "_phys_viscosity",
+              viscosity_fluid=viscosity_fluid,
+              particles_per_height=resolution,
+              sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)), # This is used by De Courcy et al. (2024)
+            #   tspan=(0.0, 0.0001), # This is used by De Courcy et al. (2024)
+              parallelization_backend=CUDABackend(),
+              neighborhood_search=neighborhood_search)
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph.jl
new file mode 100644
index 0000000000..b2509dd0e3
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph.jl
@@ -0,0 +1,131 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# When using data center CPUs with large numbers of cores, especially on multi-socket
+# systems with multiple NUMA nodes, pinning threads to cores can significantly
+# improve performance, even for low resolutions.
+# using ThreadPinning
+# pinthreads(:numa)
+
+#TODO: duplicated
+# Size parameters
+H = 0.6
+# W = 2 * H
+
+# `resolution` in this case is set relative to `H`, the initial height of the fluid.
+# Use 40, 80 or 400 for validation.
+# Note: 400 takes about 30 minutes on a large data center CPU (much longer with serial update)
+resolution = 200
+
+#TODO: duplicated
+fluid_particle_spacing = H / resolution
+
+trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
+              fluid_particle_spacing=fluid_particle_spacing,
+              sol=nothing,
+              ode=nothing)
+
+# tank_size = (floor(5.366 * H / fluid_particle_spacing) * fluid_particle_spacing, 4.0)
+
+
+min_corner = (-1.5, -1.5)
+max_corner = (6.5, 5.0)
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list, update_strategy=ParallelUpdate())
+# neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)
+
+# physical values
+nu_water = 8.9E-7*10
+nu_air = 1.544E-5*10
+
+# switch to physical viscosity model
+viscosity_fluid = ViscosityMorris(nu = nu_water)
+# viscosity_fluid = ViscosityAdami(nu = 8.9E-7)
+#  viscosity_fluid = nothing
+
+# set air viscosity model
+viscosity_air = ViscosityMorris(nu = nu_air)
+
+#TODO: duplicated
+smoothing_length = 2 * fluid_particle_spacing
+smoothing_kernel = WendlandC2Kernel{2}()
+fluid_density_calculator = ContinuityDensity()
+
+
+# ==========================================================================================
+# ==== Setup air_system layer
+
+air_size = (tank_size[1], tank_size[2]-H)
+air_size2 = (tank_size[1] - W, tank_size[2])
+air_density = 1.0
+
+# Air above the initial water
+air_system = RectangularShape(fluid_particle_spacing,
+                              round.(Int, air_size ./ fluid_particle_spacing),
+                              zeros(length(air_size)), density=air_density)
+
+# Air for the rest of the empty volume
+air_system2 = RectangularShape(fluid_particle_spacing,
+                               round.(Int, air_size2 ./ fluid_particle_spacing),
+                               (W, 0.0), density=air_density)
+
+# move on top of the water
+for i in axes(air_system.coordinates, 2)
+    air_system.coordinates[:, i] .+= [0.0, H]
+end
+
+air_system = union(air_system, air_system2)
+
+#TODO: duplicated
+sound_speed=50 * sqrt(9.81 * 0.6)
+gravity = 9.81
+
+air_eos = StateEquationCole(; sound_speed, reference_density=air_density, exponent=1,
+                            clip_negative_pressure=false, background_pressure=10.0)
+
+air_system_system = WeaklyCompressibleSPHSystem(air_system, fluid_density_calculator,
+                                                air_eos, smoothing_kernel, smoothing_length,
+                                                viscosity=viscosity_air,
+                                                acceleration=(0.0, -gravity))
+
+tank_air_density = fill!(similar(tank.boundary.density), air_density)
+tank_air_mass = fill!(similar(tank.boundary.mass), air_density*fluid_particle_spacing^2)
+
+air_boundary_model = BoundaryModelDummyParticles(tank_air_density,
+                                             tank_air_mass,
+                                             state_equation=air_eos,
+                                             boundary_density_calculator,
+                                             smoothing_kernel,
+                                             smoothing_length,
+                                             correction=nothing,
+                                             reference_particle_spacing=fluid_particle_spacing,
+                                             viscosity=viscosity_wall)
+
+air_boundary_system = WallBoundarySystem(tank.boundary, air_boundary_model,
+                                     adhesion_coefficient=0.0)
+
+
+trixi_include(@__MODULE__,
+              joinpath(validation_dir(), "dam_break_2d",
+                       "setup_marrone_2011.jl"),
+              use_edac=false,
+              extra_string = "_phys_viscosity_2ph_v2_pa10",
+              viscosity_fluid=viscosity_fluid,
+              particles_per_height=resolution,
+              sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024) (120)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)), # This is used by De Courcy et al. (2024)
+              #   tspan=(0.0, 0.0001),
+              cfl=0.5,
+              parallelization_backend=PolyesterBackend(),
+              neighborhood_search=neighborhood_search,
+              extra_system=air_system_system,
+              extra_system2=air_boundary_system)
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph_gpu.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph_gpu.jl
new file mode 100644
index 0000000000..0256e48e78
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity_2ph_gpu.jl
@@ -0,0 +1,17 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include_changeprecision(Float32, @__MODULE__,
+              joinpath(examples_dir(), "paper",
+                       "dam_break_2d_val_600_phys_viscosity_2ph.jl"),
+                       parallelization_backend=CUDABackend())
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity_gpu.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity_gpu.jl
new file mode 100644
index 0000000000..f1e2773408
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity_gpu.jl
@@ -0,0 +1,16 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include_changeprecision(Float32, @__MODULE__,
+              joinpath(examples_dir(), "paper",
+                       "dam_break_2d_val_600_phys_viscosity.jl"))
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting.jl
new file mode 100644
index 0000000000..0087375653
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting.jl
@@ -0,0 +1,51 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# When using data center CPUs with large numbers of cores, especially on multi-socket
+# systems with multiple NUMA nodes, pinning threads to cores can significantly
+# improve performance, even for low resolutions.
+# using ThreadPinning
+# pinthreads(:numa)
+
+# `resolution` in this case is set relative to `H`, the initial height of the fluid.
+# Use 40, 80 or 400 for validation.
+# Note: 400 takes about 30 minutes on a large data center CPU (much longer with serial update)
+resolution = 600
+
+min_corner = (-1.5, -1.5)
+max_corner = (6.5, 5.0)
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list, update_strategy=ParallelUpdate())
+# neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)
+
+# switch to physical viscosity model
+viscosity_fluid = ViscosityMorris(nu = 8.9E-7)
+# viscosity_fluid = ViscosityAdami(nu = 8.9E-7)
+# alpha = 0.02
+# viscosity_fluid = ArtificialViscosityMonaghan(alpha=alpha, beta=0.0)
+
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include(@__MODULE__,
+              joinpath(validation_dir(), "dam_break_2d",
+                       "setup_marrone_2011.jl"),
+              use_edac=false,
+              extra_string = "_phys_viscosity_shifting",
+              viscosity_fluid=viscosity_fluid,
+              particles_per_height=resolution,
+              shifting_technique=ConsistentShiftingSun2019(),
+            #   shifting_technique=ParticleShiftingTechniqueSun2017(),
+              sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)), # This is used by De Courcy et al. (2024)
+              parallelization_backend=PolyesterBackend(),
+              neighborhood_search=neighborhood_search)
diff --git a/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting_gpu.jl b/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting_gpu.jl
new file mode 100644
index 0000000000..471b26200c
--- /dev/null
+++ b/examples/paper/dam_break_2d_val_600_phys_viscosity_shifting_gpu.jl
@@ -0,0 +1,16 @@
+# This file computes the pressure sensor data of the dam break setup described in
+#
+# J. J. De Courcy, T. C. S. Rendall, L. Constantin, B. Titurus, J. E. Cooper.
+# "Incompressible δ-SPH via artificial compressibility".
+# In: Computer Methods in Applied Mechanics and Engineering, Volume 420 (2024),
+# https://doi.org/10.1016/j.cma.2023.116700
+
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+# ==========================================================================================
+# ==== WCSPH simulation
+trixi_include_changeprecision(Float32, @__MODULE__,
+              joinpath(examples_dir(), "paper",
+                       "dam_break_2d_val_600_phys_viscosity_shifting.jl"), parallelization_backend=CUDABackend())
diff --git a/src/callbacks/post_process.jl b/src/callbacks/post_process.jl
index 5c3beefcbc..a6df83a305 100644
--- a/src/callbacks/post_process.jl
+++ b/src/callbacks/post_process.jl
@@ -230,20 +230,20 @@ end
 function (pp::PostprocessCallback)(integrator; from_initialize=false)
     @trixi_timeit timer() "apply postprocess cb" begin
         vu_ode = integrator.u
-        if from_initialize
+        # if from_initialize
             # Avoid calling `get_du` here, since it will call the RHS function
             # if it is called before the first time step.
             # This would cause problems with `semi.update_callback_used`,
             # which might not yet be set to `true` at this point if the `UpdateCallback`
             # comes AFTER the `PostprocessCallback` in the `CallbackSet`.
             dv_ode, du_ode = zero(vu_ode).x
-        else
-            # Depending on the time integration scheme, this might call the RHS function
-            @trixi_timeit timer() "update du" begin
-                # Don't create sub-timers here to avoid cluttering the timer output
-                @notimeit timer() dv_ode, du_ode = get_du(integrator).x
-            end
-        end
+        # else
+        #     # Depending on the time integration scheme, this might call the RHS function
+        #     @trixi_timeit timer() "update du" begin
+        #         # Don't create sub-timers here to avoid cluttering the timer output
+        #         @notimeit timer() dv_ode, du_ode = get_du(integrator).x
+        #     end
+        # end
         v_ode, u_ode = vu_ode.x
         semi = integrator.p
         t = integrator.t
diff --git a/src/callbacks/solution_saving.jl b/src/callbacks/solution_saving.jl
index 0c24ad55f3..0e67188126 100644
--- a/src/callbacks/solution_saving.jl
+++ b/src/callbacks/solution_saving.jl
@@ -155,16 +155,17 @@ function (solution_callback::SolutionSavingCallback)(integrator; from_initialize
      prefix, latest_saved_iter, max_coordinates) = solution_callback
 
     vu_ode = integrator.u
-    if from_initialize
+    # TODO doesn't work when `v_ode` and `u_ode` have different eltypes
+    # if from_initialize
         # Avoid calling `get_du` here, since it will call the RHS function
         # if it is called before the first time step.
         # This would cause problems with `semi.update_callback_used`,
         # which might not yet be set to `true` at this point if the `UpdateCallback`
         # comes AFTER the `SolutionSavingCallback` in the `CallbackSet`.
         dvdu_ode = zero(vu_ode)
-    else
-        dvdu_ode = get_du(integrator)
-    end
+    # else
+    #     dvdu_ode = get_du(integrator)
+    # end
     semi = integrator.p
     iter = get_iter(interval, integrator)
 
diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
index f854d80d6b..d1413fd067 100644
--- a/src/general/abstract_system.jl
+++ b/src/general/abstract_system.jl
@@ -97,6 +97,8 @@ end
 # This can be dispatched by system type
 @inline initial_coordinates(system) = system.initial_condition.coordinates
 
+@inline coordinates_eltype(system::AbstractSystem) = eltype(initial_coordinates(system))
+
 @propagate_inbounds function current_velocity(v, system, particle)
     return extract_svector(current_velocity(v, system), system, particle)
 end
diff --git a/src/general/initial_condition.jl b/src/general/initial_condition.jl
index 4eb1af8f00..8d447745b2 100644
--- a/src/general/initial_condition.jl
+++ b/src/general/initial_condition.jl
@@ -81,17 +81,17 @@ initial_condition = InitialCondition(; coordinates, velocity=x -> 2x, mass=1.0,
 
 # output
 ┌──────────────────────────────────────────────────────────────────────────────────────────────────┐
-│ InitialCondition{Float64}                                                                        │
-│ ═════════════════════════                                                                        │
+│ InitialCondition{Float64, Float64}                                                               │
+│ ═════════════════════════=========                                                               │
 │ #dimensions: ……………………………………………… 2                                                                │
 │ #particles: ………………………………………………… 3                                                                │
 │ particle spacing: ………………………………… -1.0                                                             │
 └──────────────────────────────────────────────────────────────────────────────────────────────────┘
 ```
 """
-struct InitialCondition{ELTYPE, MATRIX, VECTOR}
+struct InitialCondition{ELTYPE, C, MATRIX, VECTOR}
     particle_spacing :: ELTYPE
-    coordinates      :: MATRIX # Array{ELTYPE, 2}
+    coordinates      :: C      # Array{coordinates_eltype, 2}
     velocity         :: MATRIX # Array{ELTYPE, 2}
     mass             :: VECTOR # Array{ELTYPE, 1}
     density          :: VECTOR # Array{ELTYPE, 1}
@@ -111,9 +111,19 @@ end
 # Function barrier to make `NDIMS` static and therefore SVectors type-stable
 function InitialCondition{NDIMS}(coordinates, velocity, mass, density,
                                  pressure, particle_spacing) where {NDIMS}
-    ELTYPE = eltype(coordinates)
+    if !(particle_spacing isa AbstractFloat)
+        throw(ArgumentError("particle spacing must be a floating point number. " *
+                            "The type of the particle spacing determines the eltype " *
+                            "of the `InitialCondition`."))
+    end
+
+    # The arguments can all be functions, so use `particle_spacing` for the eltype
+    ELTYPE = typeof(particle_spacing)
     n_particles = size(coordinates, 2)
 
+    # Coordinates can have a different eltype, e.g., Float64 if others are Float32
+    coordinates_eltype = eltype(coordinates)
+
     if n_particles == 0
         return InitialCondition(particle_spacing, coordinates, zeros(ELTYPE, NDIMS, 0),
                                 zeros(ELTYPE, 0), zeros(ELTYPE, 0), zeros(ELTYPE, 0))
@@ -121,7 +131,8 @@ function InitialCondition{NDIMS}(coordinates, velocity, mass, density,
 
     # SVector of coordinates to pass to functions.
     # This will return a vector of SVectors in 2D and 3D, but an 1×n matrix in 1D.
-    coordinates_svector_ = reinterpret(reshape, SVector{NDIMS, ELTYPE}, coordinates)
+    coordinates_svector_ = reinterpret(reshape, SVector{NDIMS, coordinates_eltype},
+                                       coordinates)
     # In 1D, this will reshape the 1×n matrix to a vector, in 2D/3D it will do nothing
     coordinates_svector = reshape(coordinates_svector_, length(coordinates_svector_))
 
@@ -194,19 +205,21 @@ end
 function Base.show(io::IO, ic::InitialCondition)
     @nospecialize ic # reduce precompilation time
 
-    print(io, "InitialCondition{$(eltype(ic))}()")
+    print(io, "InitialCondition{$(eltype(ic)), $(coordinates_eltype(ic))}()")
 end
 
 function Base.show(io::IO, ::MIME"text/plain", ic::InitialCondition)
     @nospecialize ic # reduce precompilation time
 
     if get(io, :compact, false)
-        show(io, system)
+        show(io, ic)
     else
-        summary_header(io, "InitialCondition{$(eltype(ic))}")
+        summary_header(io, "InitialCondition")
         summary_line(io, "#dimensions", "$(ndims(ic))")
         summary_line(io, "#particles", "$(nparticles(ic))")
         summary_line(io, "particle spacing", "$(first(ic.particle_spacing))")
+        summary_line(io, "eltype", "$(eltype(ic))")
+        summary_line(io, "coordinate eltype", "$(coordinates_eltype(ic))")
         summary_footer(io)
     end
 end
@@ -229,7 +242,9 @@ end
     return size(initial_condition.coordinates, 1)
 end
 
-@inline function Base.eltype(initial_condition::InitialCondition)
+@inline Base.eltype(::InitialCondition{ELTYPE}) where {ELTYPE} = ELTYPE
+
+@inline function coordinates_eltype(initial_condition::InitialCondition)
     return eltype(initial_condition.coordinates)
 end
 
diff --git a/src/general/interpolation.jl b/src/general/interpolation.jl
index a530c7d86f..8269f33628 100644
--- a/src/general/interpolation.jl
+++ b/src/general/interpolation.jl
@@ -404,7 +404,9 @@ function interpolate_line(start, end_, n_points, semi, ref_system, v_ode, u_ode;
     # Convert to coordinate matrix
     points_coords_ = collect(reinterpret(reshape, eltype(start_svector), points_coords))
 
-    return interpolate_points(points_coords_, semi, ref_system, v_ode, u_ode;
+    points_coords__ = Adapt.adapt(semi.parallelization_backend, points_coords_)
+
+    return interpolate_points(points_coords__, semi, ref_system, v_ode, u_ode;
                               smoothing_length=smoothing_length, include_wall_velocity,
                               cut_off_bnd=cut_off_bnd, clip_negative_pressure)
 end
@@ -513,9 +515,15 @@ function process_neighborhood_searches(semi, u_ode, ref_system, smoothing_length
         old_nhs = get_neighborhood_search(ref_system, system, semi)
         nhs = PointNeighbors.copy_neighborhood_search(old_nhs, search_radius,
                                                       nparticles(system))
-        PointNeighbors.initialize!(nhs, point_coords, system_coords)
 
-        return nhs
+        # In case of GPU backends, we need to move the internal data structures
+        # of the neighborhood search to the GPU as well.
+        # On the CPU, this is a no-op.
+        nhs_ = Adapt.adapt(semi.parallelization_backend, nhs)
+
+        PointNeighbors.initialize!(nhs_, point_coords, system_coords)
+
+        return nhs_
     end
 
     return neighborhood_searches
diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl
index 1c136ebc69..562c672e6e 100644
--- a/src/general/semidiscretization.jl
+++ b/src/general/semidiscretization.jl
@@ -287,9 +287,15 @@ u0: ([...], [...]) *this line is ignored by filter*
 function semidiscretize(semi, tspan; reset_threads=true)
     (; systems) = semi
 
+    # Check that all systems have the same eltype
     @assert all(system -> eltype(system) === eltype(systems[1]), systems)
     ELTYPE = eltype(systems[1])
 
+    # Check that all systems have the same coordinates eltype
+    @assert all(system -> coordinates_eltype(system) === coordinates_eltype(systems[1]),
+                systems)
+    cELTYPE = coordinates_eltype(systems[1])
+
     # Optionally reset Polyester.jl threads. See
     # https://github.com/trixi-framework/Trixi.jl/issues/1583
     # https://github.com/JuliaSIMD/Polyester.jl/issues/30
@@ -302,7 +308,7 @@ function semidiscretize(semi, tspan; reset_threads=true)
 
     # Use either the specified backend, e.g., `CUDABackend` or `MetalBackend` or
     # use CPU vectors for all CPU backends.
-    u0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_u))
+    u0_ode_ = allocate(semi.parallelization_backend, cELTYPE, sum(sizes_u))
     v0_ode_ = allocate(semi.parallelization_backend, ELTYPE, sum(sizes_v))
 
     if semi.parallelization_backend isa KernelAbstractions.Backend
diff --git a/src/schemes/boundary/open_boundary/system.jl b/src/schemes/boundary/open_boundary/system.jl
index 4f409716e6..d032bd8410 100644
--- a/src/schemes/boundary/open_boundary/system.jl
+++ b/src/schemes/boundary/open_boundary/system.jl
@@ -524,7 +524,7 @@ end
 
 # One face of the boundary zone is the transition to the fluid domain.
 # The face opposite to this transition face is a free surface.
-@inline function modify_shifting_at_free_surfaces!(system::OpenBoundarySystem, u, semi)
+@inline function modify_shifting_at_free_surfaces!(system::OpenBoundarySystem, u, semi, u_ode)
     (; fluid_system, cache) = system
 
     @threaded semi for particle in each_integrated_particle(system)
diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl
index e0553764ef..a66dc2fa4f 100644
--- a/src/schemes/boundary/wall_boundary/rhs.jl
+++ b/src/schemes/boundary/wall_boundary/rhs.jl
@@ -16,6 +16,11 @@ function interact!(dv, v_particle_system, u_particle_system,
                    semi)
     (; boundary_model) = particle_system
 
+    # skip interactions of systems with different reference densities
+    if abs(particle_system.boundary_model.state_equation.reference_density-neighbor_system.state_equation.reference_density) > 1
+        return dv
+    end
+
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
diff --git a/src/schemes/boundary/wall_boundary/system.jl b/src/schemes/boundary/wall_boundary/system.jl
index 1226d8f8bc..151787e7aa 100644
--- a/src/schemes/boundary/wall_boundary/system.jl
+++ b/src/schemes/boundary/wall_boundary/system.jl
@@ -14,10 +14,10 @@ The interaction between fluid and boundary particles is specified by the boundar
 - `adhesion_coefficient`: Coefficient specifying the adhesion of a fluid to the surface.
    Note: currently it is assumed that all fluids have the same adhesion coefficient.
 """
-struct WallBoundarySystem{BM, NDIMS, ELTYPE <: Real, IC, CO, M, IM,
+struct WallBoundarySystem{BM, ELTYPE <: Real, NDIMS, IC, CO, M, IM,
                           CA} <: AbstractBoundarySystem{NDIMS}
     initial_condition    :: IC
-    coordinates          :: CO # Array{ELTYPE, 2}
+    coordinates          :: CO # Array{coordinates_eltype, 2}
     boundary_model       :: BM
     prescribed_motion    :: M
     ismoving             :: IM # Ref{Bool} (to make a mutable field compatible with GPUs)
@@ -28,9 +28,9 @@ struct WallBoundarySystem{BM, NDIMS, ELTYPE <: Real, IC, CO, M, IM,
     # See the comments in general/gpu.jl for more details.
     function WallBoundarySystem(initial_condition, coordinates, boundary_model,
                                 prescribed_motion, ismoving, adhesion_coefficient, cache)
-        ELTYPE = eltype(coordinates)
+        ELTYPE = eltype(initial_condition)
 
-        new{typeof(boundary_model), size(coordinates, 1), ELTYPE, typeof(initial_condition),
+        new{typeof(boundary_model), ELTYPE, size(coordinates, 1), typeof(initial_condition),
             typeof(coordinates), typeof(prescribed_motion), typeof(ismoving),
             typeof(cache)}(initial_condition, coordinates, boundary_model,
                            prescribed_motion, ismoving, adhesion_coefficient, cache)
@@ -61,9 +61,7 @@ function create_cache_boundary(prescribed_motion::PrescribedMotion, initial_cond
     return (; velocity, acceleration, initial_coordinates)
 end
 
-@inline function Base.eltype(system::WallBoundarySystem)
-    eltype(system.coordinates)
-end
+@inline Base.eltype(::WallBoundarySystem{<:Any, ELTYPE}) where {ELTYPE} = ELTYPE
 
 @inline function nparticles(system::WallBoundarySystem)
     size(system.coordinates, 2)
diff --git a/src/schemes/fluid/fluid.jl b/src/schemes/fluid/fluid.jl
index 35d14b5e04..ced86d228f 100644
--- a/src/schemes/fluid/fluid.jl
+++ b/src/schemes/fluid/fluid.jl
@@ -161,6 +161,10 @@ end
                            pos_diff, distance, m_b, rho_a, rho_b, particle_system,
                            grad_kernel)
     end
+
+        # continuity_equation_shifting!(dv, shifting_technique(particle_system),
+        #                           particle_system, neighbor_system,
+        #                           particle, neighbor, grad_kernel, rho_a, rho_b, m_b)
 end
 
 function calculate_dt(v_ode, u_ode, cfl_number, system::AbstractFluidSystem, semi)
diff --git a/src/schemes/fluid/shifting_techniques.jl b/src/schemes/fluid/shifting_techniques.jl
index 96fd6c1eee..2a32bd6464 100644
--- a/src/schemes/fluid/shifting_techniques.jl
+++ b/src/schemes/fluid/shifting_techniques.jl
@@ -18,7 +18,9 @@ function create_cache_shifting(initial_condition, ::AbstractShiftingTechnique)
     delta_v = zeros(eltype(initial_condition), ndims(initial_condition),
                     nparticles(initial_condition))
 
-    return (; delta_v)
+    nct = zeros(eltype(initial_condition), nparticles(initial_condition))
+
+    return (; delta_v, nct)
 end
 
 # `δv` is the correction to the particle velocity due to the shifting.
@@ -489,7 +491,7 @@ function update_shifting_inner!(system, shifting::ParticleShiftingTechnique,
         end
     end
 
-    modify_shifting_at_free_surfaces!(system, u, semi)
+    modify_shifting_at_free_surfaces!(system, u, semi, u_ode)
 
     return system
 end
@@ -693,10 +695,590 @@ function update_shifting!(system, shifting::TransportVelocityAdami, v, u, v_ode,
         end
     end
 
-    modify_shifting_at_free_surfaces!(system, u, semi)
+    modify_shifting_at_free_surfaces!(system, u, semi, u_ode)
 
     return system
 end
 
 # TODO: Implement free surface detection to disable shifting close to free surfaces
-@inline modify_shifting_at_free_surfaces!(system, u, semi) = system
+# function modify_shifting_at_free_surfaces!(system, u, semi, u_ode)
+#     T   = eltype(system.cache.delta_v)
+#     dim = ndims(system)
+#     np  = size(system.cache.delta_v, 2)
+#     epsT = eps(T)
+
+#     # ------------------------------------------------------------
+#     # PASS 1: free-surface indicators & wall stats
+#     #   - psi_nog:   Σ W_ab (EXCLUDING boundary systems)  → limiter/support
+#     #   - gsum_wg:   Σ ∇W_ab (INCLUDING ghosts)           → FS normals
+#     #   - psi_wall:  Σ W_ab (boundary only)               → wall ratio/damping
+#     #   - S (2x2):   Σ (∇W)(∇W)^T (boundary only)         → corner detector
+#     # ------------------------------------------------------------
+#     coordsA  = current_coordinates(u, system)
+#     psi_nog  = zeros(T, np)
+#     gsum_wg  = zeros(T, dim, np)
+#     psi_wall = zeros(T, np)
+#     S11 = zeros(T, np); S12 = zeros(T, np); S22 = zeros(T, np)  # used in 2D
+
+#     foreach_system(semi) do sysB
+#         # if u_ode not passed, we can only see same-system neighbors
+#         if (u_ode === nothing) && (sysB !== system)
+#             return
+#         end
+#         uB      = (u_ode === nothing) ? u : wrap_u(u_ode, sysB, semi)
+#         coordsB = current_coordinates(uB, sysB)
+
+#         foreach_point_neighbor(system, sysB, coordsA, coordsB, semi;
+#                                points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#             Wab  = smoothing_kernel(system, distance, a)
+#             gWab = smoothing_kernel_grad(system, pos_diff, distance, a)
+
+#             # WITH ghosts → robust normals near walls
+#             @inbounds for i in 1:dim
+#                 gsum_wg[i, a] += gWab[i]
+#             end
+
+#             if sysB isa AbstractBoundarySystem
+#                 @inbounds psi_wall[a] += Wab
+#                 if dim >= 2
+#                     @inbounds begin
+#                         S11[a] += gWab[1]*gWab[1]
+#                         S12[a] += gWab[1]*gWab[2]
+#                         S22[a] += gWab[2]*gWab[2]
+#                     end
+#                 end
+#             else
+#                 @inbounds psi_nog[a] += Wab   # support WITHOUT ghosts
+#             end
+#         end
+#     end
+
+#     smax_nog  = maximum(psi_nog)
+#     h         = smoothing_length(system, 1)
+
+#     # Tunables (more permissive than before)
+#     λ_min     = T(0.50)   # relaxed cutoff (was 0.55)
+#     α_mask    = T(0.85)   # FS detection threshold (unchanged)
+#     α_limit   = T(0.80)   # NEW: softer limiter denominator → more δu
+#     τ_grad    = T(0.30)
+#     s_min     = T(0.15)   # NEW: minimum limiter floor so FS keeps some δu
+
+#     # FS mask: detect using support (no-ghosts) and gradient magnitude (with-ghosts)
+#     FS = falses(np)
+#     @inbounds for a in 1:np
+#         g2 = zero(T)
+#         for i in 1:dim
+#             g2 += gsum_wg[i, a]^2
+#         end
+#         gnorm = sqrt(g2)
+#         FS[a] = (psi_nog[a] < α_mask * smax_nog) || (h * gnorm > τ_grad)
+#     end
+
+#     # Unit normals from WITH-ghosts gradient sum (only where FS=true)
+#     n_hat = zeros(T, dim, np)
+#     @inbounds for a in 1:np
+#         if FS[a]
+#             g2 = zero(T)
+#             for i in 1:dim
+#                 g2 += gsum_wg[i, a]^2
+#             end
+#             gnorm = sqrt(g2)
+#             if gnorm > T(1e2)*epsT
+#                 for i in 1:dim
+#                     n_hat[i, a] = -gsum_wg[i, a] / gnorm
+#                 end
+#             end
+#         end
+#     end
+
+#     # ------------------------------------------------------------
+#     # Curvature κ ≈ -∇·n_hat, fluid-only gather (kept, but softer limiter later)
+#     # ------------------------------------------------------------
+#     kappa = zeros(T, np)
+#     foreach_point_neighbor(system, system, coordsA, coordsA, semi;
+#                            points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#         if FS[a] || FS[b]
+#             gW = smoothing_kernel_grad(system, pos_diff, distance, a)
+#             acc = zero(T)
+#             @inbounds for i in 1:dim
+#                 acc += (n_hat[i, b] - n_hat[i, a]) * gW[i]
+#             end
+#             @inbounds kappa[a] -= acc
+#         end
+#     end
+#     κ0 = max(T(1.2)/h, T(1e-6))  # relaxed (was 0.8/h)
+#     p  = T(1.5)                  # softer slope (was 2)
+
+#     # ------------------------------------------------------------
+#     # Tangential XSPH smoothing on δv (two-buffer, normalized): gentler
+#     # ------------------------------------------------------------
+#     alpha_t = T(0.04)  # gentler (was 0.08)
+#     dvt_src = zeros(T, dim, np) # tangential component of δv (source)
+#     dvt_acc = zeros(T, dim, np) # gather increments
+#     wsum    = zeros(T, np)
+
+#     if alpha_t > 0
+#         # build tangential source
+#         @inbounds for a in 1:np
+#             if FS[a]
+#                 dotnv = zero(T)
+#                 for i in 1:dim
+#                     dotnv += n_hat[i, a] * system.cache.delta_v[i, a]
+#                 end
+#                 for i in 1:dim
+#                     dvt_src[i, a] = system.cache.delta_v[i, a] - dotnv * n_hat[i, a]
+#                 end
+#             end
+#         end
+
+#         # gather smoothing (fluid-only, FS–FS)
+#         foreach_point_neighbor(system, system, coordsA, coordsA, semi;
+#                                points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#             if FS[a] && FS[b]
+#                 Wab = smoothing_kernel(system, distance, a)
+#                 @inbounds begin
+#                     for i in 1:dim
+#                         dvt_acc[i, a] += (dvt_src[i, b] - dvt_src[i, a]) * Wab
+#                     end
+#                     wsum[a] += Wab
+#                 end
+#             end
+#         end
+
+#         # finalize tangential smoothing back into dvt_src
+#         @inbounds for a in 1:np
+#             if FS[a] && isfinite(wsum[a]) && wsum[a] > epsT
+#                 scale = alpha_t / wsum[a]
+#                 for i in 1:dim
+#                     dvt_src[i, a] += scale * dvt_acc[i, a]
+#                 end
+#             end
+#         end
+#     end
+
+#     # ------------------------------------------------------------
+#     # PASS 2: strongest two wall normals per particle (for corners)
+#     # ------------------------------------------------------------
+#     wall_g1 = zeros(T, dim, np)   # strongest wall normal proxy
+#     wall_g2 = zeros(T, dim, np)   # second strongest
+#     nrm1    = zeros(T, np)        # |g1|^2
+#     nrm2    = zeros(T, np)        # |g2|^2
+
+#     foreach_system(semi) do sysB
+#         if !(sysB isa AbstractBoundarySystem)
+#             return
+#         end
+#         if (u_ode === nothing) && (sysB !== system)
+#             return
+#         end
+#         uB      = (u_ode === nothing) ? u : wrap_u(u_ode, sysB, semi)
+#         coordsB = current_coordinates(uB, sysB)
+
+#         # accumulate gradient sum for THIS wall system
+#         gtmp = zeros(T, dim, np)
+#         foreach_point_neighbor(system, sysB, coordsA, coordsB, semi;
+#                                points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#             gW = smoothing_kernel_grad(system, pos_diff, distance, a)
+#             @inbounds for i in 1:dim
+#                 gtmp[i, a] += gW[i]
+#             end
+#         end
+
+#         # update top-2 per particle
+#         @inbounds for a in 1:np
+#             wx = gtmp[1,a]; wy = dim>=2 ? gtmp[2,a] : zero(T); wz = dim==3 ? gtmp[3,a] : zero(T)
+#             val = wx*wx + wy*wy + wz*wz
+#             if val > nrm1[a] + epsT
+#                 for i in 1:dim
+#                     wall_g2[i,a] = wall_g1[i,a]
+#                     wall_g1[i,a] = gtmp[i,a]
+#                 end
+#                 nrm2[a] = nrm1[a]
+#                 nrm1[a] = val
+#             elseif val > nrm2[a] + epsT
+#                 for i in 1:dim
+#                     wall_g2[i,a] = gtmp[i,a]
+#                 end
+#                 nrm2[a] = val
+#             end
+#         end
+#     end
+
+#     # ------------------------------------------------------------
+#     # PASS 3: modify δv in place (kernel body)
+#     # ------------------------------------------------------------
+#     β_wall   = T(2)         # wall-proximity damping exponent
+#     κ_corner = T(0.30)      # corner eigen-ratio threshold (2D)
+
+#     @threaded semi for a in each_integrated_particle(system)
+#         # --- 3.0: corner kill-switch (2D only)
+#         if dim == 2
+#             tr  = @inbounds S11[a] + S22[a]
+#             det = @inbounds S11[a]*S22[a] - S12[a]*S12[a]
+#             disc = max(tr*tr - 4*det, zero(T))
+#             lam1 = T(0.5)*(tr + sqrt(disc))
+#             lam2 = T(0.5)*(tr - sqrt(disc))
+#             if lam1 > epsT && lam2/lam1 > κ_corner
+#                 @inbounds for i in 1:dim
+#                     system.cache.delta_v[i, a] = zero(T)
+#                 end
+#                 return
+#             end
+#         end
+
+#         # --- 3.1: smooth wall-proximity damping
+#         rw = @inbounds psi_wall[a] / (psi_wall[a] + psi_nog[a] + epsT)
+#         scale_wall = (one(T) - clamp(rw, zero(T), one(T)))^β_wall
+#         @inbounds for i in 1:dim
+#             system.cache.delta_v[i, a] *= scale_wall
+#         end
+
+#         # --- 3.2: λ-cutoff (Sun'19): disable PST if λ is too small
+#         λ = @inbounds psi_nog[a] / (smax_nog + epsT)
+#         if λ < λ_min
+#             @inbounds for i in 1:dim
+#                 system.cache.delta_v[i, a] = zero(T)
+#             end
+#             return
+#         end
+
+#         # --- 3.3: enforce wall no-penetration (project out up to 2 wall normals)
+#         n1 = nrm1[a]; n2 = nrm2[a]
+#         if n1 > epsT
+#             dot1 = zero(T)
+#             @inbounds for i in 1:dim
+#                 dot1 += system.cache.delta_v[i,a] * wall_g1[i,a]
+#             end
+#             scal1 = dot1 / (n1 + epsT)
+#             @inbounds for i in 1:dim
+#                 system.cache.delta_v[i,a] -= scal1 * wall_g1[i,a]
+#             end
+#         end
+#         if n2 > epsT
+#             # avoid re-projecting along near-colinear normals
+#             dot2 = zero(T); g1g2 = zero(T)
+#             @inbounds for i in 1:dim
+#                 dot2 += system.cache.delta_v[i,a] * wall_g2[i,a]
+#                 g1g2 += wall_g1[i,a]*wall_g2[i,a]
+#             end
+#             cos2 = (n1 > epsT) ? (g1g2*g1g2)/(n1*n2 + epsT) : zero(T)
+#             if cos2 < T(0.99)
+#                 scal2 = dot2 / (n2 + epsT)
+#                 @inbounds for i in 1:dim
+#                     system.cache.delta_v[i,a] -= scal2 * wall_g2[i,a]
+#                 end
+#             end
+#         end
+
+#         # --- 3.4: free-surface tangential-only (with sign test)
+#         g2 = zero(T)
+#         @inbounds for i in 1:dim
+#             g2 += gsum_wg[i,a] * gsum_wg[i,a]
+#         end
+#         gnorm = sqrt(g2)
+#         if FS[a] && gnorm > epsT
+#             # apply only if (n · δu*) >= 0 (i.e., outward)
+#             ndotu = zero(T)
+#             @inbounds for i in 1:dim
+#                 ndotu += (-gsum_wg[i,a] / gnorm) * system.cache.delta_v[i,a]
+#             end
+#             if ndotu >= 0
+#                 @inbounds for i in 1:dim
+#                     system.cache.delta_v[i,a] -= ndotu * (-gsum_wg[i,a] / (gnorm + epsT))
+#                 end
+#             end
+#             # soft FS limiter based on support without ghosts (LESS damping)
+#             s = @inbounds psi_nog[a] / (α_limit*smax_nog + epsT)
+#             s = clamp(s, s_min, one(T))   # NEW: floor keeps some δu
+#             @inbounds for i in 1:dim
+#                 system.cache.delta_v[i,a] *= s
+#             end
+#         end
+
+#         # --- 3.5: curvature-aware limiter (softer)
+#         kap    = @inbounds abs(kappa[a])
+#         f_curv = one(T) / (one(T) + (kap / (κ0 + epsT))^p)
+#         @inbounds for i in 1:dim
+#             system.cache.delta_v[i, a] *= f_curv
+#         end
+
+#         # --- 3.6: gently blend smoothed tangential δv (additive, not replacement)
+#         if alpha_t > 0 && FS[a]
+#             # add a small smoothed tangential increment, keep current normal part
+#             dotnv = zero(T)
+#             @inbounds for i in 1:dim
+#                 dotnv += n_hat[i, a] * system.cache.delta_v[i, a]
+#             end
+#             @inbounds for i in 1:dim
+#                 # δv ← current + (dvt_src - current_tangential) * η
+#                 # but dvt_src already includes (I - nnᵀ)δv + smoothing; add small increment:
+#                 inc = dvt_src[i, a] - (system.cache.delta_v[i, a] - dotnv * n_hat[i, a])
+#                 system.cache.delta_v[i, a] += inc   # η=1 here since alpha_t is already small
+#             end
+#         end
+#     end
+
+#     # ------------------------------------------------------------
+#     # PASS 4: sanitize NaNs/Infs (very cheap)
+#     # ------------------------------------------------------------
+#     @inbounds for a in 1:np
+#         for i in 1:dim
+#             v = system.cache.delta_v[i, a]
+#             if !isfinite(v)
+#                 system.cache.delta_v[i, a] = zero(T)
+#             end
+#         end
+#     end
+
+#     return system
+# end
+
+"""
+    modify_shifting_at_free_surfaces!(system, u, semi, u_ode;
+                                      λ_off::Real = 0.78,
+                                      λ_on::Real  = 0.97,
+                                      s_min::Real = 0.03,
+                                      γ::Real     = 2.5)
+
+Scale the **particle shifting velocity** `δv` near the free surface using a simple,
+robust **neighbor–concentration ramp**. This attenuates shifting where particle
+support is truncated (tips/crests/intersections) and leaves it unchanged in the bulk.
+
+# How it works
+For each particle `a`, we build a **support fullness**
+`λ = N_a / N_ref ∈ [0,1]`, where `N_a` is the (fluid-phase) neighbor count found by
+the neighbor loop and `N_ref = max_a N_a` (an interior reference).
+We then compute a smooth ramp `s(λ)` and scale the current shifting velocity:
+
+s_lin(λ) = clamp( (λ - λ_off) / (λ_on - λ_off), 0, 1 )
+s(λ) = max(s_min, s_lin(λ))^γ
+δv_a ← s(λ_a) * δv_a
+
+markdown
+Code kopieren
+
+Thus:
+- Very sparse support (`λ ≤ λ_off`) → `s ≈ s_min` (almost off).
+- Full support (`λ ≥ λ_on`)       → `s ≈ 1`    (no attenuation).
+- In-between (`λ_off < λ < λ_on`) → linear ramp (optionally made steeper by `γ`).
+
+This strategy is commonly used in improved/enhanced PST variants to moderate
+shifting at free surfaces without explicit normal/curvature estimation.
+
+# Keyword parameters
+- `λ_off` (default `0.78`): **Ramp start**. Support below this is considered
+  “too sparse” for reliable shifting. Increasing `λ_off` **reduces** surface shifting
+  (safer at thin tongues/crests), decreasing it **increases** surface shifting.
+- `λ_on`  (default `0.97`): **Ramp end**. Support above this receives **full**
+  shifting. Increasing `λ_on` makes full shifting harder to reach; decreasing it
+  expands the region with full shifting.
+- `s_min` (default `0.03`): **Floor** of the scaling factor to avoid freezing
+  tangential reordering completely at the interface. Set to `0.0` to **disable
+  shifting entirely** when `λ` is below `λ_off`. Larger `s_min` keeps a bit more
+  motion but can promote peel-off/clustering if too high.
+- `γ`     (default `2.5`): **Nonlinearity** of the ramp. `γ > 1` concentrates
+  shifting in the bulk and damps it near the free surface; `γ < 1` does the
+  opposite (more aggressive at the surface). Typical safe range is `1.5–3.0`.
+
+# Practical tuning
+- **Leading edge (dam-break tongue too mobile):** raise `λ_off` a little
+  (e.g. `0.80`) or increase `γ` (e.g. `3.0`).
+- **Surface clustering/banding:** raise `λ_off` or lower `s_min` (down to `0.0`
+  if you prefer to fully switch off in very sparse regions). If clustering persists,
+  consider adding a tiny outward **sign-clamp** (remove outward component of `δv`
+  only when very close to the free surface) or a light δ-SPH density diffusion
+  in your solver (outside this function).
+- **More surface reordering:** lower `λ_off` slightly (e.g. `0.75`) or reduce `γ`.
+
+# Notes & assumptions
+- `N_a` is the neighbor count gathered by the provided neighbor traversal
+  (as written, it counts neighbors from whatever systems you iterate in
+  `foreach_system(semi)`; if you only want same-phase neighbors, limit that loop
+  to `system` only).
+- This routine **does not modify wall particles** and does not explicitly
+  project along wall normals; it simply scales `δv` by local support fullness.
+- The scaling is GPU- and threading-friendly: no global reductions beyond
+  computing `N_ref = maximum(ncnt)` and only per-particle operations thereafter.
+
+# Example
+```julia
+# safer at violent free surfaces (less shift at tips)
+modify_shifting_at_free_surfaces!(sys, u, semi, u_ode; λ_off=0.80, λ_on=0.97, s_min=0.02, γ=3.0)
+
+# slightly more surface ordering
+modify_shifting_at_free_surfaces!(sys, u, semi, u_ode; λ_off=0.75, λ_on=0.95, s_min=0.05, γ=1.8)
+"""
+# function modify_shifting_at_free_surfaces!(system, u, semi, u_ode;
+#                                            λ_off::Real = 0.78,
+#                                            λ_on::Real  = 0.97,
+#                                            s_min::Real = 0.03,
+#                                            γ::Real     = 2.5)
+
+function modify_shifting_at_free_surfaces!(system, u, semi, u_ode;
+                                           λ_off::Real = 0.70, #2 78 #3 70 > 65 already rounds out too much # 70
+                                           λ_on::Real  = 0.90, #2 95 #3 90 still fine #2 90
+                                           s_min::Real = 0.0, #3 03 0.1 <- detaches 0.05 detaches from wall #2 0
+                                           γ::Real     = 2.5) #2 2.5 #2 2.0 worse #2 3.0
+    dim = ndims(system)
+    epsT = eps(eltype(system.cache.delta_v))
+
+    coordsA = current_coordinates(u, system)
+    set_zero!(system.cache.nct)
+
+    foreach_system(semi) do sysB
+        uB      = wrap_u(u_ode, sysB, semi)
+        coordsB = current_coordinates(uB, sysB)
+
+        foreach_point_neighbor(system, sysB, coordsA, coordsB, semi;
+                                points=each_integrated_particle(system)) do a,b,pos_diff,distance
+                @inbounds system.cache.nct[a] += 1
+        end
+    end
+
+    Nref = max(maximum(system.cache.nct), 1)
+
+    inv_range = 1 / (λ_on - λ_off + epsT)
+
+    @threaded semi for a in each_integrated_particle(system)
+        λ = @inbounds system.cache.nct[a] / Nref     # 0..1 proxy for support fullness
+        s = (λ - λ_off) * inv_range         # linear ramp [α0..α1] → [0..1]
+        s = s < epsT ? 0 : (s > 1 ? 1 : s)
+        s = max(s, s_min)                  # keep some ordering if desired
+        sγ = s^γ
+        for i in 1:dim
+            @inbounds system.cache.delta_v[i, a] *= sγ
+        end
+    end
+
+    return system
+end
+
+
+"""
+Two-metric free-surface attenuation for PST (same-phase neighbors; walls only for FS mask).
+Adds:
+  • Tip clamp: extra ramp for very low same-phase support (leading edge).
+  • Anti-clustering equalizer: downscale where N_a > local weighted average.
+
+Tunables (safe defaults):
+  λ_off=0.76, λ_on=0.965, γ=2.0, s_min=0.05,
+  α_mask=0.92, clampλ=0.99,
+  λ_tip0=0.52, λ_tip1=0.65, γ_tip=2.0, q_eq=2.0
+"""
+# function modify_shifting_at_free_surfaces!(system, u, semi, u_ode;
+#     λ_off::Real=0.76, λ_on::Real=0.965, γ::Real=2.0, s_min::Real=0.05,
+#     α_mask::Real=0.92, clampλ::Real=0.99,
+#     λ_tip0::Real=0.52, λ_tip1::Real=0.65, γ_tip::Real=2.0, q_eq::Real=2.0)
+
+#     T   = eltype(system.cache.delta_v)
+#     dim = ndims(system)
+#     np  = size(system.cache.delta_v, 2)
+#     epsT = eps(T)
+
+#     coordsA = current_coordinates(u, system)
+
+#     # --- Pass 1: same-phase neighbor data -----------------------------------
+#     N_fluid     = zeros(T, np)          # same-phase neighbor count
+#     gsum_fluid  = zeros(T, dim, np)     # same-phase Σ∇W (for sign clamp)
+
+#     foreach_point_neighbor(system, system, coordsA, coordsA, semi;
+#                            points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#         @inbounds N_fluid[a] += one(T)
+#         gWab = smoothing_kernel_grad(system, pos_diff, distance, a)
+#         @inbounds for i in 1:dim
+#             gsum_fluid[i, a] += gWab[i]
+#         end
+#     end
+
+#     # --- Pass 2: N_all = same-phase + wall neighbors (FS mask only) ----------
+#     N_all = copy(N_fluid)
+#     foreach_system(semi) do sysB
+#         (sysB isa AbstractBoundarySystem) || return
+#         uB      = (u_ode === nothing) ? u : wrap_u(u_ode, sysB, semi)
+#         coordsB = current_coordinates(uB, sysB)
+#         foreach_point_neighbor(system, sysB, coordsA, coordsB, semi;
+#                                points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#             @inbounds N_all[a] += one(T)
+#         end
+#     end
+
+#     # --- Pass 3: local weighted average of same-phase neighbor count ----------
+#     nbar_num = zeros(T, np)   # Σ W_ab * N_fluid[b]
+#     nbar_den = zeros(T, np)   # Σ W_ab
+#     foreach_point_neighbor(system, system, coordsA, coordsA, semi;
+#                            points=each_integrated_particle(system)) do a,b,pos_diff,distance
+#         Wab = smoothing_kernel(system, distance, a)
+#         @inbounds begin
+#             nbar_num[a] += Wab * N_fluid[b]
+#             nbar_den[a] += Wab
+#         end
+#     end
+
+#     # References from maxima (interior-like)
+#     Nref_fluid = max(maximum(N_fluid), one(T))
+#     Nref_all   = max(maximum(N_all),   one(T))
+
+#     # Ramps/params
+#     α0 = T(λ_off); α1 = T(λ_on); invr = one(T)/(α1-α0 + epsT)
+#     αt0 = T(λ_tip0); αt1 = T(λ_tip1); invrt = one(T)/(αt1-αt0 + epsT)
+#     gexp = T(γ); gtip = T(γ_tip); smin = T(s_min)
+#     αmask = T(α_mask); clampλT = T(clampλ)
+#     qeq = T(q_eq)
+
+#     @threaded semi for a in each_integrated_particle(system)
+#         # Free-surface mask uses all neighbors (fluid + walls)
+#         λ_all = @inbounds N_all[a] / Nref_all
+#         is_FS = λ_all < αmask
+#         if !is_FS
+#             return
+#         end
+
+#         # Same-phase fullness
+#         λf = @inbounds N_fluid[a] / Nref_fluid
+
+#         # Main FS ramp
+#         s = (λf - α0) * invr
+#         s = s < zero(T) ? zero(T) : (s > one(T) ? one(T) : s)
+
+#         # Tip clamp (very low same-phase support → heavily attenuate)
+#         s_tip = (λf - αt0) * invrt
+#         s_tip = s_tip < zero(T) ? zero(T) : (s_tip > one(T) ? one(T) : s_tip)
+
+#         s_total = max(smin, (s^gexp) * (s_tip^gtip))
+
+#         @inbounds for i in 1:dim
+#             system.cache.delta_v[i, a] *= s_total
+#         end
+
+#         # Anti-clustering equalizer vs local weighted average (same-phase only)
+#         nbar = @inbounds nbar_num[a] / (nbar_den[a] + epsT)
+#         if @inbounds N_fluid[a] > nbar
+#             feq = (nbar / (@inbounds N_fluid[a] + epsT))^qeq
+#             @inbounds for i in 1:dim
+#                 system.cache.delta_v[i, a] *= feq
+#             end
+#         end
+
+#         # Tiny outward sign clamp close to FS (prevents peel-off)
+#         if λ_all < clampλT
+#             g2 = zero(T)
+#             @inbounds for i in 1:dim
+#                 g2 += gsum_fluid[i, a]^2
+#             end
+#             if g2 > T(1e2)*epsT
+#                 invg = inv(sqrt(g2))
+#                 ndot = zero(T)
+#                 @inbounds for i in 1:dim
+#                     ndot += system.cache.delta_v[i, a] * (-gsum_fluid[i, a] * invg)
+#                 end
+#                 if ndot > 0
+#                     @inbounds for i in 1:dim
+#                         system.cache.delta_v[i, a] -= ndot * (-gsum_fluid[i, a] * invg)
+#                     end
+#                 end
+#             end
+#         end
+#     end
+
+#     return system
+# end
diff --git a/src/schemes/fluid/viscosity.jl b/src/schemes/fluid/viscosity.jl
index fcca2383fa..1bfff39de0 100644
--- a/src/schemes/fluid/viscosity.jl
+++ b/src/schemes/fluid/viscosity.jl
@@ -203,7 +203,7 @@ function adami_viscosity_force(smoothing_length_average, pos_diff, distance, gra
     return visc .* v_diff
 end
 
-@inline function (viscosity::ViscosityAdami)(particle_system, neighbor_system,
+@propagate_inbounds function (viscosity::ViscosityAdami)(particle_system, neighbor_system,
                                              v_particle_system, v_neighbor_system,
                                              particle, neighbor, pos_diff,
                                              distance, sound_speed, m_a, m_b,
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
index e2d244c079..fb23d66959 100644
--- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -7,6 +7,10 @@ function interact!(dv, v_particle_system, u_particle_system,
                    particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
     (; density_calculator, correction) = particle_system
 
+    if neighbor_system isa AbstractBoundarySystem && abs(neighbor_system.boundary_model.state_equation.reference_density-particle_system.state_equation.reference_density) > 1
+        return dv
+    end
+
     sound_speed = system_sound_speed(particle_system)
 
     surface_tension_a = surface_tension_model(particle_system)
@@ -26,6 +30,13 @@ function interact!(dv, v_particle_system, u_particle_system,
                                                                                 neighbor,
                                                                                 pos_diff,
                                                                                 distance
+
+        smoothing_length_ = smoothing_length(particle_system, particle)
+
+        if neighbor_system isa AbstractFluidSystem && abs(neighbor_system.state_equation.reference_density- particle_system.state_equation.reference_density) > 1
+            smoothing_length_ *= Float32(0.5)
+        end
+
         # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
         # in bounds of the respective system. For performance reasons, we use `@inbounds`
         # in this hot loop to avoid bounds checking when extracting particle quantities.
@@ -39,7 +50,12 @@ function interact!(dv, v_particle_system, u_particle_system,
          surface_tension_correction) = free_surface_correction(correction, particle_system,
                                                                rho_mean)
 
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
+        # grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
+        grad_kernel =  corrected_kernel_grad(system_smoothing_kernel(particle_system), pos_diff,
+                                 distance, smoothing_length_,
+                                 system_correction(particle_system), particle_system, particle)
+
+
 
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl
index c03c1ecd75..86464922f8 100644
--- a/src/schemes/fluid/weakly_compressible_sph/system.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/system.jl
@@ -85,8 +85,7 @@ end
 
 # The default constructor needs to be accessible for Adapt.jl to work with this struct.
 # See the comments in general/gpu.jl for more details.
-function WeaklyCompressibleSPHSystem(initial_condition,
-                                     density_calculator, state_equation,
+function WeaklyCompressibleSPHSystem(initial_condition, density_calculator, state_equation,
                                      smoothing_kernel, smoothing_length;
                                      acceleration=ntuple(_ -> zero(eltype(initial_condition)),
                                                          ndims(smoothing_kernel)),
@@ -224,9 +223,7 @@ function Base.show(io::IO, ::MIME"text/plain", system::WeaklyCompressibleSPHSyst
     end
 end
 
-@inline function Base.eltype(::WeaklyCompressibleSPHSystem{<:Any, ELTYPE}) where {ELTYPE}
-    return ELTYPE
-end
+@inline Base.eltype(::WeaklyCompressibleSPHSystem{<:Any, ELTYPE}) where {ELTYPE} = ELTYPE
 
 @inline function v_nvariables(system::WeaklyCompressibleSPHSystem)
     return v_nvariables(system, system.density_calculator)
@@ -248,7 +245,7 @@ system_correction(system::WeaklyCompressibleSPHSystem) = system.correction
     return current_velocity(v, system.density_calculator, system)
 end
 
-@inline function current_velocity(v, ::SummationDensity,
+@propagate_inbounds function current_velocity(v, ::SummationDensity,
                                   system::WeaklyCompressibleSPHSystem)
     # When using `SummationDensity`, `v` contains only the velocity
     return v
@@ -265,7 +262,7 @@ end
     return current_density(v, system.density_calculator, system)
 end
 
-@inline function current_density(v, ::SummationDensity,
+@propagate_inbounds function current_density(v, ::SummationDensity,
                                  system::WeaklyCompressibleSPHSystem)
     # When using `SummationDensity`, the density is stored in the cache
     return system.cache.density
diff --git a/src/setups/rectangular_shape.jl b/src/setups/rectangular_shape.jl
index 1c30ef5c1b..27b518ef8a 100644
--- a/src/setups/rectangular_shape.jl
+++ b/src/setups/rectangular_shape.jl
@@ -76,7 +76,8 @@ function RectangularShape(particle_spacing, n_particles_per_dimension, min_coord
                           coordinates_perturbation=nothing,
                           mass=nothing, density=nothing, pressure=0.0,
                           acceleration=nothing, state_equation=nothing,
-                          place_on_shell=false, loop_order=nothing)
+                          place_on_shell=false, loop_order=nothing,
+                          coordinates_eltype=Float64)
     if particle_spacing < eps()
         throw(ArgumentError("`particle_spacing` needs to be positive and larger than $(eps())"))
     end
@@ -92,10 +93,11 @@ function RectangularShape(particle_spacing, n_particles_per_dimension, min_coord
     end
 
     ELTYPE = eltype(particle_spacing)
-
     n_particles = prod(n_particles_per_dimension)
 
-    coordinates = rectangular_shape_coords(particle_spacing, n_particles_per_dimension,
+    # The type of the particle spacing determines the eltype of the coordinates
+    coordinates = rectangular_shape_coords(convert(coordinates_eltype, particle_spacing),
+                                           n_particles_per_dimension,
                                            min_coordinates, place_on_shell=place_on_shell,
                                            loop_order=loop_order)
 
diff --git a/src/setups/rectangular_tank.jl b/src/setups/rectangular_tank.jl
index ce420fbdf3..7665528c68 100644
--- a/src/setups/rectangular_tank.jl
+++ b/src/setups/rectangular_tank.jl
@@ -81,9 +81,9 @@ RectangularTank{3, 6, Float64}(...) *the rest of this line is ignored by filter*
 
 See also: [`reset_wall!`](@ref).
 """
-struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
-    fluid                     :: InitialCondition{ELTYPE}
-    boundary                  :: InitialCondition{ELTYPE}
+struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real, F, B}
+    fluid                     :: F
+    boundary                  :: B
     fluid_size                :: NTuple{NDIMS, ELTYPE}
     tank_size                 :: NTuple{NDIMS, ELTYPE}
     faces_                    :: NTuple{NDIMSt2, Bool} # store if face in dir exists (-x +x -y +y -z +z)
@@ -95,16 +95,16 @@ struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
 
     function RectangularTank(particle_spacing, fluid_size, tank_size, fluid_density;
                              velocity=zeros(length(fluid_size)), fluid_mass=nothing,
-                             pressure=0,
-                             acceleration=nothing, state_equation=nothing,
+                             pressure=0, acceleration=nothing, state_equation=nothing,
                              boundary_density=fluid_density,
                              n_layers=1, spacing_ratio=1,
                              min_coordinates=zeros(length(fluid_size)),
-                             faces=Tuple(trues(2 * length(fluid_size))))
+                             faces=Tuple(trues(2 * length(fluid_size))),
+                             coordinates_eltype=Float64)
         NDIMS = length(fluid_size)
         ELTYPE = eltype(particle_spacing)
-        fluid_size_ = Tuple(ELTYPE.(fluid_size))
-        tank_size_ = Tuple(ELTYPE.(tank_size))
+        fluid_size_ = Tuple(convert.(ELTYPE, fluid_size))
+        tank_size_ = Tuple(convert.(ELTYPE, tank_size))
 
         if particle_spacing < eps()
             throw(ArgumentError("`particle_spacing` needs to be positive and larger than $(eps())."))
@@ -135,10 +135,11 @@ struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
                                                       spacing_ratio)
 
         boundary_spacing = particle_spacing / spacing_ratio
+
+        # The type of the particle spacing determines the eltype of the coordinates
         boundary_coordinates,
-        face_indices = initialize_boundaries(boundary_spacing,
-                                             tank_size_,
-                                             n_boundaries_per_dim,
+        face_indices = initialize_boundaries(convert.(coordinates_eltype, boundary_spacing),
+                                             tank_size_, n_boundaries_per_dim,
                                              n_layers, faces)
 
         boundary_masses = boundary_density * boundary_spacing^NDIMS *
@@ -148,8 +149,7 @@ struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
 
         n_particles_per_dim,
         fluid_size_ = check_tank_overlap(fluid_size_, tank_size_,
-                                         particle_spacing,
-                                         n_particles_per_dim)
+                                         particle_spacing, n_particles_per_dim)
 
         boundary = InitialCondition(coordinates=boundary_coordinates,
                                     velocity=boundary_velocities,
@@ -166,24 +166,27 @@ struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
                 fluid = RectangularShape(particle_spacing, n_particles_per_dim,
                                          zeros(NDIMS);
                                          velocity, pressure, acceleration, state_equation,
-                                         mass=fluid_mass)
+                                         mass=fluid_mass, coordinates_eltype)
             else
                 fluid = RectangularShape(particle_spacing, n_particles_per_dim,
                                          zeros(NDIMS);
                                          density=fluid_density, velocity, pressure,
-                                         acceleration, state_equation, mass=fluid_mass)
+                                         acceleration, state_equation, mass=fluid_mass,
+                                         coordinates_eltype)
             end
             # Move the tank corner in the negative coordinate directions to the desired position
             fluid.coordinates .+= min_coordinates
         else
             # Fluid is empty
-            fluid = InitialCondition(coordinates=zeros(ELTYPE, NDIMS, 0), density=1.0)
+            fluid = InitialCondition(coordinates=zeros(coordinates_eltype, NDIMS, 0),
+                                     density=1.0,
+                                     particle_spacing=convert(ELTYPE, particle_spacing))
         end
 
-        return new{NDIMS, 2 * NDIMS, ELTYPE}(fluid, boundary, fluid_size_, tank_size_,
-                                             faces, face_indices,
-                                             particle_spacing, spacing_ratio, n_layers,
-                                             n_particles_per_dim)
+        return new{NDIMS, 2 * NDIMS, ELTYPE, typeof(fluid),
+                   typeof(boundary)}(fluid, boundary, fluid_size_, tank_size_,
+                                    faces, face_indices, particle_spacing, spacing_ratio,
+                                    n_layers, n_particles_per_dim)
     end
 end
 
diff --git a/validation/dam_break_2d/plot_pressure_sensors.jl b/validation/dam_break_2d/plot_pressure_sensors.jl
index 152257fe3a..c33c3b8f0a 100644
--- a/validation/dam_break_2d/plot_pressure_sensors.jl
+++ b/validation/dam_break_2d/plot_pressure_sensors.jl
@@ -11,9 +11,9 @@ using Glob
 using Printf
 
 # Set to save figures as SVG
-save_figures = false
+save_figures = true
 # Set to true to include simulation results in the `out` folder (if available)
-include_sim_results = false
+include_sim_results = true
 
 # Initial width of the fluid
 H = 0.6
@@ -26,8 +26,8 @@ case_dir = joinpath(validation_dir(), "dam_break_2d")
 
 edac_reference_files = joinpath.(case_dir,
                                  [
-                                     "validation_reference_edac_40.json",
-                                     "validation_reference_edac_80.json",
+                                    #  "validation_reference_edac_40.json",
+                                    #  "validation_reference_edac_80.json",
                                      "validation_reference_edac_400.json"
                                  ])
 edac_sim_files = include_sim_results ?
@@ -38,8 +38,8 @@ edac_files = sort(merged_files, by=extract_number_from_filename)
 
 wcsph_reference_files = joinpath.(case_dir,
                                   [
-                                      "validation_reference_wcsph_40.json",
-                                      "validation_reference_wcsph_80.json",
+                                    #   "validation_reference_wcsph_40.json",
+                                    #   "validation_reference_wcsph_80.json",
                                       "validation_reference_wcsph_400.json"
                                   ])
 wcsph_sim_files = include_sim_results ?
@@ -81,6 +81,9 @@ function plot_sensor_results(axs, files)
                       normalization_factor_pressure
         label_prefix = occursin("reference", json_file) ? "TrixiParticles " : ""
         res = extract_number_from_filename(json_file)
+        if res < 400
+            continue
+        end
 
         lines!(axs[1], t, pressure_P1; label="$label_prefix H/$res",
                color=idx, colormap=:tab10, colorrange=(1, 10))
@@ -100,7 +103,7 @@ axs_edac = [Axis(fig_sensors[1, i], title="Sensor P$i (EDAC)")
 axs_wcsph = [Axis(fig_sensors[3, i], title="Sensor P$i (WCSPH)")
              for i in 1:n_sensors]
 
-function plot_reference(ax, time, data, label, color=:red, linestyle=:solid, linewidth=3)
+function plot_reference(ax, time, data, label, color=:red, linestyle=:solid, linewidth=2)
     lines!(ax, time, data; color, linestyle, linewidth, label)
 end
 
diff --git a/validation/dam_break_2d/sensors.jl b/validation/dam_break_2d/sensors.jl
index e3d9251515..cff8f37f7a 100644
--- a/validation/dam_break_2d/sensors.jl
+++ b/validation/dam_break_2d/sensors.jl
@@ -1,5 +1,5 @@
 function max_x_coord(system, data, t)
-    return maximum(j -> data.coordinates[1, j], axes(data.coordinates, 2))
+    return maximum(view(data.coordinates, 1, :))
 end
 
 function interpolated_pressure(coord_top, coord_bottom, v_ode, u_ode, t, system, semi) end
@@ -10,8 +10,8 @@ function interpolated_pressure(coord_top, coord_bottom, v_ode, u_ode, t,
     interpolated_values = interpolate_line(coord_top, coord_bottom,
                                            n_interpolation_points, semi, system, v_ode,
                                            u_ode,
-                                           smoothing_length=2.0 *
-                                                            TrixiParticles.initial_smoothing_length(system),
+                                        #    smoothing_length=2.0 *
+                                        #                     TrixiParticles.initial_smoothing_length(system),
                                            clip_negative_pressure=true, cut_off_bnd=false)
     return sum(map(x -> isnan(x) ? 0.0 : x, interpolated_values.pressure)) /
            n_interpolation_points
diff --git a/validation/dam_break_2d/setup_marrone_2011.jl b/validation/dam_break_2d/setup_marrone_2011.jl
index ca4a88b7c3..898324fea6 100644
--- a/validation/dam_break_2d/setup_marrone_2011.jl
+++ b/validation/dam_break_2d/setup_marrone_2011.jl
@@ -28,9 +28,16 @@ H = 0.6
 particle_spacing = H / particles_per_height
 
 # Import variables from the example file
+alpha = 0.02
+viscosity_fluid = ArtificialViscosityMonaghan(alpha=alpha, beta=0.0)
+shifting_technique = nothing
 trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
               alpha=0.02, # This is used by Marrone et al. (2011)
-              sol=nothing, ode=nothing, fluid_particle_spacing=particle_spacing)
+              sol=nothing,
+              ode=nothing,
+              shifting_technique=shifting_technique,
+              viscosity_fluid=viscosity_fluid,
+              fluid_particle_spacing=particle_spacing)
 
 use_edac = false # Set to false to use WCSPH
 
@@ -93,7 +100,9 @@ else
     method = "wcsph"
 end
 
-formatted_string = string(particles_per_height)
+
+extra_string = ""
+formatted_string = string(particles_per_height) * extra_string
 
 postprocessing_cb = PostprocessCallback(; dt=0.01 / sqrt(gravity / H),
                                         output_directory="out",
@@ -111,15 +120,20 @@ boundary_density_calculator = AdamiPressureExtrapolation(allow_loop_flipping=fal
 # i.e. t(g/H)^(1/2) = (1.5, 2.36, 3.0, 5.7, 6.45).
 # Note that the images in Marrone et al. are obtained with `particles_per_height = 320`.
 saving_paper = SolutionSavingCallback(save_times=[0.0, 1.5, 2.36, 3.0, 5.7, 6.45] ./
-                                                 sqrt(gravity / H),
-                                      prefix="marrone_times")
+                                                 sqrt(gravity / H), prefix="marrone_times")
 
 trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
               fluid_particle_spacing=particle_spacing,
               boundary_density_calculator=boundary_density_calculator,
               state_equation=state_equation,
+              viscosity_fluid=viscosity_fluid,
               solution_prefix="validation_" * method * "_" * formatted_string,
               tspan=tspan, fluid_system=fluid_system,
               update_strategy=nothing,
               extra_callback=postprocessing_cb,
-              extra_callback2=saving_paper)
+              extra_callback2=saving_paper,
+              extra_system=nothing,
+              extra_system2=nothing,
+              cfl=0.9,
+              parallelization_backend=PolyesterBackend(),
+              neighborhood_search=GridNeighborhoodSearch{2}(update_strategy=nothing))
diff --git a/validation/dam_break_2d/validation_dam_break_2d.jl b/validation/dam_break_2d/validation_dam_break_2d.jl
index 22f563e245..493573c84d 100644
--- a/validation/dam_break_2d/validation_dam_break_2d.jl
+++ b/validation/dam_break_2d/validation_dam_break_2d.jl
@@ -19,10 +19,10 @@ using TrixiParticles.JSON
 # `resolution` in this case is set relative to `H`, the initial height of the fluid.
 # Use 40, 80 or 400 for validation.
 # Note: 400 takes about 30 minutes on a large data center CPU (much longer with serial update)
-resolution = 40
+resolution = 600
 
 # Use `SerialUpdate()` to obtain consistent results when using multiple threads
-update_strategy = nothing
+update_strategy = ParallelUpdate()
 # update_strategy = SerialUpdate()
 
 # ==========================================================================================
@@ -30,11 +30,13 @@ update_strategy = nothing
 trixi_include(@__MODULE__,
               joinpath(validation_dir(), "dam_break_2d",
                        "setup_marrone_2011.jl"),
-              use_edac=false, update_strategy=update_strategy,
+              use_edac=false,
               particles_per_height=resolution,
               sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024)
               alpha=0.01, # This is used by De Courcy et al. (2024)
-              tspan=(0.0, 7 / sqrt(9.81 / 0.6))) # This is used by De Courcy et al. (2024)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)), # This is used by De Courcy et al. (2024)
+              parallelization_backend=PolyesterBackend(),
+              neighborhood_search=GridNeighborhoodSearch{2}(update_strategy=update_strategy))
 
 reference_file_wcsph_name = joinpath(validation_dir(), "dam_break_2d",
                                      "validation_reference_wcsph_$formatted_string.json")
@@ -59,11 +61,13 @@ error_wcsph_P2 = interpolated_mse(reference_data["pressure_P2_fluid_1"]["time"],
 trixi_include(@__MODULE__,
               joinpath(validation_dir(), "dam_break_2d",
                        "setup_marrone_2011.jl"),
-              use_edac=true, update_strategy=update_strategy,
+              use_edac=true,
               particles_per_height=resolution,
               sound_speed=50 * sqrt(9.81 * 0.6), # This is used by De Courcy et al. (2024)
               alpha=0.01, # This is used by De Courcy et al. (2024)
-              tspan=(0.0, 7 / sqrt(9.81 / 0.6))) # This is used by De Courcy et al. (2024)
+              tspan=(0.0, 7 / sqrt(9.81 / 0.6)),  # This is used by De Courcy et al. (2024)
+              parallelization_backend=PolyesterBackend(),
+              neighborhood_search=GridNeighborhoodSearch{2}(update_strategy=update_strategy))
 
 reference_file_edac_name = joinpath(validation_dir(), "dam_break_2d",
                                     "validation_reference_edac_$formatted_string.json")
@@ -83,5 +87,6 @@ error_edac_P2 = interpolated_mse(reference_data["pressure_P2_fluid_1"]["time"],
                                  run_data["pressure_P2_fluid_1"]["time"],
                                  run_data["pressure_P2_fluid_1"]["values"])
 
-print(error_edac_P1, " ", error_edac_P2, " ",
-      error_wcsph_P1, " ", error_wcsph_P2, "\n")
+print("Error of EDAC and WCSPH for dam break 2D with resolution $resolution:\n")
+print("EDAC P1 ", error_edac_P1, " P2 ", error_edac_P2, "\n")
+print("WCSPH P1 ", error_wcsph_P1, " P2 ", error_wcsph_P2, "\n")
diff --git a/validation/dam_break_2d/validation_dam_break_2d_gpu.jl b/validation/dam_break_2d/validation_dam_break_2d_gpu.jl
new file mode 100644
index 0000000000..79855b8ea8
--- /dev/null
+++ b/validation/dam_break_2d/validation_dam_break_2d_gpu.jl
@@ -0,0 +1,20 @@
+using TrixiParticles
+using TrixiParticles.JSON
+using CUDA
+
+min_corner = (-1.5, -1.5)
+max_corner = (6.5, 5.0)
+cell_list = FullGridCellList(; min_corner, max_corner, max_points_per_cell=30)
+
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list, update_strategy=ParallelUpdate())
+# neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)
+
+# trixi_include_changeprecision(Float32, @__MODULE__,
+#                                 joinpath(validation_dir(),
+#                                         "dam_break_2d", "validation_dam_break_2d.jl");
+#                                 parallelization_backend=CUDABackend(),
+#                                 neighborhood_search=neighborhood_search)
+
+trixi_include_changeprecision(Float32, @__MODULE__,
+                                joinpath(examples_dir(),
+                                        "fluid", "dam_break_ds.jl");)