From f81adcc196d753eb967dc41466b8ce5f790b7a05 Mon Sep 17 00:00:00 2001
From: dzzz2001 <dzcpku2001@163.com>
Date: Sun, 7 Dec 2025 21:35:05 +0800
Subject: [PATCH 1/3] Optimize snap_psibeta_half_tddft for better performance

Key optimizations include:
1. Cache Gauss-Legendre Grid: Implemented thread-safe caching for radial integration grid points and weights.
2. Precompute Spherical Harmonics: Precomputed Ylm and A dot r on the Lebedev angular grid to reduce inner loop overhead.
3. Memory Optimization: Lifted vector allocations out of loops and reused buffers to minimize allocation costs.
4. Inline Interpolation: Inlined polynomial interpolation and used precomputed inverse step size to avoid divisions.
5. Common Subexpression Elimination: Extracted invariant factors from the innermost loop to reduce arithmetic operations.
---
 .../module_rt/snap_psibeta_half_tddft.cpp     | 128 ++++++++++++++----
 1 file changed, 105 insertions(+), 23 deletions(-)
diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
index a0bb7c3548..c479c74a41 100644
--- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
+++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
@@ -106,12 +106,49 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
     const int mesh_r1 = orb.Phi[T1].PhiLN(L1, N1).getNr();
     const double* psi_1 = orb.Phi[T1].PhiLN(L1, N1).getPsi();
     const double dk_1 = orb.Phi[T1].PhiLN(L1, N1).getDk();
+    const double inv_dk_1 = 1.0 / dk_1;
 
     const int ridial_grid_num = 140;
     const int angular_grid_num = 110;
     std::vector<double> r_ridial(ridial_grid_num);
     std::vector<double> weights_ridial(ridial_grid_num);
 
+    // OPTIMIZATION START: Cache standard Gauss-Legendre grid
+    static std::vector<double> gl_x(ridial_grid_num);
+    static std::vector<double> gl_w(ridial_grid_num);
+    static bool gl_init = false;
+
+    // Thread-safe initialization
+    if (!gl_init)
+    {
+        #pragma omp critical(init_gauss_legendre)
+        {
+            if (!gl_init)
+            {
+                ModuleBase::Integral::Gauss_Legendre_grid_and_weight(ridial_grid_num, gl_x.data(), gl_w.data());
+                gl_init = true;
+            }
+        }
+    }
+    // OPTIMIZATION END
+
+    // Precompute A dot r_angular for Lebedev grid
+    std::vector<double> A_dot_lebedev(angular_grid_num);
+    for (int ian = 0; ian < angular_grid_num; ++ian)
+    {
+        A_dot_lebedev[ian] = A.x * ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian] +
+                             A.y * ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian] +
+                             A.z * ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian];
+    }
+
+    // Buffers to reuse
+    std::vector<std::complex<double>> result_angular;
+    std::vector<std::complex<double>> result_angular_r_commu_x;
+    std::vector<std::complex<double>> result_angular_r_commu_y;
+    std::vector<std::complex<double>> result_angular_r_commu_z;
+    std::vector<double> rly1;
+    std::vector<std::vector<double>> rly0_all(angular_grid_num);
+
     int index = 0;
     for (int nb = 0; nb < nproj; nb++)
     {
@@ -128,28 +165,52 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
         const double dk_0 = infoNL_.Beta[T0].Proj[nb].getDk();
 
         const double Rcut0 = infoNL_.Beta[T0].Proj[nb].getRcut();
-        ModuleBase::Integral::Gauss_Legendre_grid_and_weight(radial0[0],
-                                                             radial0[mesh_r0 - 1],
-                                                             ridial_grid_num,
-                                                             r_ridial.data(),
-                                                             weights_ridial.data());
+
+        // OPTIMIZATION: Use precomputed standard Gauss-Legendre grid
+        double r_min = radial0[0];
+        double r_max = radial0[mesh_r0 - 1];
+        double xl = (r_max - r_min) * 0.5;
+        double xmean = (r_max + r_min) * 0.5;
+        
+        for(int i=0; i<ridial_grid_num; ++i)
+        {
+            r_ridial[i] = xmean + xl * gl_x[i];
+            weights_ridial[i] = xl * gl_w[i];
+        }
 
         const double A_phase = A * R0;
         const std::complex<double> exp_iAR0 = std::exp(ModuleBase::IMAG_UNIT * A_phase);
 
-        std::vector<double> rly0(L0);
-        std::vector<double> rly1(L1);
+        // Precompute rly0 for all angular points
+        for(int ian = 0; ian < angular_grid_num; ++ian) {
+            ModuleBase::Ylm::rl_sph_harm(L0, 
+                ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian], 
+                ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian], 
+                ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian], 
+                rly0_all[ian]);
+        }
+
+        // Resize buffers
+        if (result_angular.size() < 2 * L0 + 1)
+        {
+            result_angular.resize(2 * L0 + 1);
+            if (calc_r)
+            {
+                result_angular_r_commu_x.resize(2 * L0 + 1);
+                result_angular_r_commu_y.resize(2 * L0 + 1);
+                result_angular_r_commu_z.resize(2 * L0 + 1);
+            }
+        }
+
         for (int ir = 0; ir < ridial_grid_num; ir++)
         {
-            std::vector<std::complex<double>> result_angular(2 * L0 + 1, 0.0);
-            std::vector<std::complex<double>> result_angular_r_commu_x;
-            std::vector<std::complex<double>> result_angular_r_commu_y;
-            std::vector<std::complex<double>> result_angular_r_commu_z;
+            // Reset result accumulators
+            std::fill(result_angular.begin(), result_angular.begin() + (2 * L0 + 1), 0.0);
             if (calc_r)
             {
-                result_angular_r_commu_x.resize(2 * L0 + 1, 0.0);
-                result_angular_r_commu_y.resize(2 * L0 + 1, 0.0);
-                result_angular_r_commu_z.resize(2 * L0 + 1, 0.0);
+                std::fill(result_angular_r_commu_x.begin(), result_angular_r_commu_x.begin() + (2 * L0 + 1), 0.0);
+                std::fill(result_angular_r_commu_y.begin(), result_angular_r_commu_y.begin() + (2 * L0 + 1), 0.0);
+                std::fill(result_angular_r_commu_z.begin(), result_angular_r_commu_z.begin() + (2 * L0 + 1), 0.0);
             }
 
             for (int ian = 0; ian < angular_grid_num; ian++)
@@ -158,11 +219,13 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
                 const double y = ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian];
                 const double z = ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian];
                 const double weights_angular = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian];
-                const ModuleBase::Vector3<double> r_angular_tmp(x, y, z);
-
-                const ModuleBase::Vector3<double> r_coor = r_ridial[ir] * r_angular_tmp;
+                
+                const double r_val = r_ridial[ir];
+                const ModuleBase::Vector3<double> r_coor(r_val * x, r_val * y, r_val * z);
+                
                 const ModuleBase::Vector3<double> tmp_r_coor = r_coor + dRa;
                 const double tmp_r_coor_norm = tmp_r_coor.norm();
+                
                 if (tmp_r_coor_norm > Rcut1) 
                 {
                     continue;
@@ -174,21 +237,40 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
                     tmp_r_unit = tmp_r_coor / tmp_r_coor_norm;
                 }
 
-                ModuleBase::Ylm::rl_sph_harm(L0, x, y, z, rly0);
+                const std::vector<double>& rly0_vec = rly0_all[ian];
 
                 ModuleBase::Ylm::rl_sph_harm(L1, tmp_r_unit.x, tmp_r_unit.y, tmp_r_unit.z, rly1);
 
-                const double phase = A * r_coor;
+                const double phase = r_val * A_dot_lebedev[ian];
                 const std::complex<double> exp_iAr = std::exp(ModuleBase::IMAG_UNIT * phase);
 
                 const ModuleBase::Vector3<double> tmp_r_coor_r_commu = r_coor + R0;
-                const double interp_v = ModuleBase::PolyInt::Polynomial_Interpolation(psi_1, 
-                      mesh_r1, dk_1, tmp_r_coor_norm);
+                
+                // OPTIMIZATION: Inline Polynomial Interpolation
+                double position = tmp_r_coor_norm * inv_dk_1;
+                int iq = static_cast<int>(position);
+                double interp_v = 0.0;
+                
+                if (iq <= mesh_r1 - 4)
+                {
+                    const double x0 = position - static_cast<double>(iq);
+                    const double x1 = 1.0 - x0;
+                    const double x2 = 2.0 - x0;
+                    const double x3 = 3.0 - x0;
+                    interp_v = x1*x2*(psi_1[iq]*x3+psi_1[iq+3]*x0)/6.0
+                             + x0*x3*(psi_1[iq+1]*x2-psi_1[iq+2]*x1)/2.0;
+                }
+                
+                const double weight_interp = interp_v * weights_angular;
+                const int offset_L0 = L0 * L0;
+                const int offset_L1 = L1 * L1 + m1;
+                const double rly1_val = rly1[offset_L1];
+                
+                const std::complex<double> common_factor = exp_iAr * rly1_val * weight_interp;
 
                 for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
                 {
-                    std::complex<double> temp = exp_iAr * rly0[L0 * L0 + m0] * rly1[L1 * L1 + m1]
-                                                * interp_v * weights_angular;
+                    std::complex<double> temp = common_factor * rly0_vec[offset_L0 + m0];
                     result_angular[m0] += temp;
 
                     if (calc_r)

From 3e1fe152d75667c0763bdd62d5ae5b660e1cbf41 Mon Sep 17 00:00:00 2001
From: dzzz2001 <dzcpku2001@163.com>
Date: Sun, 7 Dec 2025 22:52:50 +0800
Subject: [PATCH 2/3] Refactor snap_psibeta_half_tddft: improved code
 structure, added comments, and consolidated optimizations

---
 .../module_rt/snap_psibeta_half_tddft.cpp     | 434 ++++++++++--------
 1 file changed, 242 insertions(+), 192 deletions(-)

diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
index c479c74a41..9ebc18d257 100644
--- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
+++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
@@ -5,12 +5,89 @@
 #include "source_base/math_polyint.h"
 #include "source_base/timer.h"
 #include "source_base/ylm.h"
+#include <vector>
+#include <complex>
+#include <cmath>
 
 namespace module_rt
 {
 
-// nlm[0] : <phi|exp^{-iAr}|beta>
-// nlm[1, 2, 3,] : <phi|r_a * exp^{-iAr}|beta>, which a = x, y, z.
+/**
+ * @brief Initialize Gauss-Legendre grid points and weights.
+ *        Thread-safe initialization using static local variable.
+ * 
+ * @param grid_size Number of grid points (140)
+ * @param gl_x Output: Grid points in [-1, 1]
+ * @param gl_w Output: Weights
+ */
+static void init_gauss_legendre_grid(int grid_size, std::vector<double>& gl_x, std::vector<double>& gl_w)
+{
+    static bool init = false;
+    // Thread-safe initialization
+    #pragma omp critical(init_gauss_legendre)
+    {
+        if (!init)
+        {
+            ModuleBase::Integral::Gauss_Legendre_grid_and_weight(grid_size, gl_x.data(), gl_w.data());
+            init = true;
+        }
+    }
+}
+
+/**
+ * @brief Interpolate radial function value at a given distance using cubic interpolation.
+ * 
+ * @param psi Pointer to radial function array
+ * @param mesh Number of mesh points
+ * @param inv_dk Inverse of grid spacing (1/dk)
+ * @param distance Distance to interpolate at
+ * @return double Interpolated value
+ */
+inline double interpolate_radial(
+    const double* psi, 
+    int mesh, 
+    double inv_dk, 
+    double distance)
+{
+    double position = distance * inv_dk;
+    int iq = static_cast<int>(position);
+    
+    // Boundary check safe-guard
+    if (iq >= mesh - 4) return 0.0;
+
+    const double x0 = position - static_cast<double>(iq);
+    const double x1 = 1.0 - x0;
+    const double x2 = 2.0 - x0;
+    const double x3 = 3.0 - x0;
+    
+    // Cubic interpolation formula
+    return x1*x2*(psi[iq]*x3+psi[iq+3]*x0)/6.0
+         + x0*x3*(psi[iq+1]*x2-psi[iq+2]*x1)/2.0;
+}
+
+/**
+ * @brief Main function to calculate overlap integrals <phi|exp^{-iAr}|beta>
+ *        and its derivatives (if calc_r is true).
+ * 
+ *        This function integrates the overlap between a local orbital phi (at R1)
+ *        and a non-local projector beta (at R0), modulated by a plane-wave-like phase factor
+ *        exp^{-iAr}, where A is a vector potential.
+ * 
+ * @param orb LCAO Orbitals information
+ * @param infoNL_ Non-local pseudopotential information
+ * @param nlm Output: 
+ *            nlm[0] : <phi|exp^{-iAr}|beta>
+ *            nlm[1, 2, 3] : <phi|r_a * exp^{-iAr}|beta>, a = x, y, z (if calc_r=true)
+ * @param R1 Position of atom 1 (orbital phi)
+ * @param T1 Type of atom 1
+ * @param L1 Angular momentum of orbital phi
+ * @param m1 Magnetic quantum number of orbital phi
+ * @param N1 Radial quantum number of orbital phi
+ * @param R0 Position of atom 0 (projector beta)
+ * @param T0 Type of atom 0
+ * @param A Vector potential A (or related field vector)
+ * @param calc_r Whether to calculate position operator matrix elements
+ */
 void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
                              const InfoNonlocal& infoNL_,
                              std::vector<std::vector<std::complex<double>>>& nlm,
@@ -19,120 +96,76 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
                              const int& L1,
                              const int& m1,
                              const int& N1,
-                             const ModuleBase::Vector3<double>& R0, // The projector.
+                             const ModuleBase::Vector3<double>& R0, 
                              const int& T0,
                              const ModuleBase::Vector3<double>& A,
                              const bool& calc_r)
 {
     ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft");
 
-    // find number of projectors on atom R0
+    // 1. Initialization and Early Exits
     const int nproj = infoNL_.nproj[T0];
-    if (nproj == 0)
-    {
-        if (calc_r)
-        {
-            nlm.resize(4);
-        }
-        else
-        {
-            nlm.resize(1);
-        }
-        return;
-    }
-
-    std::vector<bool> calproj(nproj);
-    std::vector<int> rmesh1(nproj);
-
-    if (calc_r)
-    {
-        nlm.resize(4);
-    }
-    else
-    {
-        nlm.resize(1);
-    }
-
-    // Count number of projectors (l,m)
+    
+    // Resize output vector based on whether position operator matrix elements are needed
+    int required_size = calc_r ? 4 : 1;
+    if (nlm.size() != required_size) nlm.resize(required_size);
+    
+    if (nproj == 0) return;
+
+    // 2. Determine total number of projectors and identify active ones based on cutoff
     int natomwfc = 0;
-    for (int ip = 0; ip < nproj; ip++)
-    {
-        //============================
-        // Use pseudo-atomic orbitals
-        //============================
-
-        const int L0 = infoNL_.Beta[T0].Proj[ip].getL(); // mohan add 2021-05-07
-        natomwfc += 2 * L0 + 1;
-    }
-
-    for (int dim = 0; dim < nlm.size(); dim++)
-    {
-        nlm[dim].resize(natomwfc);
-        for (auto& x: nlm[dim])
-        {
-            x = 0.0;
-        }
-    }
-
-    // rcut of orbtials and projectors
-    // in our calculation, we always put orbital phi at the left side of <phi|beta>
-    // because <phi|beta> = <beta|phi>
+    std::vector<bool> calproj(nproj, false);
+    
     const double Rcut1 = orb.Phi[T1].getRcut();
     const ModuleBase::Vector3<double> dRa = R0 - R1;
-
-    double distance10 = dRa.norm();
-
-    bool all_out = true;
+    const double distance10 = dRa.norm();
+    
+    bool any_active = false;
     for (int ip = 0; ip < nproj; ip++)
     {
+        const int L0 = infoNL_.Beta[T0].Proj[ip].getL();
+        natomwfc += 2 * L0 + 1;
+        
         const double Rcut0 = infoNL_.Beta[T0].Proj[ip].getRcut();
-        if (distance10 > (Rcut1 + Rcut0))
+        if (distance10 <= (Rcut1 + Rcut0))
         {
-            calproj[ip] = false;
-        }
-        else
-        {
-            all_out = false;
             calproj[ip] = true;
+            any_active = true;
         }
     }
 
-    if (all_out)
+    // Initialize output values to zero and resize inner vectors
+    for (auto& x : nlm) {
+        x.assign(natomwfc, 0.0);
+    }
+
+    if (!any_active)
     {
         ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft");
         return;
     }
 
-    const int mesh_r1 = orb.Phi[T1].PhiLN(L1, N1).getNr();
-    const double* psi_1 = orb.Phi[T1].PhiLN(L1, N1).getPsi();
-    const double dk_1 = orb.Phi[T1].PhiLN(L1, N1).getDk();
+    // 3. Prepare Orbital Data (Phi)
+    const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1);
+    const int mesh_r1 = phi_ln.getNr();
+    const double* psi_1 = phi_ln.getPsi();
+    const double dk_1 = phi_ln.getDk();
     const double inv_dk_1 = 1.0 / dk_1;
 
-    const int ridial_grid_num = 140;
+    // 4. Prepare Integration Grids
+    const int radial_grid_num = 140;
     const int angular_grid_num = 110;
-    std::vector<double> r_ridial(ridial_grid_num);
-    std::vector<double> weights_ridial(ridial_grid_num);
-
-    // OPTIMIZATION START: Cache standard Gauss-Legendre grid
-    static std::vector<double> gl_x(ridial_grid_num);
-    static std::vector<double> gl_w(ridial_grid_num);
-    static bool gl_init = false;
+    
+    // Cached standard Gauss-Legendre grid
+    static std::vector<double> gl_x(radial_grid_num);
+    static std::vector<double> gl_w(radial_grid_num);
+    init_gauss_legendre_grid(radial_grid_num, gl_x, gl_w);
 
-    // Thread-safe initialization
-    if (!gl_init)
-    {
-        #pragma omp critical(init_gauss_legendre)
-        {
-            if (!gl_init)
-            {
-                ModuleBase::Integral::Gauss_Legendre_grid_and_weight(ridial_grid_num, gl_x.data(), gl_w.data());
-                gl_init = true;
-            }
-        }
-    }
-    // OPTIMIZATION END
+    // Buffers for mapped radial grid
+    std::vector<double> r_radial(radial_grid_num);
+    std::vector<double> w_radial(radial_grid_num);
 
-    // Precompute A dot r_angular for Lebedev grid
+    // Precompute A dot r_angular (A * u_angle) for the Lebedev grid
     std::vector<double> A_dot_lebedev(angular_grid_num);
     for (int ian = 0; ian < angular_grid_num; ++ian)
     {
@@ -141,189 +174,206 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb,
                              A.z * ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian];
     }
 
-    // Buffers to reuse
-    std::vector<std::complex<double>> result_angular;
-    std::vector<std::complex<double>> result_angular_r_commu_x;
-    std::vector<std::complex<double>> result_angular_r_commu_y;
-    std::vector<std::complex<double>> result_angular_r_commu_z;
-    std::vector<double> rly1;
-    std::vector<std::vector<double>> rly0_all(angular_grid_num);
+    // Reuseable buffers for inner loops to avoid allocation
+    std::vector<std::complex<double>> result_angular; // Accumulator for angular integration
+    // Accumulators for position operator components
+    std::vector<std::complex<double>> res_ang_x, res_ang_y, res_ang_z;
+    
+    std::vector<double> rly1((L1 + 1) * (L1 + 1)); // Spherical harmonics buffer for L1
+    std::vector<std::vector<double>> rly0_cache(angular_grid_num); // Cache for L0 Ylm
 
-    int index = 0;
+    // 5. Loop over Projectors (Beta)
+    int index_offset = 0;
     for (int nb = 0; nb < nproj; nb++)
     {
         const int L0 = infoNL_.Beta[T0].Proj[nb].getL();
+        const int num_m0 = 2 * L0 + 1;
+
         if (!calproj[nb])
         {
-            index += 2 * L0 + 1;
+            index_offset += num_m0;
             continue;
         }
 
-        const int mesh_r0 = infoNL_.Beta[T0].Proj[nb].getNr();
-        const double* beta_r = infoNL_.Beta[T0].Proj[nb].getBeta_r();
-        const double* radial0 = infoNL_.Beta[T0].Proj[nb].getRadial();
-        const double dk_0 = infoNL_.Beta[T0].Proj[nb].getDk();
+        const auto& proj = infoNL_.Beta[T0].Proj[nb];
+        const int mesh_r0 = proj.getNr();
+        const double* beta_r = proj.getBeta_r();
+        const double* radial0 = proj.getRadial();
+        const double dk_0 = proj.getDk();
+        const double Rcut0 = proj.getRcut();
 
-        const double Rcut0 = infoNL_.Beta[T0].Proj[nb].getRcut();
-
-        // OPTIMIZATION: Use precomputed standard Gauss-Legendre grid
+        // 5.1 Map Gauss-Legendre grid to radial interval [r_min, r_max]
         double r_min = radial0[0];
         double r_max = radial0[mesh_r0 - 1];
         double xl = (r_max - r_min) * 0.5;
         double xmean = (r_max + r_min) * 0.5;
         
-        for(int i=0; i<ridial_grid_num; ++i)
+        for(int i=0; i<radial_grid_num; ++i)
         {
-            r_ridial[i] = xmean + xl * gl_x[i];
-            weights_ridial[i] = xl * gl_w[i];
+            r_radial[i] = xmean + xl * gl_x[i];
+            w_radial[i] = xl * gl_w[i];
         }
 
         const double A_phase = A * R0;
         const std::complex<double> exp_iAR0 = std::exp(ModuleBase::IMAG_UNIT * A_phase);
 
-        // Precompute rly0 for all angular points
+        // 5.2 Precompute Spherical Harmonics (Ylm) for L0 on angular grid
+        // Since L0 changes with projector, we compute this per projector loop.
         for(int ian = 0; ian < angular_grid_num; ++ian) {
             ModuleBase::Ylm::rl_sph_harm(L0, 
                 ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian], 
                 ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian], 
                 ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian], 
-                rly0_all[ian]);
+                rly0_cache[ian]);
         }
 
-        // Resize buffers
-        if (result_angular.size() < 2 * L0 + 1)
+        // Resize accumulators if needed
+        if (result_angular.size() < num_m0)
         {
-            result_angular.resize(2 * L0 + 1);
+            result_angular.resize(num_m0);
             if (calc_r)
             {
-                result_angular_r_commu_x.resize(2 * L0 + 1);
-                result_angular_r_commu_y.resize(2 * L0 + 1);
-                result_angular_r_commu_z.resize(2 * L0 + 1);
+                res_ang_x.resize(num_m0);
+                res_ang_y.resize(num_m0);
+                res_ang_z.resize(num_m0);
             }
         }
 
-        for (int ir = 0; ir < ridial_grid_num; ir++)
+        // 5.3 Radial Integration Loop
+        for (int ir = 0; ir < radial_grid_num; ir++)
         {
-            // Reset result accumulators
-            std::fill(result_angular.begin(), result_angular.begin() + (2 * L0 + 1), 0.0);
+            const double r_val = r_radial[ir];
+
+            // Reset angular accumulators for this radial shell
+            std::fill(result_angular.begin(), result_angular.begin() + num_m0, 0.0);
             if (calc_r)
             {
-                std::fill(result_angular_r_commu_x.begin(), result_angular_r_commu_x.begin() + (2 * L0 + 1), 0.0);
-                std::fill(result_angular_r_commu_y.begin(), result_angular_r_commu_y.begin() + (2 * L0 + 1), 0.0);
-                std::fill(result_angular_r_commu_z.begin(), result_angular_r_commu_z.begin() + (2 * L0 + 1), 0.0);
+                std::fill(res_ang_x.begin(), res_ang_x.begin() + num_m0, 0.0);
+                std::fill(res_ang_y.begin(), res_ang_y.begin() + num_m0, 0.0);
+                std::fill(res_ang_z.begin(), res_ang_z.begin() + num_m0, 0.0);
             }
 
+            // 5.4 Angular Integration Loop (Lebedev Grid)
             for (int ian = 0; ian < angular_grid_num; ian++)
             {
                 const double x = ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian];
                 const double y = ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian];
                 const double z = ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian];
-                const double weights_angular = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian];
+                const double w_ang = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian];
                 
-                const double r_val = r_ridial[ir];
-                const ModuleBase::Vector3<double> r_coor(r_val * x, r_val * y, r_val * z);
+                // Vector r = r_val * u_angle
+                double rx = r_val * x;
+                double ry = r_val * y;
+                double rz = r_val * z;
                 
-                const ModuleBase::Vector3<double> tmp_r_coor = r_coor + dRa;
-                const double tmp_r_coor_norm = tmp_r_coor.norm();
+                // Vector r' = r + R0 - R1 = r + dRa
+                double tx = rx + dRa.x;
+                double ty = ry + dRa.y;
+                double tz = rz + dRa.z;
                 
-                if (tmp_r_coor_norm > Rcut1) 
+                double tnorm = std::sqrt(tx*tx + ty*ty + tz*tz);
+                
+                // If r' is outside the cutoff of Phi(r'), skip
+                if (tnorm > Rcut1) continue;
+
+                // Compute Ylm for L1 at direction r'
+                if (tnorm > 1e-10)
                 {
-                    continue;
+                    double inv_tnorm = 1.0 / tnorm;
+                    ModuleBase::Ylm::rl_sph_harm(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, rly1);
                 }
-
-                ModuleBase::Vector3<double> tmp_r_unit;
-                if (tmp_r_coor_norm > 1e-10)
+                else
                 {
-                    tmp_r_unit = tmp_r_coor / tmp_r_coor_norm;
+                    // At origin, only Y_00 is non-zero (if using real spherical harmonics convention)
+                    ModuleBase::Ylm::rl_sph_harm(L1, 0.0, 0.0, 1.0, rly1); 
                 }
 
-                const std::vector<double>& rly0_vec = rly0_all[ian];
-
-                ModuleBase::Ylm::rl_sph_harm(L1, tmp_r_unit.x, tmp_r_unit.y, tmp_r_unit.z, rly1);
-
+                // Calculate common phase and weight factor
+                // phase = A * r = r_val * (A * u_angle)
                 const double phase = r_val * A_dot_lebedev[ian];
                 const std::complex<double> exp_iAr = std::exp(ModuleBase::IMAG_UNIT * phase);
 
-                const ModuleBase::Vector3<double> tmp_r_coor_r_commu = r_coor + R0;
+                // Interpolate Psi at |r'|
+                double interp_psi = interpolate_radial(psi_1, mesh_r1, inv_dk_1, tnorm);
                 
-                // OPTIMIZATION: Inline Polynomial Interpolation
-                double position = tmp_r_coor_norm * inv_dk_1;
-                int iq = static_cast<int>(position);
-                double interp_v = 0.0;
-                
-                if (iq <= mesh_r1 - 4)
-                {
-                    const double x0 = position - static_cast<double>(iq);
-                    const double x1 = 1.0 - x0;
-                    const double x2 = 2.0 - x0;
-                    const double x3 = 3.0 - x0;
-                    interp_v = x1*x2*(psi_1[iq]*x3+psi_1[iq+3]*x0)/6.0
-                             + x0*x3*(psi_1[iq+1]*x2-psi_1[iq+2]*x1)/2.0;
-                }
-                
-                const double weight_interp = interp_v * weights_angular;
-                const int offset_L0 = L0 * L0;
                 const int offset_L1 = L1 * L1 + m1;
-                const double rly1_val = rly1[offset_L1];
+                const double ylm_L1_val = rly1[offset_L1];
                 
-                const std::complex<double> common_factor = exp_iAr * rly1_val * weight_interp;
+                // Combined factor: exp(iAr) * Y_L1m1(r') * Psi(|r'|) * weight_angle
+                const std::complex<double> common_factor = exp_iAr * ylm_L1_val * interp_psi * w_ang;
 
-                for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
+                // Retrieve precomputed Y_L0m0(r)
+                const std::vector<double>& rly0_vec = rly0_cache[ian];
+                const int offset_L0 = L0 * L0;
+
+                // Accumulate results for all m0 components
+                for (int m0 = 0; m0 < num_m0; m0++)
                 {
-                    std::complex<double> temp = common_factor * rly0_vec[offset_L0 + m0];
-                    result_angular[m0] += temp;
+                    std::complex<double> term = common_factor * rly0_vec[offset_L0 + m0];
+                    result_angular[m0] += term;
 
                     if (calc_r)
                     {
-                        result_angular_r_commu_x[m0] += temp * tmp_r_coor_r_commu.x;
-                        result_angular_r_commu_y[m0] += temp * tmp_r_coor_r_commu.y;
-                        result_angular_r_commu_z[m0] += temp * tmp_r_coor_r_commu.z;
+                        // Position operator r_op = r + R0
+                        // Note: Term involves (r_op)_a * exp(...).
+                        double r_op_x = rx + R0.x;
+                        double r_op_y = ry + R0.y;
+                        double r_op_z = rz + R0.z;
+                        
+                        res_ang_x[m0] += term * r_op_x;
+                        res_ang_y[m0] += term * r_op_y;
+                        res_ang_z[m0] += term * r_op_z;
                     }
                 }
-            }
-
-            int index_tmp = index;
-            const double temp = ModuleBase::PolyInt::Polynomial_Interpolation(beta_r,
-                     mesh_r0, dk_0, r_ridial[ir]) * r_ridial[ir] * weights_ridial[ir];
-
-            if (!calc_r)
+            } // End Angular Loop
+
+            // 5.5 Combine Radial and Angular parts
+            // Interpolate Beta(|r|)
+            // Note: The original code implies beta_r stores values that might need scaling or are just the function values.
+            // Typically radial integration is \int f(r) r^2 dr.
+            // Here we have factor: beta_val * r_radial[ir] * w_radial[ir]
+            // w_radial includes the Jacobian for the change of variable from [-1,1] to [r_min, r_max].
+            // The extra r_radial[ir] suggests either beta is stored as r*beta, or we are doing \int ... r dr (2D?), 
+            // or Jacobian r^2 is split. Assuming original logic is correct.
+            
+            double beta_val = ModuleBase::PolyInt::Polynomial_Interpolation(
+                                beta_r, mesh_r0, dk_0, r_radial[ir]);
+            
+            double radial_factor = beta_val * r_radial[ir] * w_radial[ir];
+
+            int current_idx = index_offset;
+            for (int m0 = 0; m0 < num_m0; m0++)
             {
-                for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
-                {
-                    nlm[0][index_tmp] += temp * result_angular[m0] * exp_iAR0;
-                    index_tmp++;
-                }
-            }
-            else
-            {
-                for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
+                // Final accumulation into global nlm array
+                // Add phase exp(i A * R0)
+                nlm[0][current_idx] += radial_factor * result_angular[m0] * exp_iAR0;
+                
+                if (calc_r)
                 {
-                    nlm[0][index_tmp] += temp * result_angular[m0] * exp_iAR0;
-                    nlm[1][index_tmp] += temp * result_angular_r_commu_x[m0] * exp_iAR0;
-                    nlm[2][index_tmp] += temp * result_angular_r_commu_y[m0] * exp_iAR0;
-                    nlm[3][index_tmp] += temp * result_angular_r_commu_z[m0] * exp_iAR0;
-                    index_tmp++;
+                    nlm[1][current_idx] += radial_factor * res_ang_x[m0] * exp_iAR0;
+                    nlm[2][current_idx] += radial_factor * res_ang_y[m0] * exp_iAR0;
+                    nlm[3][current_idx] += radial_factor * res_ang_z[m0] * exp_iAR0;
                 }
+                current_idx++;
             }
-        }
 
-        index += 2 * L0 + 1;
-    }
+        } // End Radial Loop
 
+        index_offset += num_m0;
+    } // End Projector Loop
+
+    // 6. Final Conjugation
+    // Apply conjugation to all elements as per convention <phi|beta> = <beta|phi>*
     for(int dim = 0; dim < nlm.size(); dim++)
     {
         for (auto &x : nlm[dim])
         {
-            // nlm[0] is <phi|exp^{-iAr}|beta>
-            // nlm[1 or 2 or 3] is <phi|r_a * exp^{-iAr}|beta>, a = x, y, z
             x = std::conj(x); 
         }
     }
 
-    assert(index == natomwfc);
+    assert(index_offset == natomwfc);
     ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft");
-
-    return;
 }
 
-} // namespace module_rt
+} // namespace module_rt
\ No newline at end of file

From 16d8b9f46f8f31a7c3110c682c61d8fb63aabe19 Mon Sep 17 00:00:00 2001
From: dzzz2001 <dzcpku2001@163.com>
Date: Mon, 8 Dec 2025 00:28:56 +0800
Subject: [PATCH 3/3] Fix off-by-one error in radial interpolation boundary
 check

---
 source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
index 9ebc18d257..f362b58ebd 100644
--- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
+++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp
@@ -53,7 +53,7 @@ inline double interpolate_radial(
     int iq = static_cast<int>(position);
     
     // Boundary check safe-guard
-    if (iq >= mesh - 4) return 0.0;
+    if (iq > mesh - 4) return 0.0;
 
     const double x0 = position - static_cast<double>(iq);
     const double x1 = 1.0 - x0;