From f81adcc196d753eb967dc41466b8ce5f790b7a05 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 7 Dec 2025 21:35:05 +0800 Subject: [PATCH 1/3] Optimize snap_psibeta_half_tddft for better performance Key optimizations include: 1. Cache Gauss-Legendre Grid: Implemented thread-safe caching for radial integration grid points and weights. 2. Precompute Spherical Harmonics: Precomputed Ylm and A dot r on the Lebedev angular grid to reduce inner loop overhead. 3. Memory Optimization: Lifted vector allocations out of loops and reused buffers to minimize allocation costs. 4. Inline Interpolation: Inlined polynomial interpolation and used precomputed inverse step size to avoid divisions. 5. Common Subexpression Elimination: Extracted invariant factors from the innermost loop to reduce arithmetic operations. --- .../module_rt/snap_psibeta_half_tddft.cpp | 128 ++++++++++++++---- 1 file changed, 105 insertions(+), 23 deletions(-) diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp index a0bb7c3548..c479c74a41 100644 --- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp +++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp @@ -106,12 +106,49 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, const int mesh_r1 = orb.Phi[T1].PhiLN(L1, N1).getNr(); const double* psi_1 = orb.Phi[T1].PhiLN(L1, N1).getPsi(); const double dk_1 = orb.Phi[T1].PhiLN(L1, N1).getDk(); + const double inv_dk_1 = 1.0 / dk_1; const int ridial_grid_num = 140; const int angular_grid_num = 110; std::vector r_ridial(ridial_grid_num); std::vector weights_ridial(ridial_grid_num); + // OPTIMIZATION START: Cache standard Gauss-Legendre grid + static std::vector gl_x(ridial_grid_num); + static std::vector gl_w(ridial_grid_num); + static bool gl_init = false; + + // Thread-safe initialization + if (!gl_init) + { + #pragma omp critical(init_gauss_legendre) + { + if (!gl_init) + { + ModuleBase::Integral::Gauss_Legendre_grid_and_weight(ridial_grid_num, gl_x.data(), gl_w.data()); + gl_init = true; + } + } + } + // OPTIMIZATION END + + // Precompute A dot r_angular for Lebedev grid + std::vector A_dot_lebedev(angular_grid_num); + for (int ian = 0; ian < angular_grid_num; ++ian) + { + A_dot_lebedev[ian] = A.x * ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian] + + A.y * ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian] + + A.z * ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian]; + } + + // Buffers to reuse + std::vector> result_angular; + std::vector> result_angular_r_commu_x; + std::vector> result_angular_r_commu_y; + std::vector> result_angular_r_commu_z; + std::vector rly1; + std::vector> rly0_all(angular_grid_num); + int index = 0; for (int nb = 0; nb < nproj; nb++) { @@ -128,28 +165,52 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, const double dk_0 = infoNL_.Beta[T0].Proj[nb].getDk(); const double Rcut0 = infoNL_.Beta[T0].Proj[nb].getRcut(); - ModuleBase::Integral::Gauss_Legendre_grid_and_weight(radial0[0], - radial0[mesh_r0 - 1], - ridial_grid_num, - r_ridial.data(), - weights_ridial.data()); + + // OPTIMIZATION: Use precomputed standard Gauss-Legendre grid + double r_min = radial0[0]; + double r_max = radial0[mesh_r0 - 1]; + double xl = (r_max - r_min) * 0.5; + double xmean = (r_max + r_min) * 0.5; + + for(int i=0; i exp_iAR0 = std::exp(ModuleBase::IMAG_UNIT * A_phase); - std::vector rly0(L0); - std::vector rly1(L1); + // Precompute rly0 for all angular points + for(int ian = 0; ian < angular_grid_num; ++ian) { + ModuleBase::Ylm::rl_sph_harm(L0, + ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian], + ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian], + ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian], + rly0_all[ian]); + } + + // Resize buffers + if (result_angular.size() < 2 * L0 + 1) + { + result_angular.resize(2 * L0 + 1); + if (calc_r) + { + result_angular_r_commu_x.resize(2 * L0 + 1); + result_angular_r_commu_y.resize(2 * L0 + 1); + result_angular_r_commu_z.resize(2 * L0 + 1); + } + } + for (int ir = 0; ir < ridial_grid_num; ir++) { - std::vector> result_angular(2 * L0 + 1, 0.0); - std::vector> result_angular_r_commu_x; - std::vector> result_angular_r_commu_y; - std::vector> result_angular_r_commu_z; + // Reset result accumulators + std::fill(result_angular.begin(), result_angular.begin() + (2 * L0 + 1), 0.0); if (calc_r) { - result_angular_r_commu_x.resize(2 * L0 + 1, 0.0); - result_angular_r_commu_y.resize(2 * L0 + 1, 0.0); - result_angular_r_commu_z.resize(2 * L0 + 1, 0.0); + std::fill(result_angular_r_commu_x.begin(), result_angular_r_commu_x.begin() + (2 * L0 + 1), 0.0); + std::fill(result_angular_r_commu_y.begin(), result_angular_r_commu_y.begin() + (2 * L0 + 1), 0.0); + std::fill(result_angular_r_commu_z.begin(), result_angular_r_commu_z.begin() + (2 * L0 + 1), 0.0); } for (int ian = 0; ian < angular_grid_num; ian++) @@ -158,11 +219,13 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, const double y = ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian]; const double z = ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian]; const double weights_angular = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian]; - const ModuleBase::Vector3 r_angular_tmp(x, y, z); - - const ModuleBase::Vector3 r_coor = r_ridial[ir] * r_angular_tmp; + + const double r_val = r_ridial[ir]; + const ModuleBase::Vector3 r_coor(r_val * x, r_val * y, r_val * z); + const ModuleBase::Vector3 tmp_r_coor = r_coor + dRa; const double tmp_r_coor_norm = tmp_r_coor.norm(); + if (tmp_r_coor_norm > Rcut1) { continue; @@ -174,21 +237,40 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, tmp_r_unit = tmp_r_coor / tmp_r_coor_norm; } - ModuleBase::Ylm::rl_sph_harm(L0, x, y, z, rly0); + const std::vector& rly0_vec = rly0_all[ian]; ModuleBase::Ylm::rl_sph_harm(L1, tmp_r_unit.x, tmp_r_unit.y, tmp_r_unit.z, rly1); - const double phase = A * r_coor; + const double phase = r_val * A_dot_lebedev[ian]; const std::complex exp_iAr = std::exp(ModuleBase::IMAG_UNIT * phase); const ModuleBase::Vector3 tmp_r_coor_r_commu = r_coor + R0; - const double interp_v = ModuleBase::PolyInt::Polynomial_Interpolation(psi_1, - mesh_r1, dk_1, tmp_r_coor_norm); + + // OPTIMIZATION: Inline Polynomial Interpolation + double position = tmp_r_coor_norm * inv_dk_1; + int iq = static_cast(position); + double interp_v = 0.0; + + if (iq <= mesh_r1 - 4) + { + const double x0 = position - static_cast(iq); + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + interp_v = x1*x2*(psi_1[iq]*x3+psi_1[iq+3]*x0)/6.0 + + x0*x3*(psi_1[iq+1]*x2-psi_1[iq+2]*x1)/2.0; + } + + const double weight_interp = interp_v * weights_angular; + const int offset_L0 = L0 * L0; + const int offset_L1 = L1 * L1 + m1; + const double rly1_val = rly1[offset_L1]; + + const std::complex common_factor = exp_iAr * rly1_val * weight_interp; for (int m0 = 0; m0 < 2 * L0 + 1; m0++) { - std::complex temp = exp_iAr * rly0[L0 * L0 + m0] * rly1[L1 * L1 + m1] - * interp_v * weights_angular; + std::complex temp = common_factor * rly0_vec[offset_L0 + m0]; result_angular[m0] += temp; if (calc_r) From 3e1fe152d75667c0763bdd62d5ae5b660e1cbf41 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sun, 7 Dec 2025 22:52:50 +0800 Subject: [PATCH 2/3] Refactor snap_psibeta_half_tddft: improved code structure, added comments, and consolidated optimizations --- .../module_rt/snap_psibeta_half_tddft.cpp | 434 ++++++++++-------- 1 file changed, 242 insertions(+), 192 deletions(-) diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp index c479c74a41..9ebc18d257 100644 --- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp +++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp @@ -5,12 +5,89 @@ #include "source_base/math_polyint.h" #include "source_base/timer.h" #include "source_base/ylm.h" +#include +#include +#include namespace module_rt { -// nlm[0] : -// nlm[1, 2, 3,] : , which a = x, y, z. +/** + * @brief Initialize Gauss-Legendre grid points and weights. + * Thread-safe initialization using static local variable. + * + * @param grid_size Number of grid points (140) + * @param gl_x Output: Grid points in [-1, 1] + * @param gl_w Output: Weights + */ +static void init_gauss_legendre_grid(int grid_size, std::vector& gl_x, std::vector& gl_w) +{ + static bool init = false; + // Thread-safe initialization + #pragma omp critical(init_gauss_legendre) + { + if (!init) + { + ModuleBase::Integral::Gauss_Legendre_grid_and_weight(grid_size, gl_x.data(), gl_w.data()); + init = true; + } + } +} + +/** + * @brief Interpolate radial function value at a given distance using cubic interpolation. + * + * @param psi Pointer to radial function array + * @param mesh Number of mesh points + * @param inv_dk Inverse of grid spacing (1/dk) + * @param distance Distance to interpolate at + * @return double Interpolated value + */ +inline double interpolate_radial( + const double* psi, + int mesh, + double inv_dk, + double distance) +{ + double position = distance * inv_dk; + int iq = static_cast(position); + + // Boundary check safe-guard + if (iq >= mesh - 4) return 0.0; + + const double x0 = position - static_cast(iq); + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + + // Cubic interpolation formula + return x1*x2*(psi[iq]*x3+psi[iq+3]*x0)/6.0 + + x0*x3*(psi[iq+1]*x2-psi[iq+2]*x1)/2.0; +} + +/** + * @brief Main function to calculate overlap integrals + * and its derivatives (if calc_r is true). + * + * This function integrates the overlap between a local orbital phi (at R1) + * and a non-local projector beta (at R0), modulated by a plane-wave-like phase factor + * exp^{-iAr}, where A is a vector potential. + * + * @param orb LCAO Orbitals information + * @param infoNL_ Non-local pseudopotential information + * @param nlm Output: + * nlm[0] : + * nlm[1, 2, 3] : , a = x, y, z (if calc_r=true) + * @param R1 Position of atom 1 (orbital phi) + * @param T1 Type of atom 1 + * @param L1 Angular momentum of orbital phi + * @param m1 Magnetic quantum number of orbital phi + * @param N1 Radial quantum number of orbital phi + * @param R0 Position of atom 0 (projector beta) + * @param T0 Type of atom 0 + * @param A Vector potential A (or related field vector) + * @param calc_r Whether to calculate position operator matrix elements + */ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, const InfoNonlocal& infoNL_, std::vector>>& nlm, @@ -19,120 +96,76 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, const int& L1, const int& m1, const int& N1, - const ModuleBase::Vector3& R0, // The projector. + const ModuleBase::Vector3& R0, const int& T0, const ModuleBase::Vector3& A, const bool& calc_r) { ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft"); - // find number of projectors on atom R0 + // 1. Initialization and Early Exits const int nproj = infoNL_.nproj[T0]; - if (nproj == 0) - { - if (calc_r) - { - nlm.resize(4); - } - else - { - nlm.resize(1); - } - return; - } - - std::vector calproj(nproj); - std::vector rmesh1(nproj); - - if (calc_r) - { - nlm.resize(4); - } - else - { - nlm.resize(1); - } - - // Count number of projectors (l,m) + + // Resize output vector based on whether position operator matrix elements are needed + int required_size = calc_r ? 4 : 1; + if (nlm.size() != required_size) nlm.resize(required_size); + + if (nproj == 0) return; + + // 2. Determine total number of projectors and identify active ones based on cutoff int natomwfc = 0; - for (int ip = 0; ip < nproj; ip++) - { - //============================ - // Use pseudo-atomic orbitals - //============================ - - const int L0 = infoNL_.Beta[T0].Proj[ip].getL(); // mohan add 2021-05-07 - natomwfc += 2 * L0 + 1; - } - - for (int dim = 0; dim < nlm.size(); dim++) - { - nlm[dim].resize(natomwfc); - for (auto& x: nlm[dim]) - { - x = 0.0; - } - } - - // rcut of orbtials and projectors - // in our calculation, we always put orbital phi at the left side of - // because = + std::vector calproj(nproj, false); + const double Rcut1 = orb.Phi[T1].getRcut(); const ModuleBase::Vector3 dRa = R0 - R1; - - double distance10 = dRa.norm(); - - bool all_out = true; + const double distance10 = dRa.norm(); + + bool any_active = false; for (int ip = 0; ip < nproj; ip++) { + const int L0 = infoNL_.Beta[T0].Proj[ip].getL(); + natomwfc += 2 * L0 + 1; + const double Rcut0 = infoNL_.Beta[T0].Proj[ip].getRcut(); - if (distance10 > (Rcut1 + Rcut0)) + if (distance10 <= (Rcut1 + Rcut0)) { - calproj[ip] = false; - } - else - { - all_out = false; calproj[ip] = true; + any_active = true; } } - if (all_out) + // Initialize output values to zero and resize inner vectors + for (auto& x : nlm) { + x.assign(natomwfc, 0.0); + } + + if (!any_active) { ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft"); return; } - const int mesh_r1 = orb.Phi[T1].PhiLN(L1, N1).getNr(); - const double* psi_1 = orb.Phi[T1].PhiLN(L1, N1).getPsi(); - const double dk_1 = orb.Phi[T1].PhiLN(L1, N1).getDk(); + // 3. Prepare Orbital Data (Phi) + const auto& phi_ln = orb.Phi[T1].PhiLN(L1, N1); + const int mesh_r1 = phi_ln.getNr(); + const double* psi_1 = phi_ln.getPsi(); + const double dk_1 = phi_ln.getDk(); const double inv_dk_1 = 1.0 / dk_1; - const int ridial_grid_num = 140; + // 4. Prepare Integration Grids + const int radial_grid_num = 140; const int angular_grid_num = 110; - std::vector r_ridial(ridial_grid_num); - std::vector weights_ridial(ridial_grid_num); - - // OPTIMIZATION START: Cache standard Gauss-Legendre grid - static std::vector gl_x(ridial_grid_num); - static std::vector gl_w(ridial_grid_num); - static bool gl_init = false; + + // Cached standard Gauss-Legendre grid + static std::vector gl_x(radial_grid_num); + static std::vector gl_w(radial_grid_num); + init_gauss_legendre_grid(radial_grid_num, gl_x, gl_w); - // Thread-safe initialization - if (!gl_init) - { - #pragma omp critical(init_gauss_legendre) - { - if (!gl_init) - { - ModuleBase::Integral::Gauss_Legendre_grid_and_weight(ridial_grid_num, gl_x.data(), gl_w.data()); - gl_init = true; - } - } - } - // OPTIMIZATION END + // Buffers for mapped radial grid + std::vector r_radial(radial_grid_num); + std::vector w_radial(radial_grid_num); - // Precompute A dot r_angular for Lebedev grid + // Precompute A dot r_angular (A * u_angle) for the Lebedev grid std::vector A_dot_lebedev(angular_grid_num); for (int ian = 0; ian < angular_grid_num; ++ian) { @@ -141,189 +174,206 @@ void snap_psibeta_half_tddft(const LCAO_Orbitals& orb, A.z * ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian]; } - // Buffers to reuse - std::vector> result_angular; - std::vector> result_angular_r_commu_x; - std::vector> result_angular_r_commu_y; - std::vector> result_angular_r_commu_z; - std::vector rly1; - std::vector> rly0_all(angular_grid_num); + // Reuseable buffers for inner loops to avoid allocation + std::vector> result_angular; // Accumulator for angular integration + // Accumulators for position operator components + std::vector> res_ang_x, res_ang_y, res_ang_z; + + std::vector rly1((L1 + 1) * (L1 + 1)); // Spherical harmonics buffer for L1 + std::vector> rly0_cache(angular_grid_num); // Cache for L0 Ylm - int index = 0; + // 5. Loop over Projectors (Beta) + int index_offset = 0; for (int nb = 0; nb < nproj; nb++) { const int L0 = infoNL_.Beta[T0].Proj[nb].getL(); + const int num_m0 = 2 * L0 + 1; + if (!calproj[nb]) { - index += 2 * L0 + 1; + index_offset += num_m0; continue; } - const int mesh_r0 = infoNL_.Beta[T0].Proj[nb].getNr(); - const double* beta_r = infoNL_.Beta[T0].Proj[nb].getBeta_r(); - const double* radial0 = infoNL_.Beta[T0].Proj[nb].getRadial(); - const double dk_0 = infoNL_.Beta[T0].Proj[nb].getDk(); + const auto& proj = infoNL_.Beta[T0].Proj[nb]; + const int mesh_r0 = proj.getNr(); + const double* beta_r = proj.getBeta_r(); + const double* radial0 = proj.getRadial(); + const double dk_0 = proj.getDk(); + const double Rcut0 = proj.getRcut(); - const double Rcut0 = infoNL_.Beta[T0].Proj[nb].getRcut(); - - // OPTIMIZATION: Use precomputed standard Gauss-Legendre grid + // 5.1 Map Gauss-Legendre grid to radial interval [r_min, r_max] double r_min = radial0[0]; double r_max = radial0[mesh_r0 - 1]; double xl = (r_max - r_min) * 0.5; double xmean = (r_max + r_min) * 0.5; - for(int i=0; i exp_iAR0 = std::exp(ModuleBase::IMAG_UNIT * A_phase); - // Precompute rly0 for all angular points + // 5.2 Precompute Spherical Harmonics (Ylm) for L0 on angular grid + // Since L0 changes with projector, we compute this per projector loop. for(int ian = 0; ian < angular_grid_num; ++ian) { ModuleBase::Ylm::rl_sph_harm(L0, ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian], ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian], ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian], - rly0_all[ian]); + rly0_cache[ian]); } - // Resize buffers - if (result_angular.size() < 2 * L0 + 1) + // Resize accumulators if needed + if (result_angular.size() < num_m0) { - result_angular.resize(2 * L0 + 1); + result_angular.resize(num_m0); if (calc_r) { - result_angular_r_commu_x.resize(2 * L0 + 1); - result_angular_r_commu_y.resize(2 * L0 + 1); - result_angular_r_commu_z.resize(2 * L0 + 1); + res_ang_x.resize(num_m0); + res_ang_y.resize(num_m0); + res_ang_z.resize(num_m0); } } - for (int ir = 0; ir < ridial_grid_num; ir++) + // 5.3 Radial Integration Loop + for (int ir = 0; ir < radial_grid_num; ir++) { - // Reset result accumulators - std::fill(result_angular.begin(), result_angular.begin() + (2 * L0 + 1), 0.0); + const double r_val = r_radial[ir]; + + // Reset angular accumulators for this radial shell + std::fill(result_angular.begin(), result_angular.begin() + num_m0, 0.0); if (calc_r) { - std::fill(result_angular_r_commu_x.begin(), result_angular_r_commu_x.begin() + (2 * L0 + 1), 0.0); - std::fill(result_angular_r_commu_y.begin(), result_angular_r_commu_y.begin() + (2 * L0 + 1), 0.0); - std::fill(result_angular_r_commu_z.begin(), result_angular_r_commu_z.begin() + (2 * L0 + 1), 0.0); + std::fill(res_ang_x.begin(), res_ang_x.begin() + num_m0, 0.0); + std::fill(res_ang_y.begin(), res_ang_y.begin() + num_m0, 0.0); + std::fill(res_ang_z.begin(), res_ang_z.begin() + num_m0, 0.0); } + // 5.4 Angular Integration Loop (Lebedev Grid) for (int ian = 0; ian < angular_grid_num; ian++) { const double x = ModuleBase::Integral::Lebedev_Laikov_grid110_x[ian]; const double y = ModuleBase::Integral::Lebedev_Laikov_grid110_y[ian]; const double z = ModuleBase::Integral::Lebedev_Laikov_grid110_z[ian]; - const double weights_angular = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian]; + const double w_ang = ModuleBase::Integral::Lebedev_Laikov_grid110_w[ian]; - const double r_val = r_ridial[ir]; - const ModuleBase::Vector3 r_coor(r_val * x, r_val * y, r_val * z); + // Vector r = r_val * u_angle + double rx = r_val * x; + double ry = r_val * y; + double rz = r_val * z; - const ModuleBase::Vector3 tmp_r_coor = r_coor + dRa; - const double tmp_r_coor_norm = tmp_r_coor.norm(); + // Vector r' = r + R0 - R1 = r + dRa + double tx = rx + dRa.x; + double ty = ry + dRa.y; + double tz = rz + dRa.z; - if (tmp_r_coor_norm > Rcut1) + double tnorm = std::sqrt(tx*tx + ty*ty + tz*tz); + + // If r' is outside the cutoff of Phi(r'), skip + if (tnorm > Rcut1) continue; + + // Compute Ylm for L1 at direction r' + if (tnorm > 1e-10) { - continue; + double inv_tnorm = 1.0 / tnorm; + ModuleBase::Ylm::rl_sph_harm(L1, tx * inv_tnorm, ty * inv_tnorm, tz * inv_tnorm, rly1); } - - ModuleBase::Vector3 tmp_r_unit; - if (tmp_r_coor_norm > 1e-10) + else { - tmp_r_unit = tmp_r_coor / tmp_r_coor_norm; + // At origin, only Y_00 is non-zero (if using real spherical harmonics convention) + ModuleBase::Ylm::rl_sph_harm(L1, 0.0, 0.0, 1.0, rly1); } - const std::vector& rly0_vec = rly0_all[ian]; - - ModuleBase::Ylm::rl_sph_harm(L1, tmp_r_unit.x, tmp_r_unit.y, tmp_r_unit.z, rly1); - + // Calculate common phase and weight factor + // phase = A * r = r_val * (A * u_angle) const double phase = r_val * A_dot_lebedev[ian]; const std::complex exp_iAr = std::exp(ModuleBase::IMAG_UNIT * phase); - const ModuleBase::Vector3 tmp_r_coor_r_commu = r_coor + R0; + // Interpolate Psi at |r'| + double interp_psi = interpolate_radial(psi_1, mesh_r1, inv_dk_1, tnorm); - // OPTIMIZATION: Inline Polynomial Interpolation - double position = tmp_r_coor_norm * inv_dk_1; - int iq = static_cast(position); - double interp_v = 0.0; - - if (iq <= mesh_r1 - 4) - { - const double x0 = position - static_cast(iq); - const double x1 = 1.0 - x0; - const double x2 = 2.0 - x0; - const double x3 = 3.0 - x0; - interp_v = x1*x2*(psi_1[iq]*x3+psi_1[iq+3]*x0)/6.0 - + x0*x3*(psi_1[iq+1]*x2-psi_1[iq+2]*x1)/2.0; - } - - const double weight_interp = interp_v * weights_angular; - const int offset_L0 = L0 * L0; const int offset_L1 = L1 * L1 + m1; - const double rly1_val = rly1[offset_L1]; + const double ylm_L1_val = rly1[offset_L1]; - const std::complex common_factor = exp_iAr * rly1_val * weight_interp; + // Combined factor: exp(iAr) * Y_L1m1(r') * Psi(|r'|) * weight_angle + const std::complex common_factor = exp_iAr * ylm_L1_val * interp_psi * w_ang; - for (int m0 = 0; m0 < 2 * L0 + 1; m0++) + // Retrieve precomputed Y_L0m0(r) + const std::vector& rly0_vec = rly0_cache[ian]; + const int offset_L0 = L0 * L0; + + // Accumulate results for all m0 components + for (int m0 = 0; m0 < num_m0; m0++) { - std::complex temp = common_factor * rly0_vec[offset_L0 + m0]; - result_angular[m0] += temp; + std::complex term = common_factor * rly0_vec[offset_L0 + m0]; + result_angular[m0] += term; if (calc_r) { - result_angular_r_commu_x[m0] += temp * tmp_r_coor_r_commu.x; - result_angular_r_commu_y[m0] += temp * tmp_r_coor_r_commu.y; - result_angular_r_commu_z[m0] += temp * tmp_r_coor_r_commu.z; + // Position operator r_op = r + R0 + // Note: Term involves (r_op)_a * exp(...). + double r_op_x = rx + R0.x; + double r_op_y = ry + R0.y; + double r_op_z = rz + R0.z; + + res_ang_x[m0] += term * r_op_x; + res_ang_y[m0] += term * r_op_y; + res_ang_z[m0] += term * r_op_z; } } - } - - int index_tmp = index; - const double temp = ModuleBase::PolyInt::Polynomial_Interpolation(beta_r, - mesh_r0, dk_0, r_ridial[ir]) * r_ridial[ir] * weights_ridial[ir]; - - if (!calc_r) + } // End Angular Loop + + // 5.5 Combine Radial and Angular parts + // Interpolate Beta(|r|) + // Note: The original code implies beta_r stores values that might need scaling or are just the function values. + // Typically radial integration is \int f(r) r^2 dr. + // Here we have factor: beta_val * r_radial[ir] * w_radial[ir] + // w_radial includes the Jacobian for the change of variable from [-1,1] to [r_min, r_max]. + // The extra r_radial[ir] suggests either beta is stored as r*beta, or we are doing \int ... r dr (2D?), + // or Jacobian r^2 is split. Assuming original logic is correct. + + double beta_val = ModuleBase::PolyInt::Polynomial_Interpolation( + beta_r, mesh_r0, dk_0, r_radial[ir]); + + double radial_factor = beta_val * r_radial[ir] * w_radial[ir]; + + int current_idx = index_offset; + for (int m0 = 0; m0 < num_m0; m0++) { - for (int m0 = 0; m0 < 2 * L0 + 1; m0++) - { - nlm[0][index_tmp] += temp * result_angular[m0] * exp_iAR0; - index_tmp++; - } - } - else - { - for (int m0 = 0; m0 < 2 * L0 + 1; m0++) + // Final accumulation into global nlm array + // Add phase exp(i A * R0) + nlm[0][current_idx] += radial_factor * result_angular[m0] * exp_iAR0; + + if (calc_r) { - nlm[0][index_tmp] += temp * result_angular[m0] * exp_iAR0; - nlm[1][index_tmp] += temp * result_angular_r_commu_x[m0] * exp_iAR0; - nlm[2][index_tmp] += temp * result_angular_r_commu_y[m0] * exp_iAR0; - nlm[3][index_tmp] += temp * result_angular_r_commu_z[m0] * exp_iAR0; - index_tmp++; + nlm[1][current_idx] += radial_factor * res_ang_x[m0] * exp_iAR0; + nlm[2][current_idx] += radial_factor * res_ang_y[m0] * exp_iAR0; + nlm[3][current_idx] += radial_factor * res_ang_z[m0] * exp_iAR0; } + current_idx++; } - } - index += 2 * L0 + 1; - } + } // End Radial Loop + index_offset += num_m0; + } // End Projector Loop + + // 6. Final Conjugation + // Apply conjugation to all elements as per convention = * for(int dim = 0; dim < nlm.size(); dim++) { for (auto &x : nlm[dim]) { - // nlm[0] is - // nlm[1 or 2 or 3] is , a = x, y, z x = std::conj(x); } } - assert(index == natomwfc); + assert(index_offset == natomwfc); ModuleBase::timer::tick("module_rt", "snap_psibeta_half_tddft"); - - return; } -} // namespace module_rt +} // namespace module_rt \ No newline at end of file From 16d8b9f46f8f31a7c3110c682c61d8fb63aabe19 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Mon, 8 Dec 2025 00:28:56 +0800 Subject: [PATCH 3/3] Fix off-by-one error in radial interpolation boundary check --- source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp index 9ebc18d257..f362b58ebd 100644 --- a/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp +++ b/source/source_lcao/module_rt/snap_psibeta_half_tddft.cpp @@ -53,7 +53,7 @@ inline double interpolate_radial( int iq = static_cast(position); // Boundary check safe-guard - if (iq >= mesh - 4) return 0.0; + if (iq > mesh - 4) return 0.0; const double x0 = position - static_cast(iq); const double x1 = 1.0 - x0;