Merge branch 'develop' into fix/dsp-kpar

Cstandardlib · web-flow · commit 059b2a86bfd0 · 2025-12-11T18:21:55.000+08:00
diff --git a/source/source_base/module_device/cuda_compat.h b/source/source_base/module_device/cuda_compat.h
@@ -0,0 +1,34 @@
+/**
+ * @file cuda_compat.h
+ * @brief Compatibility layer for CUDA and NVTX headers across different CUDA Toolkit versions.
+ *
+ * This header abstracts the differences in NVTX (NVIDIA Tools Extension) header locations
+ * between CUDA Toolkit versions.
+ *
+ * @note Depends on the CUDA_VERSION macro defined in <cuda.h>. The NVTX header is
+ *       included only when both __CUDA and __USE_NVTX are defined.
+ */
+
+#ifndef CUDA_COMPAT_H_
+#define CUDA_COMPAT_H_
+
+#include <cuda.h> // defines CUDA_VERSION
+
+// NVTX header for CUDA versions prior to 12.9 vs. 12.9+
+// This block ensures the correct NVTX header path is used based on CUDA_VERSION.
+// - For CUDA Toolkit < 12.9, the legacy header "nvToolsExt.h" is included.
+// - For CUDA Toolkit >= 12.9, the modern header "nvtx3/nvToolsExt.h" is included,
+//   because NVTX v2 was removed in CUDA 12.9.
+// This allows NVTX profiling APIs (e.g. nvtxRangePush) to be used consistently
+// across different CUDA versions.
+// See:
+// https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#id4
+#if defined(__CUDA) && defined(__USE_NVTX)
+#if CUDA_VERSION < 12090
+    #include "nvToolsExt.h"
+#else
+    #include "nvtx3/nvToolsExt.h"
+#endif
+#endif
+
+#endif // CUDA_COMPAT_H_
diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp
@@ -15,11 +15,7 @@
 #include "source_base/formatter.h"
 
 #if defined(__CUDA) && defined(__USE_NVTX)
-#if CUDA_VERSION < 12090
-#include "nvToolsExt.h"
-#else
-#include "nvtx3/nvToolsExt.h"
-#endif
+#include "source_base/module_device/cuda_compat.h"
 #include "source_io/module_parameter/parameter.h"
 #endif
 
diff --git a/source/source_estate/module_dm/cal_edm_tddft.cpp b/source/source_estate/module_dm/cal_edm_tddft.cpp
@@ -15,10 +15,10 @@ namespace elecstate
 {
 void print_local_matrix(std::ostream& os,
                         const std::complex<double>* matrix_data,
-                        int local_rows, // pv.nrow
-                        int local_cols, // pv.ncol
-                        const std::string& matrix_name = "",
-                        int rank = -1)
+                        int local_rows,
+                        int local_cols,
+                        const std::string& matrix_name,
+                        int rank)
 {
     if (!matrix_name.empty() || rank >= 0)
     {
@@ -59,6 +59,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv,
                    K_Vectors& kv,
                    hamilt::Hamilt<std::complex<double>>* p_hamilt)
 {
+    ModuleBase::TITLE("elecstate", "cal_edm_tddft");
     ModuleBase::timer::tick("elecstate", "cal_edm_tddft");
 
     const int nlocal = PARAM.globalv.nlocal;
@@ -311,6 +312,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv,
                           K_Vectors& kv,
                           hamilt::Hamilt<std::complex<double>>* p_hamilt)
 {
+    ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor");
     ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor");
 
     const int nlocal = PARAM.globalv.nlocal;
@@ -541,6 +543,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
                                  K_Vectors& kv,
                                  hamilt::Hamilt<std::complex<double>>* p_hamilt)
 {
+    ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor_lapack");
     ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack");
 
     const int nlocal = PARAM.globalv.nlocal;
diff --git a/source/source_estate/module_dm/cal_edm_tddft.h b/source/source_estate/module_dm/cal_edm_tddft.h
@@ -8,6 +8,13 @@
 
 namespace elecstate
 {
+void print_local_matrix(std::ostream& os,
+                        const std::complex<double>* matrix_data,
+                        int local_rows, // pv.nrow
+                        int local_cols, // pv.ncol
+                        const std::string& matrix_name = "",
+                        int rank = -1);
+
 void cal_edm_tddft(Parallel_Orbitals& pv,
                    LCAO_domain::Setup_DM<std::complex<double>>& dmat,
                    K_Vectors& kv,
diff --git a/source/source_hsolver/kernels/cuda/diag_cusolver.cuh b/source/source_hsolver/kernels/cuda/diag_cusolver.cuh
@@ -3,12 +3,6 @@
 #include <cuda.h>
 #include <complex>
 
-#if CUDA_VERSION < 12090
-#include "nvToolsExt.h"
-#else
-#include "nvtx3/nvToolsExt.h"
-#endif
-
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 
@@ -39,7 +33,7 @@ class Diag_Cusolver_gvd{
     double *d_A = nullptr;
     double *d_B = nullptr;
     double *d_work = nullptr;
-    
+
     cuDoubleComplex *d_A2 = nullptr;
     cuDoubleComplex *d_B2 = nullptr;
     cuDoubleComplex *d_work2 = nullptr;
@@ -54,7 +48,7 @@ class Diag_Cusolver_gvd{
 //  - init_double : initializing relevant double type data structures and gpu apis' handle and memory
 //  - init_complex : initializing relevant complex type data structures and gpu apis' handle and memory
 //      Input Parameters
-//          N: the dimension of the matrix 
+//          N: the dimension of the matrix
     void init_double(int N);
     void init_complex(int N);
 
@@ -70,17 +64,17 @@ public:
 //  - Dngvd_double : dense double type matrix
 //  - Dngvd_complex : dense complex type matrix
 //      Input Parameters
-//          N: the number of rows of the matrix 
-//          M: the number of cols of the matrix  
-//          A: the hermitian matrix A in A x=lambda B (column major) 
-//          B: the SPD matrix B in A x=lambda B (column major) 
+//          N: the number of rows of the matrix
+//          M: the number of cols of the matrix
+//          A: the hermitian matrix A in A x=lambda B (column major)
+//          B: the SPD matrix B in A x=lambda B (column major)
 //      Output Parameter
 //          W: generalized eigenvalues
 //          V: generalized eigenvectors (column major)
 
     void Dngvd_double(int N, int M, double *A, double *B, double *W, double *V);
     void Dngvd_complex(int N, int M, std::complex<double> *A, std::complex<double> *B, double *W, std::complex<double> *V);
-    
+
     void Dngvd(int N, int M, double *A, double *B, double *W, double *V)
     {
         return Dngvd_double(N, M, A, B, W, V);
diff --git a/source/source_lcao/module_rt/evolve_elec.cpp b/source/source_lcao/module_rt/evolve_elec.cpp
@@ -46,7 +46,7 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
     {
         phm->updateHk(ik);
 
-        ModuleBase::timer::tick("Efficiency", "evolve_k");
+        ModuleBase::timer::tick("TD_Efficiency", "evolve_k");
         psi->fix_k(ik);
         psi_laststep->fix_k(ik);
 
@@ -70,6 +70,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
         }
         else
         {
+            ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+
             const int len_psi_k_1 = use_lapack ? nband : psi->get_nbands();
             const int len_psi_k_2 = use_lapack ? nlocal : psi->get_nbasis();
             const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc;
@@ -135,6 +137,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                                      len_HS_laststep);
             syncmem_double_h2d_op()(ekb_tensor.data<double>(), &(ekb(ik, 0)), nband);
 
+            ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+
             evolve_psi_tensor<Device>(nband,
                                       nlocal,
                                       &(para_orb),
@@ -149,6 +153,7 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                                       print_matrix,
                                       use_lapack);
 
+            ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
             // Need to distribute global psi back to all processes
             if (use_lapack)
             {
@@ -192,11 +197,14 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                 MPI_Bcast(&(ekb(ik, 0)), nband, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
             }
 #endif
+
+            ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+
             // GlobalV::ofs_running << "Print ekb: " << std::endl;
             // ekb.print(GlobalV::ofs_running);
         }
 
-        ModuleBase::timer::tick("Efficiency", "evolve_k");
+        ModuleBase::timer::tick("TD_Efficiency", "evolve_k");
     } // end k
 
     ModuleBase::timer::tick("Evolve_elec", "solve_psi");
diff --git a/source/source_lcao/module_rt/evolve_psi.cpp b/source/source_lcao/module_rt/evolve_psi.cpp
@@ -30,11 +30,8 @@ void evolve_psi(const int nband,
                 std::ofstream& ofs_running,
                 const int print_matrix)
 {
-    ModuleBase::TITLE("Evolve_psi", "evolve_psi");
-    // ofs_running << " Evolving electronic wave functions begins" << std::endl;
-
+    ModuleBase::TITLE("module_rt", "evolve_psi");
     time_t time_start = time(nullptr);
-    // ofs_running << " Start Time : " << ctime(&time_start);
 
 #ifdef __MPI
 
@@ -112,12 +109,10 @@ void evolve_psi(const int nband,
     delete[] Hold;
     delete[] U_operator;
 
-#endif
+#endif // __MPI
 
     time_t time_end = time(nullptr);
-    ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
-
-    // ofs_running << " Evolving electronic wave functions ends" << std::endl;
+    ModuleBase::GlobalFunc::OUT_TIME("evolve_psi", time_start, time_end);
 
     return;
 }
@@ -137,6 +132,9 @@ void evolve_psi_tensor(const int nband,
                        const int print_matrix,
                        const bool use_lapack)
 {
+    ModuleBase::TITLE("module_rt", "evolve_psi_tensor");
+    time_t time_start = time(nullptr);
+
     // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
     ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
     // ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
@@ -154,17 +152,12 @@ void evolve_psi_tensor(const int nband,
     }
 #endif // __CUDA
 
-    // ofs_running << " evolve_psi_tensor::start " << std::endl;
-
-    ModuleBase::TITLE("Evolve_psi", "evolve_psi");
-    time_t time_start = time(nullptr);
-    // ofs_running << " Start Time : " << ctime(&time_start);
-
 #ifdef __MPI
-
     hamilt::MatrixBlock<std::complex<double>> h_mat, s_mat;
     p_hamilt->matrix(h_mat, s_mat);
 
+    ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+
     // Create Tensor objects for temporary data and sync from host to device
     const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc;
     ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
@@ -198,6 +191,8 @@ void evolve_psi_tensor(const int nband,
         syncmem_complex_h2d_op()(Hold.data<std::complex<double>>(), h_mat.p, len_HS);
     }
 
+    ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
+
     ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
     U_operator.zero();
 
@@ -238,7 +233,7 @@ void evolve_psi_tensor(const int nband,
     /// @brief compute U_operator
     /// @input Stmp, Htmp, print_matrix
     /// @output U_operator
-    Propagator prop(propagator, pv, PARAM.mdp.md_dt);
+    Propagator prop(propagator, pv, PARAM.inp.td_dt);
     prop.compute_propagator_tensor<Device>(nlocal,
                                            Stmp,
                                            Htmp,
@@ -298,14 +293,8 @@ void evolve_psi_tensor(const int nband,
             compute_ekb_tensor_lapack<Device>(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
         }
     }
-
 #endif // __MPI
 
-    time_t time_end = time(nullptr);
-    ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
-
-    // ofs_running << " evolve_psi_tensor::end " << std::endl;
-
 #if ((defined __CUDA) /* || (defined __ROCM) */)
     if (ct_device_type == ct::DeviceType::GpuDevice)
     {
@@ -315,6 +304,9 @@ void evolve_psi_tensor(const int nband,
     }
 #endif // __CUDA
 
+    time_t time_end = time(nullptr);
+    ModuleBase::GlobalFunc::OUT_TIME("evolve_psi", time_start, time_end);
+
     return;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -15,10 +15,10 @@ namespace elecstate`
`15`	`15`	`{`
`16`	`16`	`void print_local_matrix(std::ostream& os,`
`17`	`17`	`const std::complex<double>* matrix_data,`
`18`		`- int local_rows, // pv.nrow`
`19`		`- int local_cols, // pv.ncol`
`20`		`- const std::string& matrix_name = "",`
`21`		`- int rank = -1)`
	`18`	`+ int local_rows,`
	`19`	`+ int local_cols,`
	`20`	`+ const std::string& matrix_name,`
	`21`	`+ int rank)`
`22`	`22`	`{`
`23`	`23`	`if (!matrix_name.empty() \|\| rank >= 0)`
`24`	`24`	`{`
`@@ -59,6 +59,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv,`
`59`	`59`	`K_Vectors& kv,`
`60`	`60`	`hamilt::Hamilt<std::complex<double>>* p_hamilt)`
`61`	`61`	`{`
	`62`	`+ ModuleBase::TITLE("elecstate", "cal_edm_tddft");`
`62`	`63`	`ModuleBase::timer::tick("elecstate", "cal_edm_tddft");`
`63`	`64`
`64`	`65`	`const int nlocal = PARAM.globalv.nlocal;`
`@@ -311,6 +312,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv,`
`311`	`312`	`K_Vectors& kv,`
`312`	`313`	`hamilt::Hamilt<std::complex<double>>* p_hamilt)`
`313`	`314`	`{`
	`315`	`+ ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor");`
`314`	`316`	`ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor");`
`315`	`317`
`316`	`318`	`const int nlocal = PARAM.globalv.nlocal;`
`@@ -541,6 +543,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,`
`541`	`543`	`K_Vectors& kv,`
`542`	`544`	`hamilt::Hamilt<std::complex<double>>* p_hamilt)`
`543`	`545`	`{`
	`546`	`+ ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor_lapack");`
`544`	`547`	`ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack");`
`545`	`548`
`546`	`549`	`const int nlocal = PARAM.globalv.nlocal;`