Skip to content

Commit 059b2a8

Browse files
authored
Merge branch 'develop' into fix/dsp-kpar
2 parents fba412d + 32ef0ed commit 059b2a8

File tree

7 files changed

+80
-46
lines changed

7 files changed

+80
-46
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/**
2+
* @file cuda_compat.h
3+
* @brief Compatibility layer for CUDA and NVTX headers across different CUDA Toolkit versions.
4+
*
5+
* This header abstracts the differences in NVTX (NVIDIA Tools Extension) header locations
6+
* between CUDA Toolkit versions.
7+
*
8+
* @note Depends on the CUDA_VERSION macro defined in <cuda.h>.
9+
*
10+
*/
11+
12+
#ifndef CUDA_COMPAT_H_
13+
#define CUDA_COMPAT_H_
14+
15+
#include <cuda.h> // defines CUDA_VERSION
16+
17+
// NVTX header for CUDA versions prior to 12.9 vs. 12.9+
18+
// This block ensures the correct NVTX header path is used based on CUDA_VERSION.
19+
// - For CUDA Toolkit < 12.9, the legacy header "nvToolsExt.h" is included.
20+
// - For CUDA Toolkit >= 12.9, the modern header "nvtx3/nvToolsExt.h" is included,
21+
// because NVTX v2 was removed in CUDA Toolkit 12.9.
22+
// This allows NVTX profiling APIs (e.g. nvtxRangePush) to be used consistently
23+
// across different CUDA versions.
24+
// See:
25+
// https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#id4
26+
#if defined(__CUDA) && defined(__USE_NVTX)
27+
#if CUDA_VERSION < 12090
28+
#include "nvToolsExt.h"
29+
#else
30+
#include "nvtx3/nvToolsExt.h"
31+
#endif
32+
#endif
33+
34+
#endif // CUDA_COMPAT_H_

source/source_base/timer.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,7 @@
1515
#include "source_base/formatter.h"
1616

1717
#if defined(__CUDA) && defined(__USE_NVTX)
18-
#if CUDA_VERSION < 12090
19-
#include "nvToolsExt.h"
20-
#else
21-
#include "nvtx3/nvToolsExt.h"
22-
#endif
18+
#include "source_base/module_device/cuda_compat.h"
2319
#include "source_io/module_parameter/parameter.h"
2420
#endif
2521

source/source_estate/module_dm/cal_edm_tddft.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ namespace elecstate
1515
{
1616
void print_local_matrix(std::ostream& os,
1717
const std::complex<double>* matrix_data,
18-
int local_rows, // pv.nrow
19-
int local_cols, // pv.ncol
20-
const std::string& matrix_name = "",
21-
int rank = -1)
18+
int local_rows,
19+
int local_cols,
20+
const std::string& matrix_name,
21+
int rank)
2222
{
2323
if (!matrix_name.empty() || rank >= 0)
2424
{
@@ -59,6 +59,7 @@ void cal_edm_tddft(Parallel_Orbitals& pv,
5959
K_Vectors& kv,
6060
hamilt::Hamilt<std::complex<double>>* p_hamilt)
6161
{
62+
ModuleBase::TITLE("elecstate", "cal_edm_tddft");
6263
ModuleBase::timer::tick("elecstate", "cal_edm_tddft");
6364

6465
const int nlocal = PARAM.globalv.nlocal;
@@ -311,6 +312,7 @@ void cal_edm_tddft_tensor(Parallel_Orbitals& pv,
311312
K_Vectors& kv,
312313
hamilt::Hamilt<std::complex<double>>* p_hamilt)
313314
{
315+
ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor");
314316
ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor");
315317

316318
const int nlocal = PARAM.globalv.nlocal;
@@ -541,6 +543,7 @@ void cal_edm_tddft_tensor_lapack(Parallel_Orbitals& pv,
541543
K_Vectors& kv,
542544
hamilt::Hamilt<std::complex<double>>* p_hamilt)
543545
{
546+
ModuleBase::TITLE("elecstate", "cal_edm_tddft_tensor_lapack");
544547
ModuleBase::timer::tick("elecstate", "cal_edm_tddft_tensor_lapack");
545548

546549
const int nlocal = PARAM.globalv.nlocal;

source/source_estate/module_dm/cal_edm_tddft.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@
88

99
namespace elecstate
1010
{
11+
void print_local_matrix(std::ostream& os,
12+
const std::complex<double>* matrix_data,
13+
int local_rows, // pv.nrow
14+
int local_cols, // pv.ncol
15+
const std::string& matrix_name = "",
16+
int rank = -1);
17+
1118
void cal_edm_tddft(Parallel_Orbitals& pv,
1219
LCAO_domain::Setup_DM<std::complex<double>>& dmat,
1320
K_Vectors& kv,

source/source_hsolver/kernels/cuda/diag_cusolver.cuh

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,6 @@
33
#include <cuda.h>
44
#include <complex>
55

6-
#if CUDA_VERSION < 12090
7-
#include "nvToolsExt.h"
8-
#else
9-
#include "nvtx3/nvToolsExt.h"
10-
#endif
11-
126
#include <cuda_runtime.h>
137
#include <cusolverDn.h>
148

@@ -39,7 +33,7 @@ class Diag_Cusolver_gvd{
3933
double *d_A = nullptr;
4034
double *d_B = nullptr;
4135
double *d_work = nullptr;
42-
36+
4337
cuDoubleComplex *d_A2 = nullptr;
4438
cuDoubleComplex *d_B2 = nullptr;
4539
cuDoubleComplex *d_work2 = nullptr;
@@ -54,7 +48,7 @@ class Diag_Cusolver_gvd{
5448
// - init_double : initializes the double-type data structures, plus the GPU API handles and memory
5549
// - init_complex : initializes the complex-type data structures, plus the GPU API handles and memory
5650
// Input Parameters
57-
// N: the dimension of the matrix
51+
// N: the dimension of the matrix
5852
void init_double(int N);
5953
void init_complex(int N);
6054

@@ -70,17 +64,17 @@ public:
7064
// - Dngvd_double : dense double type matrix
7165
// - Dngvd_complex : dense complex type matrix
7266
// Input Parameters
73-
// N: the number of rows of the matrix
74-
// M: the number of cols of the matrix
75-
// A: the hermitian matrix A in A x=lambda B (column major)
76-
// B: the SPD matrix B in A x=lambda B (column major)
67+
// N: the number of rows of the matrix
68+
// M: the number of cols of the matrix
69+
// A: the Hermitian matrix A in A x = lambda B x (column major)
70+
// B: the SPD matrix B in A x = lambda B x (column major)
7771
// Output Parameter
7872
// W: generalized eigenvalues
7973
// V: generalized eigenvectors (column major)
8074

8175
void Dngvd_double(int N, int M, double *A, double *B, double *W, double *V);
8276
void Dngvd_complex(int N, int M, std::complex<double> *A, std::complex<double> *B, double *W, std::complex<double> *V);
83-
77+
8478
void Dngvd(int N, int M, double *A, double *B, double *W, double *V)
8579
{
8680
return Dngvd_double(N, M, A, B, W, V);

source/source_lcao/module_rt/evolve_elec.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
4646
{
4747
phm->updateHk(ik);
4848

49-
ModuleBase::timer::tick("Efficiency", "evolve_k");
49+
ModuleBase::timer::tick("TD_Efficiency", "evolve_k");
5050
psi->fix_k(ik);
5151
psi_laststep->fix_k(ik);
5252

@@ -70,6 +70,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
7070
}
7171
else
7272
{
73+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
74+
7375
const int len_psi_k_1 = use_lapack ? nband : psi->get_nbands();
7476
const int len_psi_k_2 = use_lapack ? nlocal : psi->get_nbasis();
7577
const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc;
@@ -135,6 +137,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
135137
len_HS_laststep);
136138
syncmem_double_h2d_op()(ekb_tensor.data<double>(), &(ekb(ik, 0)), nband);
137139

140+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
141+
138142
evolve_psi_tensor<Device>(nband,
139143
nlocal,
140144
&(para_orb),
@@ -149,6 +153,7 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
149153
print_matrix,
150154
use_lapack);
151155

156+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
152157
// Need to distribute global psi back to all processes
153158
if (use_lapack)
154159
{
@@ -192,11 +197,14 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
192197
MPI_Bcast(&(ekb(ik, 0)), nband, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
193198
}
194199
#endif
200+
201+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
202+
195203
// GlobalV::ofs_running << "Print ekb: " << std::endl;
196204
// ekb.print(GlobalV::ofs_running);
197205
}
198206

199-
ModuleBase::timer::tick("Efficiency", "evolve_k");
207+
ModuleBase::timer::tick("TD_Efficiency", "evolve_k");
200208
} // end k
201209

202210
ModuleBase::timer::tick("Evolve_elec", "solve_psi");

source/source_lcao/module_rt/evolve_psi.cpp

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,8 @@ void evolve_psi(const int nband,
3030
std::ofstream& ofs_running,
3131
const int print_matrix)
3232
{
33-
ModuleBase::TITLE("Evolve_psi", "evolve_psi");
34-
// ofs_running << " Evolving electronic wave functions begins" << std::endl;
35-
33+
ModuleBase::TITLE("module_rt", "evolve_psi");
3634
time_t time_start = time(nullptr);
37-
// ofs_running << " Start Time : " << ctime(&time_start);
3835

3936
#ifdef __MPI
4037

@@ -112,12 +109,10 @@ void evolve_psi(const int nband,
112109
delete[] Hold;
113110
delete[] U_operator;
114111

115-
#endif
112+
#endif // __MPI
116113

117114
time_t time_end = time(nullptr);
118-
ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
119-
120-
// ofs_running << " Evolving electronic wave functions ends" << std::endl;
115+
ModuleBase::GlobalFunc::OUT_TIME("evolve_psi", time_start, time_end);
121116

122117
return;
123118
}
@@ -137,6 +132,9 @@ void evolve_psi_tensor(const int nband,
137132
const int print_matrix,
138133
const bool use_lapack)
139134
{
135+
ModuleBase::TITLE("module_rt", "evolve_psi_tensor");
136+
time_t time_start = time(nullptr);
137+
140138
// ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
141139
ct::DeviceType ct_device_type = ct::DeviceTypeToEnum<Device>::value;
142140
// ct_Device = ct::DEVICE_CPU or ct::DEVICE_GPU
@@ -154,17 +152,12 @@ void evolve_psi_tensor(const int nband,
154152
}
155153
#endif // __CUDA
156154

157-
// ofs_running << " evolve_psi_tensor::start " << std::endl;
158-
159-
ModuleBase::TITLE("Evolve_psi", "evolve_psi");
160-
time_t time_start = time(nullptr);
161-
// ofs_running << " Start Time : " << ctime(&time_start);
162-
163155
#ifdef __MPI
164-
165156
hamilt::MatrixBlock<std::complex<double>> h_mat, s_mat;
166157
p_hamilt->matrix(h_mat, s_mat);
167158

159+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
160+
168161
// Create Tensor objects for temporary data and sync from host to device
169162
const int len_HS = use_lapack ? nlocal * nlocal : pv->nloc;
170163
ct::Tensor Stmp(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
@@ -198,6 +191,8 @@ void evolve_psi_tensor(const int nband,
198191
syncmem_complex_h2d_op()(Hold.data<std::complex<double>>(), h_mat.p, len_HS);
199192
}
200193

194+
ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
195+
201196
ct::Tensor U_operator(ct::DataType::DT_COMPLEX_DOUBLE, ct_device_type, ct::TensorShape({len_HS}));
202197
U_operator.zero();
203198

@@ -238,7 +233,7 @@ void evolve_psi_tensor(const int nband,
238233
/// @brief compute U_operator
239234
/// @input Stmp, Htmp, print_matrix
240235
/// @output U_operator
241-
Propagator prop(propagator, pv, PARAM.mdp.md_dt);
236+
Propagator prop(propagator, pv, PARAM.inp.td_dt);
242237
prop.compute_propagator_tensor<Device>(nlocal,
243238
Stmp,
244239
Htmp,
@@ -298,14 +293,8 @@ void evolve_psi_tensor(const int nband,
298293
compute_ekb_tensor_lapack<Device>(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
299294
}
300295
}
301-
302296
#endif // __MPI
303297

304-
time_t time_end = time(nullptr);
305-
ModuleBase::GlobalFunc::OUT_TIME("evolve(std::complex)", time_start, time_end);
306-
307-
// ofs_running << " evolve_psi_tensor::end " << std::endl;
308-
309298
#if ((defined __CUDA) /* || (defined __ROCM) */)
310299
if (ct_device_type == ct::DeviceType::GpuDevice)
311300
{
@@ -315,6 +304,9 @@ void evolve_psi_tensor(const int nband,
315304
}
316305
#endif // __CUDA
317306

307+
time_t time_end = time(nullptr);
308+
ModuleBase::GlobalFunc::OUT_TIME("evolve_psi", time_start, time_end);
309+
318310
return;
319311
}
320312

0 commit comments

Comments
 (0)