Skip to content

Commit f71921f

Browse files
dyzheng and jieli-matrix
authored
Feature: add use_k_continuity method for initializing psi with PW base (#6724)
* Feature: add use_k_continuity method for initializing psi
* Fix: CUDA and ROCm compiling errors
* Fix: kpar error with use_k_continuity
* Fix: test errors in CI

Co-authored-by: Jie Li <76780849+jieli-matrix@users.noreply.github.com>
Co-authored-by: dyzheng <zhengdy@bjaisi.com>
1 parent e98850a commit f71921f

File tree

17 files changed

+1109
-140
lines changed

17 files changed

+1109
-140
lines changed

docs/advanced/input_files/input-main.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [pw\_seed](#pw_seed)
3838
- [pw\_diag\_thr](#pw_diag_thr)
3939
- [diago\_smooth\_ethr](#diago_smooth_ethr)
40+
- [use\_k\_continuity](#use_k_continuity)
4041
- [pw\_diag\_nmax](#pw_diag_nmax)
4142
- [pw\_diag\_ndim](#pw_diag_ndim)
4243
- [erf\_ecut](#erf_ecut)
@@ -774,6 +775,18 @@ These variables are used to control the plane wave related parameters.
774775
- **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (1e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
775776
- **Default**: false
776777

778+
### use_k_continuity
779+
780+
- **Type**: Boolean
781+
- **Availability**: Used only for plane wave basis set.
782+
- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wavefunctions at neighboring k-points by propagating the wavefunction from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
783+
784+
**Important constraints:**
785+
- Must be used together with `diago_smooth_ethr = 1` for optimal performance
786+
787+
This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
788+
- **Default**: false
789+
777790
### pw_diag_nmax
778791

779792
- **Type**: Integer

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
520520
hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
521521
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
522522
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
523-
hsolver::DiagoIterAssist<T, Device>::need_subspace);
523+
hsolver::DiagoIterAssist<T, Device>::need_subspace,
524+
PARAM.inp.use_k_continuity);
524525

525526
hsolver_pw_obj.solve(this->p_hamilt,
526527
this->kspw_psi[0],

source/module_hsolver/diago_dav_subspace.cpp

Lines changed: 62 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -288,27 +288,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
288288
psi_iter + (nbase) * this->dim,
289289
this->dim);
290290

291-
std::vector<Real> e_temp_cpu(nbase, 0);
291+
// Eigenvalues operation section
292+
std::vector<Real> e_temp_cpu(this->notconv, 0);
292293
Real* e_temp_hd = e_temp_cpu.data();
293-
if(this->device == base_device::GpuDevice)
294+
if (this->device == base_device::GpuDevice)
294295
{
295296
e_temp_hd = nullptr;
296297
resmem_real_op()(this->ctx, e_temp_hd, nbase);
297298
}
298-
for (int m = 0; m < notconv; m++)
299+
300+
for (int m = 0; m < this->notconv; m++)
299301
{
300-
e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
301-
if (this->device == base_device::GpuDevice)
302-
{
303-
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
304-
}
305-
vector_mul_vector_op<T, Device>()(this->ctx,
306-
nbase,
307-
vcc + m * this->nbase_x,
308-
vcc + m * this->nbase_x,
309-
e_temp_hd);
302+
e_temp_cpu[m] = -(*eigenvalue_iter)[m];
303+
}
304+
305+
if (this->device == base_device::GpuDevice)
306+
{
307+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
310308
}
311-
if(this->device == base_device::GpuDevice)
309+
310+
apply_eigenvalues_op<T, Device>()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
311+
312+
if (this->device == base_device::GpuDevice)
312313
{
313314
delmem_real_op()(this->ctx, e_temp_hd);
314315
}
@@ -333,54 +334,58 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
333334
psi_iter + nbase * this->dim,
334335
this->dim);
335336

336-
// "precondition!!!"
337-
std::vector<Real> pre(this->dim, 0.0);
338-
for (int m = 0; m < notconv; m++)
339-
{
340-
for (size_t i = 0; i < this->dim; i++)
341-
{
342-
// pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
343-
double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
344-
pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
345-
}
337+
// Precondition section
346338
#if defined(__CUDA) || defined(__ROCM)
347-
if (this->device == base_device::GpuDevice)
348-
{
349-
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
350-
vector_div_vector_op<T, Device>()(this->ctx,
351-
this->dim,
352-
psi_iter + (nbase + m) * this->dim,
353-
psi_iter + (nbase + m) * this->dim,
354-
this->d_precondition);
355-
}
356-
else
339+
if (this->device == base_device::GpuDevice)
340+
{
341+
Real* eigenvalues_gpu = nullptr;
342+
resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
343+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu, (*eigenvalue_iter).data(), notconv);
344+
345+
precondition_op<T, Device>()(this->dim,
346+
psi_iter,
347+
nbase,
348+
notconv,
349+
d_precondition,
350+
eigenvalues_gpu);
351+
delmem_real_op()(this->ctx, eigenvalues_gpu);
352+
}
353+
else
357354
#endif
358-
{
359-
vector_div_vector_op<T, Device>()(this->ctx,
360-
this->dim,
361-
psi_iter + (nbase + m) * this->dim,
362-
psi_iter + (nbase + m) * this->dim,
363-
pre.data());
364-
}
355+
{
356+
precondition_op<T, Device>()(this->dim,
357+
psi_iter,
358+
nbase,
359+
notconv,
360+
this->precondition.data(),
361+
(*eigenvalue_iter).data());
365362
}
366363

367-
// "normalize!!!" in order to improve numerical stability of subspace diagonalization
368-
std::vector<Real> psi_norm(notconv, 0.0);
369-
for (size_t i = 0; i < notconv; i++)
364+
// Normalize section
365+
#if defined(__CUDA) || defined(__ROCM)
366+
if (this->device == base_device::GpuDevice)
367+
{
368+
Real* psi_norm = nullptr;
369+
resmem_real_op()(this->ctx, psi_norm, notconv);
370+
using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
371+
setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
372+
373+
normalize_op<T, Device>()(this->dim,
374+
psi_iter,
375+
nbase,
376+
notconv,
377+
psi_norm);
378+
delmem_real_op()(this->ctx, psi_norm);
379+
}
380+
else
381+
#endif
370382
{
371-
psi_norm[i] = dot_real_op<T, Device>()(this->ctx,
372-
this->dim,
373-
psi_iter + (nbase + i) * this->dim,
374-
psi_iter + (nbase + i) * this->dim,
375-
true);
376-
assert(psi_norm[i] > 0.0);
377-
psi_norm[i] = sqrt(psi_norm[i]);
378-
379-
vector_div_constant_op<T, Device>()(this->ctx,
380-
this->dim,
381-
psi_iter + (nbase + i) * this->dim,
382-
psi_iter + (nbase + i) * this->dim,
383-
psi_norm[i]);
383+
Real* psi_norm = nullptr;
384+
normalize_op<T, Device>()(this->dim,
385+
psi_iter,
386+
nbase,
387+
notconv,
388+
psi_norm);
384389
}
385390

386391
// update hpsi[:, nbase:nbase+notconv]

0 commit comments

Comments
 (0)