Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
b69e1e9
加入了PPCG的实现
collapsar-z May 15, 2026
2d51b95
fix ppcg and pass tests
May 15, 2026
0ac427a
fix: bugs in diago_ppcg_test.cpp; delete diago_ppcg_simple_test.cpp; …
Roux-sq May 16, 2026
9a1618d
feat: add some methods to faster ppcg algorithm, add tests to compare…
Roux-sq May 20, 2026
2348988
fix: bugs in compare bash
Roux-sq May 20, 2026
e8f3406
remove benchmark dir
Roux-sq May 21, 2026
205516f
remove benchmark dir
Roux-sq May 21, 2026
0182cc8
add annotation to ppcg code, change ppcg code to faster it
Roux-sq May 22, 2026
90ea6f6
Add OpenMP parallelization to bpcg, davidson, dav_subspace kernels
May 22, 2026
d14e173
BPCG: band-level OpenMP parallelization in line_minimize_with_block_o…
May 22, 2026
b519754
BPCG: band-level OpenMP in normalize_op
May 22, 2026
58b3a95
fix some bugs in ppcg and tried to faster the algo
Roux-sq May 22, 2026
fb4d7e2
P0+P1: OpenMP if-guards + consistency unit tests
May 22, 2026
542bb4d
add gpu
collapsar-z May 23, 2026
66f4f85
add gpu
collapsar-z May 23, 2026
f4ecedf
WIP: 本地修改
May 30, 2026
1822b95
made ppcg FASTER
Roux-sq May 31, 2026
7019422
Refactor hsolver orthogonalization kernels
collapsar-z Jun 4, 2026
17e6880
add bench.cpp
Roux-sq Jun 5, 2026
5756596
perf: restore batch gemm and planSchmidtOrth in Davidson
Jun 5, 2026
ff49bd6
try to fix ppcg
Roux-sq Jun 20, 2026
c549a85
Merge branch 'origin/feat/hsolver-orth-refactor-opt' into feat/merge-…
Jun 21, 2026
5732c50
Merge branch 'origin/feat/openmp_opt' into feat/merge-openmp-orth-ppc…
Jun 21, 2026
a61a9a6
Merge branch 'origin/fix/ppcg-v2' into feat/merge-openmp-orth-ppcg-v2
Jun 21, 2026
f3e2e0b
fix: remove duplicate benchmark targets after merging openmp_opt
Jun 21, 2026
2960874
remove md files
Roux-sq Jun 25, 2026
348359f
fix MPI benchmark
Roux-sq Jun 25, 2026
fd4b61e
add more MPI for ppcg
Roux-sq Jun 25, 2026
0eae506
review all the changes, clear redundant part
Roux-sq Jun 25, 2026
467864c
merge all the feats
Roux-sq Jun 25, 2026
30c8e17
Potential fix for pull request finding
Roux-sq Jun 25, 2026
2aeb87f
Merge branch 'develop' into diago_final
Roux-sq Jun 26, 2026
4746593
fix bugs suggested by copilot
Roux-sq Jun 26, 2026
d8a881f
fix memory_recorder.h reference
Roux-sq Jun 26, 2026
d4906c2
改动xj
Jun 26, 2026
d85547e
改动xj
Jun 26, 2026
d27e745
改动xj
Jun 26, 2026
f010a25
fix(hsolver): replace std::vector<bool> with std::vector<int> in Davi…
Jun 26, 2026
5b7357b
fix(hsolver): use local band count in BPCG parallel Cholesky rotation
Jun 26, 2026
c60beec
test(hsolver): relax OpenMP consistency tolerance to 1e-5
Jun 26, 2026
c3c6cfe
test(build): copy test fixtures at configure time instead of install
Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/build*
build_info.h
CMakeFiles/
bin
obj
*.o
Expand Down
12 changes: 6 additions & 6 deletions source/source_base/kernels/math_kernel_op_vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct vector_mul_real_op<T, base_device::DEVICE_CPU>
void operator()(const int dim, T* result, const T* vector, const Real constant)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand All @@ -43,7 +43,7 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
if (add)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand All @@ -53,7 +53,7 @@ struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
else
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand All @@ -70,7 +70,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_CPU>
void operator()(const int& dim, T* result, const T* vector, const Real constant)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand All @@ -86,7 +86,7 @@ struct vector_div_vector_op<T, base_device::DEVICE_CPU>
void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand Down Expand Up @@ -122,7 +122,7 @@ struct vector_add_vector_op<T, base_device::DEVICE_CPU>
const Real constant2)
{
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) if(dim > 256)
#endif
for (int i = 0; i < dim; i++)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ struct synchronize_memory<T, DEVICE_GPU, DEVICE_GPU> {
const T *arr_in,
const size_t& size)
{
CHECK_CUDA(cudaMemcpy(arr_out, arr_in, sizeof(T) * size, cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(arr_out, arr_in, sizeof(T) * size, cudaMemcpyDeviceToDevice));
}
};

Expand Down
2 changes: 1 addition & 1 deletion source/source_base/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
remove_definitions(-D__MPI)
install(DIRECTORY data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
AddTest(
TARGET MODULE_BASE_blas_connector
LIBS parameter ${math_libs} base device
Expand Down
10 changes: 5 additions & 5 deletions source/source_base/test_parallel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ AddTest(
SOURCES parallel_reduce_test.cpp ../global_variable.cpp ../parallel_global.cpp ../parallel_comm.cpp ../parallel_common.cpp ../parallel_reduce.cpp ../tool_quit.cpp ../global_file.cpp ../global_function.cpp ../memory_recorder.cpp ../timer.cpp
)

install(FILES parallel_common_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES parallel_global_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES parallel_reduce_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_common_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_common_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_global_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_global_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_reduce_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_reduce_test.sh COPYONLY)

find_program(BASH bash)
add_test(NAME MODULE_BASE_parallel_common_test
Expand Down Expand Up @@ -57,7 +57,7 @@ AddTest(
LIBS parameter ${math_libs}
)

install(FILES parallel_2d_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_2d_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_2d_test.sh COPYONLY)
find_program(BASH bash)
add_test(NAME MODULE_BASE_parallel_2d_test_para
COMMAND ${BASH} parallel_2d_test.sh
Expand All @@ -77,7 +77,7 @@ add_test(NAME MODULE_BASE_parallel_2d_test_para
LIBS parameter MPI::MPI_CXX ${BLACS_LIB}
SOURCES blacs_connector_test.cpp
)
install(FILES blacs_connector_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blacs_connector_test.sh ${CMAKE_CURRENT_BINARY_DIR}/blacs_connector_test.sh COPYONLY)
add_test(NAME MODULE_BASE_blacs_connector_test
COMMAND ${BASH} blacs_connector_test.sh
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
Expand Down
6 changes: 3 additions & 3 deletions source/source_basis/module_ao/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ list(APPEND depend_files
)

install(DIRECTORY GaAs DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../../../tests)
install(DIRECTORY GaAs DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/GaAs DESTINATION ${CMAKE_CURRENT_BINARY_DIR})


AddTest(
Expand Down Expand Up @@ -83,13 +83,13 @@ AddTest(
LIBS parameter ${math_libs} device base
)

install(FILES parallel_orbitals_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_orbitals_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_orbitals_test.sh COPYONLY)
find_program(BASH bash)
add_test(NAME MODULE_AO_parallel_orbitals_test_para
COMMAND ${BASH} parallel_orbitals_test.sh
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)

install(DIRECTORY lcao_H2O DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/lcao_H2O DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(DIRECTORY lcao_H2O DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../../../tests)

16 changes: 8 additions & 8 deletions source/source_cell/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ remove_definitions(-D__ROCM)
remove_definitions(-D__EXX)

find_program(BASH bash)
install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES bcast_atom_pseudo_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES bcast_atom_spec_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES parallel_kpoints_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES klist_test_para.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES unitcell_test_parallel.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES bcast_read_sep_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES bcast_sep_cell_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bcast_atom_pseudo_test.sh ${CMAKE_CURRENT_BINARY_DIR}/bcast_atom_pseudo_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bcast_atom_spec_test.sh ${CMAKE_CURRENT_BINARY_DIR}/bcast_atom_spec_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/parallel_kpoints_test.sh ${CMAKE_CURRENT_BINARY_DIR}/parallel_kpoints_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/klist_test_para.sh ${CMAKE_CURRENT_BINARY_DIR}/klist_test_para.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/unitcell_test_parallel.sh ${CMAKE_CURRENT_BINARY_DIR}/unitcell_test_parallel.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bcast_read_sep_test.sh ${CMAKE_CURRENT_BINARY_DIR}/bcast_read_sep_test.sh COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bcast_sep_cell_test.sh ${CMAKE_CURRENT_BINARY_DIR}/bcast_sep_cell_test.sh COPYONLY)

list(APPEND cell_simple_srcs
../unitcell.cpp
Expand Down
4 changes: 2 additions & 2 deletions source/source_cell/test_pw/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ remove_definitions(-D__ROCM)
remove_definitions(-D__EXX)
remove_definitions(-D__LCAO)

install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES unitcell_test_pw_para.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/unitcell_test_pw_para.sh ${CMAKE_CURRENT_BINARY_DIR}/unitcell_test_pw_para.sh COPYONLY)

AddTest(
TARGET MODULE_CELL_unitcell_test_pw
Expand Down
2 changes: 1 addition & 1 deletion source/source_esolver/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
remove_definitions(-D__MPI)
remove_definitions(-D__LCAO)

install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

AddTest(
TARGET MODULE_ESOLVER_esolver_dp_test
Expand Down
1 change: 1 addition & 0 deletions source/source_estate/elecstate_print.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ void print_scf_iterinfo(const std::string& ks_solver,
{"scalapack_gvx", "GV"},
{"cusolver", "CU"},
{"bpcg", "BP"},
{"ppcg", "PP"},
{"pexsi", "PE"},
{"cusolvermp", "CM"},
{"sdft", "CT"}}; // CT = Chebyshev Trace, for pure SDFT (nbands=0) where no H diagonalization is performed
Expand Down
2 changes: 1 addition & 1 deletion source/source_estate/module_dm/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ remove_definitions(-D__MLALGO)
remove_definitions(-D__CUDA)
remove_definitions(-D__ROCM)

install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

if(TARGET MODULE_ESTATE_dm_io_test_serial)
remove_definitions(-D__MPI)
Expand Down
2 changes: 1 addition & 1 deletion source/source_estate/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ remove_definitions(-D_OPENMP)

if (ENABLE_MPI)

install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

AddTest(
TARGET MODULE_ESTATE_Elecstate_Op_UTs
Expand Down
2 changes: 1 addition & 1 deletion source/source_hamilt/module_surchem/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
remove_definitions(-D__LCAO )
remove_definitions(-DUSE_LIBXC)
install(DIRECTORY support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/support DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

list(APPEND depend_files

Expand Down
4 changes: 2 additions & 2 deletions source/source_hamilt/module_vdw/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ remove_definitions(-D__MLALGO)
remove_definitions(-D__CUDA)
remove_definitions(-D__ROCM)

install(FILES c6.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
install(FILES r0.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/c6.txt ${CMAKE_CURRENT_BINARY_DIR}/c6.txt COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/r0.txt ${CMAKE_CURRENT_BINARY_DIR}/r0.txt COPYONLY)

AddTest(
TARGET MODULE_HAMILT_vdwTest
Expand Down
1 change: 1 addition & 0 deletions source/source_hsolver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ list(APPEND objects
diago_david.cpp
diago_dav_subspace.cpp
diago_bpcg.cpp
diago_ppcg.cpp
para_linear_transform.cpp
hsolver_pw.cpp
hsolver_lcaopw.cpp
Expand Down
77 changes: 56 additions & 21 deletions source/source_hsolver/diago_bpcg.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
#include "source_hsolver/diago_bpcg.h"
#include "source_hsolver/module_diag/diag_orthogonalizer.h"

#include "diago_iter_assist.h"
#include "source_base/global_function.h"
#include "source_base/kernels/math_kernel_op.h"
#include "source_base/parallel_comm.h" // different MPI worlds
#include "source_hsolver/kernels/bpcg_kernel_op.h"
#include "source_hsolver/module_diag/diago_trace.h"
#include "para_linear_transform.h"

#include <ATen/kernels/blas.h>
#include <ATen/kernels/lapack.h>
#include <ATen/ops/einsum_op.h>
#include <algorithm>
#include <limits>
#include <vector>

namespace hsolver {

Expand Down Expand Up @@ -80,6 +84,9 @@ bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vecto
_err_st = tmp_cpu.data();
syncmem_var_d2h_op()(_err_st, err_in.data<Real>(), this->n_band_l);
}
#ifdef _OPENMP
#pragma omp parallel for schedule(static) reduction(||:not_conv) if(this->n_band_l > 64)
#endif
for (int ii = 0; ii < this->n_band_l; ii++) {
if (_err_st[ii] > ethr_band[ii]) {
not_conv = true;
Expand Down Expand Up @@ -117,20 +124,14 @@ void DiagoBPCG<T, Device>::orth_cholesky(
ct::Tensor& hpsi_out,
ct::Tensor& hsub_out)
{
// gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band)
this->pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, hsub_out.data<T>());

// set hsub matrix to lower format;
ct::kernels::set_matrix<T, ct_Device>()(
'L', hsub_out.data<T>(), this->n_band);

ct::kernels::lapack_potrf<T, ct_Device>()(
'U', this->n_band, hsub_out.data<T>(), this->n_band);
ct::kernels::lapack_trtri<T, ct_Device>()(
'U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);

this->rotate_wf(hsub_out, psi_out, workspace_in);
this->rotate_wf(hsub_out, hpsi_out, workspace_in);
DiagOrthogonalizer<T, Device>(this->n_dim, this->n_basis)
.cholesky_orth_parallel(workspace_in.data<T>(),
psi_out.data<T>(),
hpsi_out.data<T>(),
hsub_out.data<T>(),
this->n_band,
this->pmmcn,
this->plintrans);
}

template<typename T, typename Device>
Expand Down Expand Up @@ -167,13 +168,12 @@ void DiagoBPCG<T, Device>::orth_projection(
ct::Tensor& hsub_in,
ct::Tensor& grad_out)
{
// gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
this->pmmcn.multiply(1.0, psi_in.data<T>(), grad_out.data<T>(), 0.0, hsub_in.data<T>());

// grad_out(n_basis x n_band) = 1.0 * grad_out(n_basis x n_band) - psi_in(n_basis x n_band) * hsub_in(n_band x
// n_band)
this->plintrans.act(-1.0, psi_in.data<T>(), hsub_in.data<T>(), 1.0, grad_out.data<T>());
return;
DiagOrthogonalizer<T, Device>(this->n_dim, this->n_basis)
.project_out_parallel(psi_in.data<T>(),
grad_out.data<T>(),
hsub_in.data<T>(),
this->pmmcn,
this->plintrans);
}

template<typename T, typename Device>
Expand Down Expand Up @@ -265,6 +265,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
const std::vector<double>& ethr_band)
{
const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
DiagoTrace trace("BPCG");
// Get the pointer of the input psi
this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));

Expand Down Expand Up @@ -294,6 +295,40 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
this->calc_grad_with_block(this->prec, this->err_st, this->beta,
this->psi, this->hpsi, this->grad, this->grad_old);

if (trace.enabled())
{
std::vector<Real> err_host(this->n_band_l);
const Real* err_ptr = this->err_st.template data<Real>();
if (this->err_st.device_type() == ct::DeviceType::GpuDevice)
{
syncmem_var_d2h_op()(err_host.data(), this->err_st.template data<Real>(), this->n_band_l);
err_ptr = err_host.data();
}

Real max_residual = Real(0);
Real avg_residual = Real(0);
int n_converged = 0;
for (int ib = 0; ib < this->n_band_l; ++ib)
{
max_residual = std::max(max_residual, err_ptr[ib]);
avg_residual += err_ptr[ib];
if (err_ptr[ib] <= ethr_band[ib])
{
++n_converged;
}
}
if (this->n_band_l > 0)
{
avg_residual /= this->n_band_l;
}
trace.record_iteration(ntry,
this->n_band_l,
max_residual,
avg_residual,
n_converged,
Real(-1));
}

// Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi
// for all 'j less or equal to i'.
// Note: hsub and work are only used to store intermediate variables of gemm operator.
Expand Down
Loading
Loading