From 5fbbef3d39574f4660e6c1f3f9de30c8e8f69fea Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Fri, 22 Jun 2018 06:19:32 -0400 Subject: [PATCH] Extended band parallelization to XtHX, filling overlap matrix, subspace rotation and compute rho. Added relevant ctests. ctests passed --- include/dft.h | 11 +- include/dftUtils.h | 14 +- include/linearAlgebraOperations.h | 6 +- include/linearAlgebraOperationsInternal.h | 2 + include/triangulationManager.h | 16 +- src/dft/density.cc | 419 +++++++++--------- src/dft/dft.cc | 2 + src/dft/initRho.cc | 9 +- src/dft/restart.cc | 3 +- src/eigen/eigen.cc | 143 +++--- src/linAlg/linearAlgebraOperationsInternal.cc | 272 +++++++----- src/linAlg/linearAlgebraOperationsOpt.cc | 7 +- src/linAlg/pseudoGS.cc | 8 +- ...evOrthogonalizedSubspaceIterationSolver.cc | 4 +- .../triangulationManager/restartUtils.cc | 14 +- .../real/hcpMgPrim_03_c.mpirun=12.output | 42 ++ .../real/hcpMgPrim_03_c.mpirun=8.output | 42 ++ .../real/hcpMgPrim_03_c.prm.in | 82 ++++ utils/dftUtils.cc | 21 +- 19 files changed, 725 insertions(+), 392 deletions(-) create mode 100644 tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=12.output create mode 100644 tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=8.output create mode 100644 tests/dft/pseudopotential/real/hcpMgPrim_03_c.prm.in diff --git a/include/dft.h b/include/dft.h index 3ecf3a8f0..68e005a69 100644 --- a/include/dft.h +++ b/include/dft.h @@ -296,12 +296,13 @@ namespace dftfe { void computeNodalRhoFromQuadData(); /** - * sums rho cell quadratrure data from all kpoint pools + * sums rho cell quadratrure data from inter communicator */ - void sumRhoDataKPointPools(std::map > * rhoValues, - std::map > * gradRhoValues, - std::map > * rhoValuesSpinPolarized, - std::map > * gradRhoValuesSpinPolarized); + void sumRhoData(std::map > * rhoValues, + std::map > * gradRhoValues, + std::map > * rhoValuesSpinPolarized, + std::map > * gradRhoValuesSpinPolarized, + const MPI_Comm &interComm); /** * resize and allocate table storage for rho cell quadratrue data diff --git a/include/dftUtils.h b/include/dftUtils.h index 363df83c7..191175708 100644 --- a/include/dftUtils.h +++ b/include/dftUtils.h @@ -53,14 +53,26 @@ namespace dftfe { * * @param dataOut DataOut class object * @param intralpoolcomm mpi communicator of domain decomposition inside each pool - * @param interpoolcomm mpi communicator across pools + * @param interpoolcomm mpi communicator across k point pools + * @param interBandGroupComm mpi communicator across band groups * @param fileName */ void writeDataVTUParallelLowestPoolId(const dealii::DataOut<3> & dataOut, const MPI_Comm & intrapoolcomm, const MPI_Comm & interpoolcomm, + const MPI_Comm &interBandGroupComm, const std::string & fileName); + /** @brief Create index vector which is used for band parallelization + * + * @[in]param interBandGroupComm mpi communicator across band groups + * @[in]param numBands + * @[out]param bandGroupLowHighPlusOneIndices + */ + void createBandParallelizationIndices(const MPI_Comm &interBandGroupComm, + const unsigned int numBands, + std::vector & bandGroupLowHighPlusOneIndices); + /** * A class to split the given communicator into a number of pools */ diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index 29993be4c..b8e33a48d 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -149,12 +149,14 @@ namespace dftfe * @param[in,out] X Given subspace as flattened array of multi-vectors. 
* In-place update of the given subspace * @param[in] numberComponents Number of multiple-fields + * @param[in] interBandGroupComm interpool communicator for parallelization over band groups * * @return flag indicating success/failure. 1 for failure, 0 for success */ template unsigned int pseudoGramSchmidtOrthogonalization(dealii::parallel::distributed::Vector & X, - const unsigned int numberComponents); + const unsigned int numberComponents, + const MPI_Comm &interBandGroupComm); /** @brief Compute Rayleigh-Ritz projection * @@ -176,12 +178,14 @@ namespace dftfe * @param[in,out] X Given subspace as flattened array of multi-vectors. * In-place rotated subspace * @param[in] numberComponents Number of multiple-fields + * @param[in] interBandGroupComm interpool communicator for parallelization over band groups * @param[out] eigenValues of the Projected Hamiltonian */ template void rayleighRitz(operatorDFTClass & operatorMatrix, dealii::parallel::distributed::Vector & X, const unsigned int numberComponents, + const MPI_Comm &interBandGroupComm, std::vector & eigenValues); /** @brief Compute Compute residual norm associated with eigenValue problem of the given operator diff --git a/include/linearAlgebraOperationsInternal.h b/include/linearAlgebraOperationsInternal.h index 93f9f61a2..7c89b8b74 100644 --- a/include/linearAlgebraOperationsInternal.h +++ b/include/linearAlgebraOperationsInternal.h @@ -63,6 +63,7 @@ namespace dftfe void fillParallelOverlapMatrix(const dealii::parallel::distributed::Vector & X, const unsigned int numberVectors, const std::shared_ptr< const dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, dealii::ScaLAPACKMatrix & overlapMatPar); /** @brief Computes X^{T}=Q*X^{T} inplace. X^{T} is the subspaceVectorsArray in the column major @@ -77,6 +78,7 @@ namespace dftfe void subspaceRotation(dealii::parallel::distributed::Vector & subspaceVectorsArray, const unsigned int numberSubspaceVectors, const std::shared_ptr< const dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, const dealii::ScaLAPACKMatrix & rotationMatPar, const bool rotationMatTranspose=false); diff --git a/include/triangulationManager.h b/include/triangulationManager.h index 6a1ddcbbe..f7210032d 100644 --- a/include/triangulationManager.h +++ b/include/triangulationManager.h @@ -157,13 +157,17 @@ namespace dftfe { * @param [input]nComponents number of components of the dofHandler on which solution * vectors are based upon * @param [input]solutionVectors vector of parallel distributed solution vectors to be serialized - * @param [input]interpoolComm interpool communicator to ensure serialization happens only in pool + * @param [input]interpoolComm This communicator is used to ensure serialization + * happens only in k point pool + * @param [input]interBandGroupComm This communicator to ensure serialization happens + * only in band group */ void saveTriangulationsSolutionVectors (const unsigned int feOrder, const unsigned int nComponents, const std::vector< const dealii::parallel::distributed::Vector * > & solutionVectors, - const MPI_Comm & interpoolComm); + const MPI_Comm & interpoolComm, + const MPI_Comm &interBandGroupComm); /** * @brief de-serialize the triangulations and the associated solution vectors @@ -183,11 +187,15 @@ namespace dftfe { * @brief serialize the triangulations and the associated cell quadrature data container * * @param [input]cellQuadDataContainerIn container of input cell quadrature data to be serialized - * 
@param [input]interpoolComm interpool communicator to ensure serialization happens only in pool + * @param [input]interpoolComm This communicator is used to ensure serialization + * happens only in k point pool + * @param [input]interBandGroupComm This communicator to ensure serialization happens + * only in band group */ void saveTriangulationsCellQuadData (const std::vector > *> & cellQuadDataContainerIn, - const MPI_Comm & interpoolComm); + const MPI_Comm & interpoolComm, + const MPI_Comm &interBandGroupComm); /** * @brief de-serialize the triangulations and the associated cell quadrature data container diff --git a/src/dft/density.cc b/src/dft/density.cc index b2af1893c..589ca0406 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -66,9 +66,19 @@ void dftClass::compute_rhoOut() std::vector gradRhoTemp(3*numQuadPoints), gradRhoTempSpinPolarized(6*numQuadPoints),gradRhoOut(3*numQuadPoints), gradRhoOutSpinPolarized(6*numQuadPoints); - const unsigned int eigenVectorsBlockSize=200; - std::vector> eigenVectors((1+dftParameters::spinPolarized)*d_kPointWeights.size()); + //band group parallelization data structures + const unsigned int numberBandGroups= + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(interBandGroupComm, + numEigenValues, + bandGroupLowHighPlusOneIndices); + + const unsigned int eigenVectorsBlockSize=std::min(dftParameters::orthoRRWaveFuncBlockSize, + bandGroupLowHighPlusOneIndices[1]); + std::vector> eigenVectors((1+dftParameters::spinPolarized)*d_kPointWeights.size()); for(unsigned int ivec = 0; ivec < numEigenValues; ivec+=eigenVectorsBlockSize) { const unsigned int currentBlockSize=std::min(eigenVectorsBlockSize,numEigenValues-ivec); @@ -83,68 +93,54 @@ void dftClass::compute_rhoOut() } } - for(unsigned int kPoint = 0; kPoint < (1+dftParameters::spinPolarized)*d_kPointWeights.size(); ++kPoint) + if ((ivec+currentBlockSize)<=bandGroupLowHighPlusOneIndices[2*bandGroupTaskId+1] && + (ivec+currentBlockSize)>bandGroupLowHighPlusOneIndices[2*bandGroupTaskId]) { + + for(unsigned int kPoint = 0; kPoint < (1+dftParameters::spinPolarized)*d_kPointWeights.size(); ++kPoint) + { #ifdef USE_COMPLEX - vectorTools::copyFlattenedDealiiVecToSingleCompVec - (d_eigenVectorsFlattened[kPoint], - numEigenValues, - std::make_pair(ivec,ivec+currentBlockSize), - localProc_dof_indicesReal, - localProc_dof_indicesImag, - eigenVectors[kPoint]); + vectorTools::copyFlattenedDealiiVecToSingleCompVec + (d_eigenVectorsFlattened[kPoint], + numEigenValues, + std::make_pair(ivec,ivec+currentBlockSize), + localProc_dof_indicesReal, + localProc_dof_indicesImag, + eigenVectors[kPoint]); #else - vectorTools::copyFlattenedDealiiVecToSingleCompVec - (d_eigenVectorsFlattened[kPoint], - numEigenValues, - std::make_pair(ivec,ivec+currentBlockSize), - eigenVectors[kPoint]); + vectorTools::copyFlattenedDealiiVecToSingleCompVec + (d_eigenVectorsFlattened[kPoint], + numEigenValues, + std::make_pair(ivec,ivec+currentBlockSize), + eigenVectors[kPoint]); #endif - } + } #ifdef USE_COMPLEX - std::vector > > psiQuads(numQuadPoints*currentBlockSize*numKPoints,zeroTensor1); - std::vector > > psiQuads2(numQuadPoints*currentBlockSize*numKPoints,zeroTensor1); - std::vector > > > gradPsiQuads(numQuadPoints*currentBlockSize*numKPoints,zeroTensor2); - std::vector > > > 
gradPsiQuads2(numQuadPoints*currentBlockSize*numKPoints,zeroTensor2); + std::vector > > psiQuads(numQuadPoints*currentBlockSize*numKPoints,zeroTensor1); + std::vector > > psiQuads2(numQuadPoints*currentBlockSize*numKPoints,zeroTensor1); + std::vector > > > gradPsiQuads(numQuadPoints*currentBlockSize*numKPoints,zeroTensor2); + std::vector > > > gradPsiQuads2(numQuadPoints*currentBlockSize*numKPoints,zeroTensor2); #else - std::vector< VectorizedArray > psiQuads(numQuadPoints*currentBlockSize,make_vectorized_array(0.0)); - std::vector< VectorizedArray > psiQuads2(numQuadPoints*currentBlockSize,make_vectorized_array(0.0)); - std::vector > > gradPsiQuads(numQuadPoints*currentBlockSize,zeroTensor3); - std::vector > > gradPsiQuads2(numQuadPoints*currentBlockSize,zeroTensor3); + std::vector< VectorizedArray > psiQuads(numQuadPoints*currentBlockSize,make_vectorized_array(0.0)); + std::vector< VectorizedArray > psiQuads2(numQuadPoints*currentBlockSize,make_vectorized_array(0.0)); + std::vector > > gradPsiQuads(numQuadPoints*currentBlockSize,zeroTensor3); + std::vector > > gradPsiQuads2(numQuadPoints*currentBlockSize,zeroTensor3); #endif - for (unsigned int cell=0; cell::compute_rhoOut() for (unsigned int q=0; qid(); + if(dftParameters::spinPolarized==1) + { - std::fill(rhoTemp.begin(),rhoTemp.end(),0.0); std::fill(rhoOut.begin(),rhoOut.end(),0.0); + psiEval.read_dof_values_plain + (eigenVectors[(1+dftParameters::spinPolarized)*kPoint+1][iEigenVec]); - if (dftParameters::spinPolarized==1) - std::fill(rhoTempSpinPolarized.begin(),rhoTempSpinPolarized.end(),0.0); + if(dftParameters::xc_id == 4) + psiEval.evaluate(true,true); + else + psiEval.evaluate(true,false); - if(dftParameters::xc_id == 4) - { - std::fill(gradRhoTemp.begin(),gradRhoTemp.end(),0.0); - if (dftParameters::spinPolarized==1) - std::fill(gradRhoTempSpinPolarized.begin(),gradRhoTempSpinPolarized.end(),0.0); - } + for (unsigned int q=0; qid(); - const double partialOccupancy=dftUtils::getPartialOccupancy - (eigenValues[kPoint][ivec+iEigenVec], - fermiEnergy, - C_kb, - dftParameters::TVal); + std::fill(rhoTemp.begin(),rhoTemp.end(),0.0); std::fill(rhoOut.begin(),rhoOut.end(),0.0); - const double partialOccupancy2=dftUtils::getPartialOccupancy - (eigenValues[kPoint][ivec+iEigenVec+dftParameters::spinPolarized*numEigenVectors], - fermiEnergy, - C_kb, - dftParameters::TVal); + if (dftParameters::spinPolarized==1) + std::fill(rhoTempSpinPolarized.begin(),rhoTempSpinPolarized.end(),0.0); - for(unsigned int q=0; q psi, psi2; - psi.reinit(2); psi2.reinit(2); - psi(0)= psiQuads[id][0][iSubCell]; - psi(1)=psiQuads[id][1][iSubCell]; + const double partialOccupancy=dftUtils::getPartialOccupancy + (eigenValues[kPoint][ivec+iEigenVec], + fermiEnergy, + C_kb, + dftParameters::TVal); - if(dftParameters::spinPolarized==1) - { - psi2(0)=psiQuads2[id][0][iSubCell]; - psi2(1)=psiQuads2[id][1][iSubCell]; - } + const double partialOccupancy2=dftUtils::getPartialOccupancy + (eigenValues[kPoint][ivec+iEigenVec+dftParameters::spinPolarized*numEigenVectors], + fermiEnergy, + C_kb, + dftParameters::TVal); - std::vector > gradPsi(2),gradPsi2(2); + for(unsigned int q=0; q psi, psi2; + psi.reinit(2); psi2.reinit(2); - if(dftParameters::xc_id == 4) - for(unsigned int idim=0; idim<3; ++idim) - { - gradPsi[0][idim]=gradPsiQuads[id][0][idim][iSubCell]; - gradPsi[1][idim]=gradPsiQuads[id][1][idim][iSubCell]; - - if(dftParameters::spinPolarized==1) - { - gradPsi2[0][idim]=gradPsiQuads2[id][0][idim][iSubCell]; - gradPsi2[1][idim]=gradPsiQuads2[id][1][idim][iSubCell]; - } - } 
-#else - double psi, psi2; - psi=psiQuads[id][iSubCell]; - if (dftParameters::spinPolarized==1) - psi2=psiQuads2[id][iSubCell]; - - Tensor<1,3,double> gradPsi,gradPsi2; - if(dftParameters::xc_id == 4) - for(unsigned int idim=0; idim<3; ++idim) + psi(0)= psiQuads[id][0][iSubCell]; + psi(1)=psiQuads[id][1][iSubCell]; + + if(dftParameters::spinPolarized==1) { - gradPsi[idim]=gradPsiQuads[id][idim][iSubCell]; - if(dftParameters::spinPolarized==1) - gradPsi2[idim]=gradPsiQuads2[id][idim][iSubCell]; + psi2(0)=psiQuads2[id][0][iSubCell]; + psi2(1)=psiQuads2[id][1][iSubCell]; } -#endif + std::vector > gradPsi(2),gradPsi2(2); -#ifdef USE_COMPLEX - if(dftParameters::spinPolarized==1) - { - rhoTempSpinPolarized[2*q] += partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*psi(0) + psi(1)*psi(1)); - rhoTempSpinPolarized[2*q+1] += partialOccupancy2*d_kPointWeights[kPoint]*(psi2(0)*psi2(0) + psi2(1)*psi2(1)); - // if(dftParameters::xc_id == 4) for(unsigned int idim=0; idim<3; ++idim) { - gradRhoTempSpinPolarized[6*q + idim] += - 2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*gradPsi[0][idim] + psi(1)*gradPsi[1][idim]); - gradRhoTempSpinPolarized[6*q + 3+idim] += - 2.0*partialOccupancy2*d_kPointWeights[kPoint]*(psi2(0)*gradPsi2[0][idim] + psi2(1)*gradPsi2[1][idim]); + gradPsi[0][idim]=gradPsiQuads[id][0][idim][iSubCell]; + gradPsi[1][idim]=gradPsiQuads[id][1][idim][iSubCell]; + + if(dftParameters::spinPolarized==1) + { + gradPsi2[0][idim]=gradPsiQuads2[id][0][idim][iSubCell]; + gradPsi2[1][idim]=gradPsiQuads2[id][1][idim][iSubCell]; + } } - } - else - { - rhoTemp[q] += 2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*psi(0) + psi(1)*psi(1)); - if(dftParameters::xc_id == 4) - for(unsigned int idim=0; idim<3; ++idim) - gradRhoTemp[3*q + idim] += 2.0*2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*gradPsi[0][idim] + psi(1)*gradPsi[1][idim]); - } #else - if(dftParameters::spinPolarized==1) - { - rhoTempSpinPolarized[2*q] += partialOccupancy*psi*psi; - rhoTempSpinPolarized[2*q+1] += partialOccupancy2*psi2*psi2; + double psi, psi2; + psi=psiQuads[id][iSubCell]; + if (dftParameters::spinPolarized==1) + psi2=psiQuads2[id][iSubCell]; + Tensor<1,3,double> gradPsi,gradPsi2; if(dftParameters::xc_id == 4) for(unsigned int idim=0; idim<3; ++idim) { - gradRhoTempSpinPolarized[6*q + idim] += 2.0*partialOccupancy*(psi*gradPsi[idim]); - gradRhoTempSpinPolarized[6*q + 3+idim] += 2.0*partialOccupancy2*(psi2*gradPsi2[idim]); + gradPsi[idim]=gradPsiQuads[id][idim][iSubCell]; + if(dftParameters::spinPolarized==1) + gradPsi2[idim]=gradPsiQuads2[id][idim][iSubCell]; } - } - else - { - rhoTemp[q] += 2.0*partialOccupancy*psi*psi; - if(dftParameters::xc_id == 4) - for(unsigned int idim=0; idim<3; ++idim) - gradRhoTemp[3*q + idim] += 2.0*2.0*partialOccupancy*psi*gradPsi[idim]; - } +#endif + +#ifdef USE_COMPLEX + if(dftParameters::spinPolarized==1) + { + rhoTempSpinPolarized[2*q] += partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*psi(0) + psi(1)*psi(1)); + rhoTempSpinPolarized[2*q+1] += partialOccupancy2*d_kPointWeights[kPoint]*(psi2(0)*psi2(0) + psi2(1)*psi2(1)); + // + if(dftParameters::xc_id == 4) + for(unsigned int idim=0; idim<3; ++idim) + { + gradRhoTempSpinPolarized[6*q + idim] += + 2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*gradPsi[0][idim] + psi(1)*gradPsi[1][idim]); + gradRhoTempSpinPolarized[6*q + 3+idim] += + 2.0*partialOccupancy2*d_kPointWeights[kPoint]*(psi2(0)*gradPsi2[0][idim] + psi2(1)*gradPsi2[1][idim]); + } + } + else + { + rhoTemp[q] += 
2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*psi(0) + psi(1)*psi(1)); + if(dftParameters::xc_id == 4) + for(unsigned int idim=0; idim<3; ++idim) + gradRhoTemp[3*q + idim] += 2.0*2.0*partialOccupancy*d_kPointWeights[kPoint]*(psi(0)*gradPsi[0][idim] + psi(1)*gradPsi[1][idim]); + } +#else + if(dftParameters::spinPolarized==1) + { + rhoTempSpinPolarized[2*q] += partialOccupancy*psi*psi; + rhoTempSpinPolarized[2*q+1] += partialOccupancy2*psi2*psi2; + + if(dftParameters::xc_id == 4) + for(unsigned int idim=0; idim<3; ++idim) + { + gradRhoTempSpinPolarized[6*q + idim] += 2.0*partialOccupancy*(psi*gradPsi[idim]); + gradRhoTempSpinPolarized[6*q + 3+idim] += 2.0*partialOccupancy2*(psi2*gradPsi2[idim]); + } + } + else + { + rhoTemp[q] += 2.0*partialOccupancy*psi*psi; + + if(dftParameters::xc_id == 4) + for(unsigned int idim=0; idim<3; ++idim) + gradRhoTemp[3*q + idim] += 2.0*2.0*partialOccupancy*psi*gradPsi[idim]; + } #endif - }//quad point loop - }//block eigenvectors per k point + }//quad point loop + }//block eigenvectors per k point - for (unsigned int q=0; q::resizeAndAllocateRhoTableStorage } template -void dftClass::sumRhoDataKPointPools(std::map > * rhoValues, - std::map > * gradRhoValues, - std::map > * rhoValuesSpinPolarized, - std::map > * gradRhoValuesSpinPolarized) +void dftClass::sumRhoData(std::map > * rhoValues, + std::map > * gradRhoValues, + std::map > * rhoValuesSpinPolarized, + std::map > * gradRhoValuesSpinPolarized, + const MPI_Comm &interComm) { typename DoFHandler<3>::active_cell_iterator cell = dofHandler.begin_active(), endc = dofHandler.end(); - //gather density from all pools - if (dftParameters::npool>1) + //gather density from inter communicator + if (dealii::Utilities::MPI::n_mpi_processes(interComm)>1) for (; cell!=endc; ++cell) if (cell->is_locally_owned()) { const dealii::CellId cellId=cell->id(); dealii::Utilities::MPI::sum((*rhoValues)[cellId], - interpoolcomm, + interComm, (*rhoValues)[cellId]); if(dftParameters::xc_id == 4) dealii::Utilities::MPI::sum((*gradRhoValues)[cellId], - interpoolcomm, + interComm, (*gradRhoValues)[cellId]); if (dftParameters::spinPolarized==1) { dealii::Utilities::MPI::sum((*rhoValuesSpinPolarized)[cellId], - interpoolcomm, + interComm, (*rhoValuesSpinPolarized)[cellId]); if(dftParameters::xc_id == 4) dealii::Utilities::MPI::sum((*gradRhoValuesSpinPolarized)[cellId], - interpoolcomm, + interComm, (*gradRhoValuesSpinPolarized)[cellId]); } } diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 190cc86f6..bda4a74db 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1144,6 +1144,7 @@ namespace dftfe { dftUtils::writeDataVTUParallelLowestPoolId(data_outEigen, mpi_communicator, interpoolcomm, + interBandGroupComm, std::string("eigen")); // @@ -1171,6 +1172,7 @@ namespace dftfe { dftUtils::writeDataVTUParallelLowestPoolId(dataOutRho, mpi_communicator, interpoolcomm, + interBandGroupComm, std::string("rhoField")); } diff --git a/src/dft/initRho.cc b/src/dft/initRho.cc index c1c6547b2..63be16b1d 100644 --- a/src/dft/initRho.cc +++ b/src/dft/initRho.cc @@ -762,10 +762,11 @@ void dftClass::initRhoFromPreviousGroundStateRho() }//macro cell loop //gather density from all pools - sumRhoDataKPointPools(rhoInValues, - gradRhoInValues, - rhoInValuesSpinPolarized, - gradRhoInValuesSpinPolarized); + sumRhoData(rhoInValues, + gradRhoInValues, + rhoInValuesSpinPolarized, + gradRhoInValuesSpinPolarized, + interpoolcomm); //normalize density normalizeRho(); diff --git a/src/dft/restart.cc b/src/dft/restart.cc index d11df0d56..748f97659 100644 --- 
a/src/dft/restart.cc +++ b/src/dft/restart.cc @@ -63,7 +63,8 @@ void dftClass::saveTriaInfoAndRhoData() } d_mesh.saveTriangulationsCellQuadData(cellQuadDataContainerIn, - interpoolcomm); + interpoolcomm, + interBandGroupComm); //write size of current mixing history into an additional .txt file const std::string extraInfoFileName="rhoDataExtraInfo.chk"; diff --git a/src/eigen/eigen.cc b/src/eigen/eigen.cc index 85299bb0a..e66a7b359 100644 --- a/src/eigen/eigen.cc +++ b/src/eigen/eigen.cc @@ -962,6 +962,14 @@ void eigenClass::computeVEff(const std::mapinterBandGroupComm); + const unsigned int bandGroupTaskId = dealii::Utilities::MPI::this_mpi_process(dftPtr->interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(dftPtr->interBandGroupComm, + numberWaveFunctions, + bandGroupLowHighPlusOneIndices); //X^{T}*H*X is done in a blocked approach for memory optimization: //Sum_{blocks} X^{T}*H*XBlock. The result of each X^{T}*H*XBlock @@ -969,7 +977,8 @@ void eigenClass::computeVEff(const std::map projHamBlock(numberWaveFunctions*vectorsBlockSize,0.0); @@ -985,64 +994,80 @@ void eigenClass::computeVEff(const std::mapis_process_active()) - for (unsigned int j = 0; j ::iterator it= - globalToLocalRowIdMap.find(i); - if (it!=globalToLocalRowIdMap.end()) - projHamPar.local_el(it->second, - localColumnId) - =projHamBlock[j*numberWaveFunctions+i]; - } - } - } + if ((jvec+B)<=bandGroupLowHighPlusOneIndices[2*bandGroupTaskId+1] && + (jvec+B)>bandGroupLowHighPlusOneIndices[2*bandGroupTaskId]) + { + XBlock=0; + //fill XBlock from X: + for(unsigned int iNode = 0; iNodeis_process_active()) + for (unsigned int j = 0; j ::iterator it= + globalToLocalRowIdMap.find(i); + if (it!=globalToLocalRowIdMap.end()) + projHamPar.local_el(it->second, + localColumnId) + =projHamBlock[j*numberWaveFunctions+i]; + } + } + + }//band parallelization + + }//block loop + + //accumulate from all band groups (only one band group has non-zero value for a given entry) + //FIXME: It would be faster to do this all reduce operation in one go over all local elements + //, which might require writing a function inside dealii + if (processGrid->is_process_active() && numberBandGroups>1) + for (unsigned int i = 0; i < projHamPar.local_n(); ++i) + for (unsigned int j = 0; j < projHamPar.local_m(); ++j) + projHamPar.local_el(j,i) + =dealii::Utilities::MPI::sum(projHamPar.local_el(j,i), + dftPtr->interBandGroupComm); #endif } #endif diff --git a/src/linAlg/linearAlgebraOperationsInternal.cc b/src/linAlg/linearAlgebraOperationsInternal.cc index 97cd6749c..ee9777bde 100644 --- a/src/linAlg/linearAlgebraOperationsInternal.cc +++ b/src/linAlg/linearAlgebraOperationsInternal.cc @@ -43,7 +43,7 @@ namespace dftfe const unsigned int blocksPerProc=4; const unsigned int rowProcs=std::min(std::floor(std::sqrt(numberProcs)), std::ceil((double)size/(double)(blocksPerProc*rowsBlockSize))); - if(dftParameters::verbosity>=2) + if(dftParameters::verbosity>=3) { dealii::ConditionalOStream pcout(std::cout, (dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)); pcout<<"Scalapack Matrix created, "<<"rowsBlockSize: "< & X, const unsigned int numberVectors, const std::shared_ptr< const dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, dealii::ScaLAPACKMatrix & overlapMatPar) { #ifdef USE_COMPLEX @@ -90,6 +91,16 @@ namespace dftfe #else const unsigned int numLocalDofs = X.local_size()/numberVectors; + //band group parallelization data structures + const unsigned 
int numberBandGroups= + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(interBandGroupComm, + numberVectors, + bandGroupLowHighPlusOneIndices); + + //get global to local index maps for Scalapack matrix std::map globalToLocalColumnIdMap; std::map globalToLocalRowIdMap; internal::createGlobalToLocalIdMapsScaLAPACKMat(processGrid, @@ -97,7 +108,8 @@ namespace dftfe globalToLocalRowIdMap, globalToLocalColumnIdMap); - const unsigned int vectorsBlockSize=dftParameters::orthoRRWaveFuncBlockSize; + const unsigned int vectorsBlockSize=std::min(dftParameters::orthoRRWaveFuncBlockSize, + bandGroupLowHighPlusOneIndices[1]); std::vector overlapMatrixBlock(numberVectors*vectorsBlockSize,0.0); std::vector blockVectorsMatrix(numLocalDofs*vectorsBlockSize,0.0); @@ -106,50 +118,65 @@ namespace dftfe { // Correct block dimensions if block "goes off edge of" the matrix const unsigned int B = std::min(vectorsBlockSize, numberVectors-ivec); - const char transA = 'N',transB = 'N'; - const T scalarCoeffAlpha = 1.0,scalarCoeffBeta = 0.0; - - std::fill(overlapMatrixBlock.begin(),overlapMatrixBlock.end(),0.); - - for (unsigned int i = 0; i is_process_active()) - for(unsigned int i = 0; i bandGroupLowHighPlusOneIndices[2*bandGroupTaskId]) + { + const char transA = 'N',transB = 'N'; + const T scalarCoeffAlpha = 1.0,scalarCoeffBeta = 0.0; + + std::fill(overlapMatrixBlock.begin(),overlapMatrixBlock.end(),0.); + + for (unsigned int i = 0; i is_process_active()) + for(unsigned int i = 0; i ::iterator it= - globalToLocalRowIdMap.find(j); - if(it!=globalToLocalRowIdMap.end()) - overlapMatPar.local_el(it->second, - localColumnId) - =overlapMatrixBlock[i*numberVectors+j]; + const unsigned int localColumnId=globalToLocalColumnIdMap[i+ivec]; + for (unsigned int j = 0; j ::iterator it= + globalToLocalRowIdMap.find(j); + if(it!=globalToLocalRowIdMap.end()) + overlapMatPar.local_el(it->second, + localColumnId) + =overlapMatrixBlock[i*numberVectors+j]; + } } - } - } + }//band parallelization + }//block loop + + //accumulate from all band groups (only one band group has non-zero value for a given entry) + //FIXME: It would be faster to do this all reduce operation in one go over all local elements + //, which might require writing a function inside dealii + if (processGrid->is_process_active() && numberBandGroups>1) + for (unsigned int i = 0; i < overlapMatPar.local_n(); ++i) + for (unsigned int j = 0; j < overlapMatPar.local_m(); ++j) + overlapMatPar.local_el(j,i) + =dealii::Utilities::MPI::sum(overlapMatPar.local_el(j,i), + interBandGroupComm); #endif } @@ -158,6 +185,7 @@ namespace dftfe void subspaceRotation(dealii::parallel::distributed::Vector & subspaceVectorsArray, const unsigned int numberSubspaceVectors, const std::shared_ptr< const dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, const dealii::ScaLAPACKMatrix & rotationMatPar, const bool rotationMatTranspose) { @@ -169,6 +197,15 @@ namespace dftfe const unsigned int maxNumLocalDofs=dealii::Utilities::MPI::max(numLocalDofs, subspaceVectorsArray.get_mpi_communicator()); + //band group parallelization data structures + const unsigned int numberBandGroups= + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector 
bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(interBandGroupComm, + numberSubspaceVectors, + bandGroupLowHighPlusOneIndices); + std::map globalToLocalColumnIdMap; std::map globalToLocalRowIdMap; internal::createGlobalToLocalIdMapsScaLAPACKMat(processGrid, @@ -177,7 +214,8 @@ namespace dftfe globalToLocalColumnIdMap); - const unsigned int vectorsBlockSize=dftParameters::orthoRRWaveFuncBlockSize; + const unsigned int vectorsBlockSize=std::min(dftParameters::orthoRRWaveFuncBlockSize, + bandGroupLowHighPlusOneIndices[1]); const unsigned int dofsBlockSize=dftParameters::subspaceRotDofsBlockSize; std::vector rotationMatBlock(vectorsBlockSize*numberSubspaceVectors,0.0); @@ -191,88 +229,100 @@ namespace dftfe if (numLocalDofs>=idof) BDof = std::min(dofsBlockSize, numLocalDofs-idof); + std::fill(rotatedVectorsMatBlock.begin(),rotatedVectorsMatBlock.end(),0.); for (unsigned int jvec = 0; jvec < numberSubspaceVectors; jvec += vectorsBlockSize) { // Correct block dimensions if block "goes off edge of" the matrix const unsigned int BVec = std::min(vectorsBlockSize, numberSubspaceVectors-jvec); - const char transA = 'N',transB = 'N'; - const T scalarCoeffAlpha = 1.0,scalarCoeffBeta = 0.0; - std::fill(rotationMatBlock.begin(),rotationMatBlock.end(),0.); - - if (rotationMatTranspose) - { - if (processGrid->is_process_active()) - for (unsigned int i = 0; i ::iterator it= - globalToLocalColumnIdMap.find(j+jvec); - if(it!=globalToLocalColumnIdMap.end()) - rotationMatBlock[i*BVec+j]= - rotationMatPar.local_el(localRowId, - it->second); - } - } - } - else + if ((jvec+BVec)<=bandGroupLowHighPlusOneIndices[2*bandGroupTaskId+1] && + (jvec+BVec)>bandGroupLowHighPlusOneIndices[2*bandGroupTaskId]) { - if (processGrid->is_process_active()) - for (unsigned int i = 0; i is_process_active()) + for (unsigned int i = 0; i ::iterator it= - globalToLocalRowIdMap.find(j+jvec); - if (it!=globalToLocalRowIdMap.end()) - rotationMatBlock[i*BVec+j]= - rotationMatPar.local_el(it->second, - localColumnId); + const unsigned int localRowId=globalToLocalRowIdMap[i]; + for (unsigned int j = 0; j ::iterator it= + globalToLocalColumnIdMap.find(j+jvec); + if(it!=globalToLocalColumnIdMap.end()) + rotationMatBlock[i*BVec+j]= + rotationMatPar.local_el(localRowId, + it->second); + } } - } - } + } + else + { + if (processGrid->is_process_active()) + for (unsigned int i = 0; i ::iterator it= + globalToLocalRowIdMap.find(j+jvec); + if (it!=globalToLocalRowIdMap.end()) + rotationMatBlock[i*BVec+j]= + rotationMatPar.local_el(it->second, + localColumnId); + } + } + } - dealii::Utilities::MPI::sum(rotationMatBlock, - subspaceVectorsArray.get_mpi_communicator(), - rotationMatBlock); - if (BDof!=0) - { + dealii::Utilities::MPI::sum(rotationMatBlock, + subspaceVectorsArray.get_mpi_communicator(), + rotationMatBlock); + if (BDof!=0) + { + + dgemm_(&transA, + &transB, + &BVec, + &BDof, + &numberSubspaceVectors, + &scalarCoeffAlpha, + &rotationMatBlock[0], + &BVec, + subspaceVectorsArray.begin()+idof*numberSubspaceVectors, + &numberSubspaceVectors, + &scalarCoeffBeta, + &rotatedVectorsMatBlockTemp[0], + &BVec); + + for (unsigned int i = 0; i & X, const unsigned int numberVectors, const std::shared_ptr< const dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, dealii::ScaLAPACKMatrix & overlapMatPar); template void subspaceRotation(dealii::parallel::distributed::Vector & subspaceVectorsArray, const unsigned int numberSubspaceVectors, const std::shared_ptr< const 
dealii::Utilities::MPI::ProcessGrid> & processGrid, + const MPI_Comm &interBandGroupComm, const dealii::ScaLAPACKMatrix & rotationMatPar, const bool rotationMatTranpose); diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index 4f7c18008..b9fa52b36 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -466,6 +466,7 @@ namespace dftfe{ void rayleighRitz(operatorDFTClass & operatorMatrix, dealii::parallel::distributed::Vector & X, const unsigned int numberWaveFunctions, + const MPI_Comm &interBandGroupComm, std::vector & eigenValues) { @@ -516,6 +517,7 @@ namespace dftfe{ internal::subspaceRotation(X, numberWaveFunctions, processGrid, + interBandGroupComm, projHamPar, true); computing_timer.exit_section("Blocked subspace rotation, RR step"); @@ -529,6 +531,7 @@ namespace dftfe{ void rayleighRitz(operatorDFTClass & operatorMatrix, dealii::parallel::distributed::Vector & X, const unsigned int numberWaveFunctions, + const MPI_Comm &interBandGroupComm, std::vector & eigenValues) { if (dftParameters::orthoRROMPThreads!=0) @@ -1135,11 +1138,13 @@ namespace dftfe{ const unsigned int); template unsigned int pseudoGramSchmidtOrthogonalization(dealii::parallel::distributed::Vector &, - const unsigned int); + const unsigned int, + const MPI_Comm &); template void rayleighRitz(operatorDFTClass & operatorMatrix, dealii::parallel::distributed::Vector &, const unsigned int numberWaveFunctions, + const MPI_Comm &, std::vector & eigenValues); template void computeEigenResidualNorm(operatorDFTClass & operatorMatrix, diff --git a/src/linAlg/pseudoGS.cc b/src/linAlg/pseudoGS.cc index 30433c2cb..b8c0e72ff 100644 --- a/src/linAlg/pseudoGS.cc +++ b/src/linAlg/pseudoGS.cc @@ -29,7 +29,8 @@ namespace dftfe #if(defined DEAL_II_WITH_SCALAPACK && !USE_COMPLEX) template unsigned int pseudoGramSchmidtOrthogonalization(dealii::parallel::distributed::Vector & X, - const unsigned int numberVectors) + const unsigned int numberVectors, + const MPI_Comm &interBandGroupComm) { if (dftParameters::orthoRROMPThreads!=0) omp_set_num_threads(dftParameters::orthoRROMPThreads); @@ -59,6 +60,7 @@ namespace dftfe internal::fillParallelOverlapMatrix(X, numberVectors, processGrid, + interBandGroupComm, overlapMatPar); computing_timer.exit_section("Fill overlap matrix for PGS"); @@ -135,6 +137,7 @@ namespace dftfe internal::subspaceRotation(X, numberVectors, processGrid, + interBandGroupComm, LMatPar, overlapMatPropertyPostCholesky==dealii::LAPACKSupport::Property::upper_triangular?true:false); @@ -147,7 +150,8 @@ namespace dftfe #else template unsigned int pseudoGramSchmidtOrthogonalization(dealii::parallel::distributed::Vector & X, - const unsigned int numberVectors) + const unsigned int numberVectors, + const MPI_Comm &interBandGroupComm) { AssertThrow(false,dftUtils::ExcNotImplementedYet()); return 0; diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc index cbf3bf51d..1786c0662 100755 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc @@ -382,7 +382,8 @@ namespace dftfe{ computing_timer.enter_section("Pseudo-Gram-Schmidt"); const unsigned int flag=linearAlgebraOperations::pseudoGramSchmidtOrthogonalization (eigenVectorsFlattened, - totalNumberWaveFunctions); + totalNumberWaveFunctions, + interBandGroupComm); if (flag==1) 
{ if(dftParameters::verbosity >= 1) @@ -410,6 +411,7 @@ namespace dftfe{ linearAlgebraOperations::rayleighRitz(operatorMatrix, eigenVectorsFlattened, totalNumberWaveFunctions, + interBandGroupComm, eigenValues); computing_timer.exit_section("Rayleigh-Ritz proj Opt"); diff --git a/src/triangulation/triangulationManager/restartUtils.cc b/src/triangulation/triangulationManager/restartUtils.cc index 9d9c493e9..5a522aa67 100644 --- a/src/triangulation/triangulationManager/restartUtils.cc +++ b/src/triangulation/triangulationManager/restartUtils.cc @@ -115,13 +115,16 @@ namespace dftfe { (const unsigned int feOrder, const unsigned int nComponents, const std::vector< const dealii::parallel::distributed::Vector * > & solutionVectors, - const MPI_Comm & interpoolComm) + const MPI_Comm & interpoolComm, + const MPI_Comm &interBandGroupComm) { const unsigned int poolId=dealii::Utilities::MPI::this_mpi_process(interpoolComm); + const unsigned int bandGroupId=dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); const unsigned int minPoolId=dealii::Utilities::MPI::min(poolId,interpoolComm); + const unsigned int minBandGroupId=dealii::Utilities::MPI::min(bandGroupId,interBandGroupComm); - if (poolId==minPoolId) + if (poolId==minPoolId && bandGroupId==minBandGroupId) { dealii::FESystem<3> FE(dealii::FE_Q<3>(dealii::QGaussLobatto<1>(feOrder+1)), nComponents); //linear shape function DoFHandler<3> dofHandler (d_parallelTriangulationUnmoved); @@ -199,13 +202,16 @@ namespace dftfe { void triangulationManager::saveTriangulationsCellQuadData (const std::vector > *> & cellQuadDataContainerIn, - const MPI_Comm & interpoolComm) + const MPI_Comm & interpoolComm, + const MPI_Comm &interBandGroupComm) { const unsigned int poolId=dealii::Utilities::MPI::this_mpi_process(interpoolComm); + const unsigned int bandGroupId=dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); const unsigned int minPoolId=dealii::Utilities::MPI::min(poolId,interpoolComm); + const unsigned int minBandGroupId=dealii::Utilities::MPI::min(bandGroupId,interBandGroupComm); - if (poolId==minPoolId) + if (poolId==minPoolId && bandGroupId==minBandGroupId) { const unsigned int containerSize=cellQuadDataContainerIn.size(); AssertThrow(containerSize!=0,ExcInternalError()); diff --git a/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=12.output b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=12.output new file mode 100644 index 000000000..90251586e --- /dev/null +++ b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=12.output @@ -0,0 +1,42 @@ +number of atoms: 2 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 20 +number of eigen values: 23 +============================================================================================================================= +-----------Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 5.882191053999999752e+00 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : -2.941095526999999876e+00 5.094126882677563195e+00 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 9.585777736000000715e+00 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 1: 6.666666666666666297e-01 3.333333333333333148e-01 5.000000000000000000e-01 
+----------------------------------------------------------------------------------------- +Number Image Charges 2661 + +Finite element mesh information +------------------------------------------------- +number of elements: 976 +number of degrees of freedom: 32917 +------------------------------------------------- + +Reading initial guess for PSI.... + +Reading initial guess for rho..... + +Pseuodopotential initalization.... + +Starting SCF iteration.... +SCF iteration converged to the specified tolerance after: 15 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -108.88587727 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.000000,0.000000,0.000081 +AtomId 1: 0.000097,0.000056,0.000096 +-------------------------------------------------------------------------------------------- diff --git a/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=8.output b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=8.output new file mode 100644 index 000000000..90251586e --- /dev/null +++ b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.mpirun=8.output @@ -0,0 +1,42 @@ +number of atoms: 2 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 20 +number of eigen values: 23 +============================================================================================================================= +-----------Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 5.882191053999999752e+00 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : -2.941095526999999876e+00 5.094126882677563195e+00 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 9.585777736000000715e+00 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 1: 6.666666666666666297e-01 3.333333333333333148e-01 5.000000000000000000e-01 +----------------------------------------------------------------------------------------- +Number Image Charges 2661 + +Finite element mesh information +------------------------------------------------- +number of elements: 976 +number of degrees of freedom: 32917 +------------------------------------------------- + +Reading initial guess for PSI.... + +Reading initial guess for rho..... + +Pseuodopotential initalization.... + +Starting SCF iteration.... +SCF iteration converged to the specified tolerance after: 15 iterations. 
+ +Energy computations (Hartree) +------------------- + Total energy: -108.88587727 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.000000,0.000000,0.000081 +AtomId 1: 0.000097,0.000056,0.000096 +-------------------------------------------------------------------------------------------- diff --git a/tests/dft/pseudopotential/real/hcpMgPrim_03_c.prm.in b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.prm.in new file mode 100644 index 000000000..d7f5412c1 --- /dev/null +++ b/tests/dft/pseudopotential/real/hcpMgPrim_03_c.prm.in @@ -0,0 +1,82 @@ +set VERBOSITY= 0 +set REPRODUCIBLE OUTPUT=true + +subsection Geometry + set ATOMIC COORDINATES FILE = @SOURCE_DIR@/hcpMgPrim_coordinates.inp + set DOMAIN BOUNDING VECTORS FILE = @SOURCE_DIR@/hcpMgPrim_domainBoundingVectors.inp + + subsection Optimization + set ION FORCE=true + set CELL STRESS=false + end + +end + + +subsection Boundary conditions + set SELF POTENTIAL ATOM BALL RADIUS = 1.6 + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true +end + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 3 + + subsection Auto mesh generation parameters + set BASE MESH SIZE = 1.0 + set ATOM BALL RADIUS = 2.0 + set MESH SIZE ATOM BALL = 0.5 + set MESH SIZE NEAR ATOM = 0.5 + end + +end + + +subsection Brillouin zone k point sampling options + + subsection Monkhorst-Pack (MP) grid generation + set SAMPLING POINTS 1 = 1 + set SAMPLING POINTS 2 = 1 + set SAMPLING POINTS 3 = 1 + set SAMPLING SHIFT 1 = 0.0 + set SAMPLING SHIFT 2 = 0.0 + set SAMPLING SHIFT 3 = 0.0 + end +end + + + +subsection DFT functional related parameters + set PSEUDOPOTENTIAL CALCULATION =true + set PSEUDOPOTENTIAL TYPE = 2 + set EXCHANGE CORRELATION TYPE = 4 +end + +subsection Parallelization + set NPBAND=4 +end + +subsection SCF parameters + set MAXIMUM ITERATIONS = 100 + set TOLERANCE = 1e-7 + set ANDERSON SCHEME MIXING PARAMETER = 0.5 + set ANDERSON SCHEME MIXING HISTORY = 70 + set TEMPERATURE = 500 + set STARTING WFC=ATOMIC + subsection Eigen-solver/Chebyshev solver related parameters + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 23 + set LOWER BOUND WANTED SPECTRUM = -10.0 + set CHEBYSHEV POLYNOMIAL DEGREE = 40 + set CHEBYSHEV FILTER TOLERANCE=5e-3 + set ORTHOGONALIZATION TYPE=PGS + set ENABLE SWITCH TO GS=false + set ORTHO RR WFC BLOCK SIZE=3 + end +end + +subsection Poisson problem parameters + set MAXIMUM ITERATIONS = 10000 + set TOLERANCE = 1e-12 +end diff --git a/utils/dftUtils.cc b/utils/dftUtils.cc index d7f6bd4cc..967ffa6e2 100644 --- a/utils/dftUtils.cc +++ b/utils/dftUtils.cc @@ -70,18 +70,37 @@ namespace dftUtils void writeDataVTUParallelLowestPoolId(const dealii::DataOut<3> & dataOut, const MPI_Comm & intrapoolcomm, const MPI_Comm & interpoolcomm, + const MPI_Comm &interBandGroupComm, const std::string & fileName) { const unsigned int poolId=dealii::Utilities::MPI::this_mpi_process(interpoolcomm); + const unsigned int bandGroupId=dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); const unsigned int minPoolId=dealii::Utilities::MPI::min(poolId,interpoolcomm); + const unsigned int minBandGroupId=dealii::Utilities::MPI::min(bandGroupId,interBandGroupComm); - if (poolId==minPoolId) + if (poolId==minPoolId && bandGroupId==minBandGroupId) { std::string fileNameVTU=fileName+".vtu"; dataOut.write_vtu_in_parallel(fileNameVTU.c_str(),intrapoolcomm); } } + void createBandParallelizationIndices(const MPI_Comm &interBandGroupComm, + const 
unsigned int numBands, + std::vector & bandGroupLowHighPlusOneIndices) + { + bandGroupLowHighPlusOneIndices.clear(); + const unsigned int numberBandGroups= + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int wfcBlockSizeBandGroup=numBands/numberBandGroups; + bandGroupLowHighPlusOneIndices.resize(numberBandGroups*2); + for (unsigned int i=0;i
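
Editorial note (not part of the patch): the tail of createBandParallelizationIndices is truncated above, so the layout of bandGroupLowHighPlusOneIndices is summarized here as a sketch. For each band group i the vector stores the pair {lowest owned band index, highest owned band index + 1}. The handling of a remainder when numBands is not divisible by the number of band groups is an assumption, as is the standalone helper name.

    #include <mpi.h>
    #include <vector>

    // Sketch: build the per-band-group [low, high+1) index pairs.
    void createBandRangesSketch(const MPI_Comm &interBandGroupComm,
                                const unsigned int numBands,
                                std::vector<unsigned int> &bandGroupLowHighPlusOneIndices)
    {
      int numberBandGroups = 1;
      MPI_Comm_size(interBandGroupComm, &numberBandGroups);

      // bands per band group; the remainder is assumed to go to the last group
      const unsigned int wfcBlockSizeBandGroup = numBands / numberBandGroups;

      bandGroupLowHighPlusOneIndices.assign(2 * numberBandGroups, 0);
      for (int i = 0; i < numberBandGroups; ++i)
        {
          bandGroupLowHighPlusOneIndices[2 * i] = i * wfcBlockSizeBandGroup;
          bandGroupLowHighPlusOneIndices[2 * i + 1] =
            (i == numberBandGroups - 1) ? numBands
                                        : (i + 1) * wfcBlockSizeBandGroup;
        }
    }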
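
The same block-ownership pattern is applied uniformly in compute_rhoOut, XtHX, fillParallelOverlapMatrix and subspaceRotation: a wavefunction block [jvec, jvec+B) is processed by exactly one band group, namely the one whose half-open band range contains the block's upper end, while the other band groups skip it. The skeleton below restates that loop in isolation; variable names follow the patch, and blockSize stands for the std::min of ORTHO RR WFC BLOCK SIZE and the bands owned by one group.

    #include <algorithm>
    #include <vector>

    // Sketch: blocked loop with the band-group ownership test from the patch.
    void processOwnedBlocks(const unsigned int numberWaveFunctions,
                            const unsigned int blockSize,
                            const unsigned int bandGroupTaskId,
                            const std::vector<unsigned int> &bandGroupLowHighPlusOneIndices)
    {
      for (unsigned int jvec = 0; jvec < numberWaveFunctions; jvec += blockSize)
        {
          // Correct block dimensions if the block "goes off the edge of" the matrix
          const unsigned int B = std::min(blockSize, numberWaveFunctions - jvec);

          if ((jvec + B) <= bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] &&
              (jvec + B) >  bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
            {
              // ... process wavefunctions [jvec, jvec+B) on this band group only ...
            }
        }
    }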
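
After the blocked loops, every locally owned ScaLAPACK entry of the overlap or projected-Hamiltonian matrix is filled by exactly one band group and is zero on the others, so an element-wise sum over interBandGroupComm reconstructs the full matrix. The helper below is hypothetical; it only illustrates the single in-place MPI_Allreduce over a packed buffer suggested by the FIXME comments in the patch, as opposed to the current per-element dealii::Utilities::MPI::sum calls.

    #include <mpi.h>
    #include <vector>

    // Sketch: sum a packed local_m x local_n block across band groups in one call.
    void sumPackedLocalBlockOverBandGroups(std::vector<double> &packedLocalElements,
                                           const MPI_Comm &interBandGroupComm)
    {
      int numberBandGroups = 1;
      MPI_Comm_size(interBandGroupComm, &numberBandGroups);

      if (numberBandGroups > 1 && !packedLocalElements.empty())
        MPI_Allreduce(MPI_IN_PLACE,
                      packedLocalElements.data(),
                      static_cast<int>(packedLocalElements.size()),
                      MPI_DOUBLE,
                      MPI_SUM,
                      interBandGroupComm);
    }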