diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp index 592160b77121..15d47a420058 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_decl.hpp @@ -163,7 +163,7 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix local_ordinal_type, global_ordinal_type, node_type> crs_matrix_type; - + using crs_graph_type = Tpetra::CrsGraph; @@ -175,11 +175,16 @@ class RBILUK : virtual public Ifpack2::RILUK< Tpetra::RowMatrix< typename Matrix template friend class RBILUK; + typedef typename block_crs_matrix_type::nonconst_global_inds_host_view_type nonconst_global_inds_host_view_type; + typedef typename block_crs_matrix_type::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type; + typedef typename block_crs_matrix_type::nonconst_values_host_view_type nonconst_values_host_view_type; + //@} //! \name Implementation of KK ILU(k). //@{ - - typedef typename crs_matrix_type::local_matrix_device_type local_matrix_device_type; + + typedef typename block_crs_matrix_type::local_matrix_device_type local_matrix_device_type; + typedef typename block_crs_matrix_type::local_matrix_host_type local_matrix_host_type; typedef typename local_matrix_device_type::StaticCrsGraphType::row_map_type lno_row_view_t; typedef typename local_matrix_device_type::StaticCrsGraphType::entries_type lno_nonzero_view_t; typedef typename local_matrix_device_type::values_type scalar_nonzero_view_t; diff --git a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp index 7398d77e1a57..fcc5a62932cc 100644 --- a/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp @@ -45,13 +45,16 @@ #include "Tpetra_BlockMultiVector.hpp" #include "Tpetra_BlockView.hpp" +#include "Tpetra_BlockCrsMatrix_Helpers_decl.hpp" #include "Ifpack2_OverlappingRowMatrix.hpp" +#include "Ifpack2_Details_getCrsMatrix.hpp" #include "Ifpack2_LocalFilter.hpp" #include "Ifpack2_Utilities.hpp" #include "Ifpack2_RILUK.hpp" +#include "KokkosSparse_trsv.hpp" //#define IFPACK2_RBILUK_INITIAL -#define IFPACK2_RBILUK_INITIAL_NOKK +//#define IFPACK2_RBILUK_INITIAL_NOKK #ifndef IFPACK2_RBILUK_INITIAL_NOKK #include "KokkosBatched_Gemm_Decl.hpp" @@ -69,7 +72,7 @@ template struct LocalRowHandler { using LocalOrdinal = typename MatrixType::local_ordinal_type; - using row_matrix_type = Tpetra::RowMatrix< + using row_matrix_type = Tpetra::RowMatrix< typename MatrixType::scalar_type, LocalOrdinal, typename MatrixType::global_ordinal_type, @@ -96,7 +99,7 @@ struct LocalRowHandler A_->getLocalRowView(local_row,InI,InV); NumIn = (LocalOrdinal)InI.size(); } - else + else { size_t cnt; A_->getLocalRowCopy(local_row,ind_nc_,val_nc_,cnt); @@ -266,7 +269,7 @@ makeLocalFilter (const Teuchos::RCP::row_matri template Teuchos::RCP::crs_graph_type> -getBlockCrsGraph(const Teuchos::RCP::row_matrix_type>& A,const typename MatrixType::local_ordinal_type blockSize) +getBlockCrsGraph(const Teuchos::RCP::row_matrix_type>& A) { using local_ordinal_type = typename MatrixType::local_ordinal_type; using Teuchos::RCP; @@ -289,10 +292,10 @@ getBlockCrsGraph(const Teuchos::RCP::row_matri { overlappedA = rcp_dynamic_cast > (filteredA->getUnderlyingMatrix ()); } - + if (! overlappedA.is_null ()) { A_block = rcp_dynamic_cast(overlappedA->getUnderlyingMatrix()); - } + } else if (!filteredA.is_null ()){ //If there is no overlap, filteredA could be the block CRS matrix A_block = rcp_dynamic_cast(filteredA->getUnderlyingMatrix()); @@ -360,6 +363,7 @@ void RBILUK::initialize () "underlying graph is fill complete."); blockSize_ = this->A_->getBlockSize(); + this->A_local_ = makeLocalFilter(this->A_); Teuchos::Time timer ("RBILUK::initialize"); double startTime = timer.wallTime(); @@ -378,12 +382,25 @@ void RBILUK::initialize () this->isComputed_ = false; this->Graph_ = Teuchos::null; - RCP matrixCrsGraph = getBlockCrsGraph(this->A_,blockSize_); + RCP matrixCrsGraph = getBlockCrsGraph(this->A_); this->Graph_ = rcp (new Ifpack2::IlukGraph (matrixCrsGraph, this->LevelOfFill_, 0)); - this->Graph_->initialize (); + if (this->isKokkosKernelsSpiluk_) { + this->KernelHandle_ = Teuchos::rcp (new kk_handle_type ()); + KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, + this->A_local_->getLocalNumRows(), + 2*this->A_local_->getLocalNumEntries()*(this->LevelOfFill_+1), + 2*this->A_local_->getLocalNumEntries()*(this->LevelOfFill_+1), + blockSize_); + this->Graph_->initialize(KernelHandle_); // this calls spiluk_symbolic + } + else { + this->Graph_->initialize (); + } + allocate_L_and_U_blocks (); + #ifdef IFPACK2_RBILUK_INITIAL initAllValues (); #endif @@ -577,6 +594,10 @@ struct GetManagedView { template void RBILUK::compute () { + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::Array; + typedef impl_scalar_type IST; const char prefix[] = "Ifpack2::Experimental::RBILUK::compute: "; @@ -627,205 +648,205 @@ void RBILUK::compute () // const scalar_type MinDiagonalValue = STS::rmin (); // const scalar_type MaxDiagonalValue = STS::one () / MinDiagonalValue; - initAllValues (); - - size_t NumIn; - LO NumL, NumU, NumURead; + if (!this->isKokkosKernelsSpiluk_) { + initAllValues (); + size_t NumIn; + LO NumL, NumU, NumURead; - // Get Maximum Row length - const size_t MaxNumEntries = - L_block_->getLocalMaxNumRowEntries () + U_block_->getLocalMaxNumRowEntries () + 1; + // Get Maximum Row length + const size_t MaxNumEntries = + L_block_->getLocalMaxNumRowEntries () + U_block_->getLocalMaxNumRowEntries () + 1; - const LO blockMatSize = blockSize_*blockSize_; + const LO blockMatSize = blockSize_*blockSize_; - // FIXME (mfh 08 Nov 2015, 24 May 2016) We need to move away from - // expressing these strides explicitly, in order to finish #177 - // (complete Kokkos-ization of BlockCrsMatrix) thoroughly. - const LO rowStride = blockSize_; + // FIXME (mfh 08 Nov 2015, 24 May 2016) We need to move away from + // expressing these strides explicitly, in order to finish #177 + // (complete Kokkos-ization of BlockCrsMatrix) thoroughly. + const LO rowStride = blockSize_; - Teuchos::Array ipiv_teuchos(blockSize_); - Kokkos::View ipiv (ipiv_teuchos.getRawPtr (), blockSize_); - Teuchos::Array work_teuchos(blockSize_); - Kokkos::View work (work_teuchos.getRawPtr (), blockSize_); + Teuchos::Array ipiv_teuchos(blockSize_); + Kokkos::View ipiv (ipiv_teuchos.getRawPtr (), blockSize_); + Teuchos::Array work_teuchos(blockSize_); + Kokkos::View work (work_teuchos.getRawPtr (), blockSize_); - size_t num_cols = U_block_->getColMap()->getLocalNumElements(); - Teuchos::Array colflag(num_cols); + size_t num_cols = U_block_->getColMap()->getLocalNumElements(); + Teuchos::Array colflag(num_cols); - typename GetManagedView::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_); - typename GetManagedView::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_); - typename GetManagedView::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_); + typename GetManagedView::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_); // Teuchos::ArrayRCP DV = D_->get1dViewNonConst(); // Get view of diagonal - // Now start the factorization. + // Now start the factorization. - // Need some integer workspace and pointers - LO NumUU; - for (size_t j = 0; j < num_cols; ++j) { - colflag[j] = -1; - } - Teuchos::Array InI(MaxNumEntries, 0); - Teuchos::Array InV(MaxNumEntries*blockMatSize,STM::zero()); - - const LO numLocalRows = L_block_->getLocalNumRows (); - for (LO local_row = 0; local_row < numLocalRows; ++local_row) { - - // Fill InV, InI with current row of L, D and U combined - - NumIn = MaxNumEntries; - local_inds_host_view_type colValsL; - values_host_view_type valsL; - - L_block_->getLocalRowView(local_row, colValsL, valsL); - NumL = (LO) colValsL.size(); - for (LO j = 0; j < NumL; ++j) - { - const LO matOffset = blockMatSize*j; - little_block_host_type lmat((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); - little_block_host_type lmatV((typename little_block_host_type::value_type*) &InV[matOffset],blockSize_,rowStride); - //lmatV.assign(lmat); - Tpetra::COPY (lmat, lmatV); - InI[j] = colValsL[j]; + // Need some integer workspace and pointers + LO NumUU; + for (size_t j = 0; j < num_cols; ++j) { + colflag[j] = -1; } + Teuchos::Array InI(MaxNumEntries, 0); + Teuchos::Array InV(MaxNumEntries*blockMatSize,STM::zero()); - little_block_host_type dmat = D_block_->getLocalBlockHostNonConst(local_row, local_row); - little_block_host_type dmatV((typename little_block_host_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride); - //dmatV.assign(dmat); - Tpetra::COPY (dmat, dmatV); - InI[NumL] = local_row; - - local_inds_host_view_type colValsU; - values_host_view_type valsU; - U_block_->getLocalRowView(local_row, colValsU, valsU); - NumURead = (LO) colValsU.size(); - NumU = 0; - for (LO j = 0; j < NumURead; ++j) - { - if (!(colValsU[j] < numLocalRows)) continue; - InI[NumL+1+j] = colValsU[j]; - const LO matOffset = blockMatSize*(NumL+1+j); - little_block_host_type umat((typename little_block_host_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride); - little_block_host_type umatV((typename little_block_host_type::value_type*) &InV[matOffset], blockSize_, rowStride); - //umatV.assign(umat); - Tpetra::COPY (umat, umatV); - NumU += 1; - } - NumIn = NumL+NumU+1; + const LO numLocalRows = L_block_->getLocalNumRows (); + for (LO local_row = 0; local_row < numLocalRows; ++local_row) { - // Set column flags - for (size_t j = 0; j < NumIn; ++j) { - colflag[InI[j]] = j; - } + // Fill InV, InI with current row of L, D and U combined + + NumIn = MaxNumEntries; + local_inds_host_view_type colValsL; + values_host_view_type valsL; + + L_block_->getLocalRowView(local_row, colValsL, valsL); + NumL = (LO) colValsL.size(); + for (LO j = 0; j < NumL; ++j) + { + const LO matOffset = blockMatSize*j; + little_block_host_type lmat((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); + little_block_host_type lmatV((typename little_block_host_type::value_type*) &InV[matOffset],blockSize_,rowStride); + //lmatV.assign(lmat); + Tpetra::COPY (lmat, lmatV); + InI[j] = colValsL[j]; + } + + little_block_host_type dmat = D_block_->getLocalBlockHostNonConst(local_row, local_row); + little_block_host_type dmatV((typename little_block_host_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride); + //dmatV.assign(dmat); + Tpetra::COPY (dmat, dmatV); + InI[NumL] = local_row; + + local_inds_host_view_type colValsU; + values_host_view_type valsU; + U_block_->getLocalRowView(local_row, colValsU, valsU); + NumURead = (LO) colValsU.size(); + NumU = 0; + for (LO j = 0; j < NumURead; ++j) + { + if (!(colValsU[j] < numLocalRows)) continue; + InI[NumL+1+j] = colValsU[j]; + const LO matOffset = blockMatSize*(NumL+1+j); + little_block_host_type umat((typename little_block_host_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride); + little_block_host_type umatV((typename little_block_host_type::value_type*) &InV[matOffset], blockSize_, rowStride); + //umatV.assign(umat); + Tpetra::COPY (umat, umatV); + NumU += 1; + } + NumIn = NumL+NumU+1; + + // Set column flags + for (size_t j = 0; j < NumIn; ++j) { + colflag[InI[j]] = j; + } #ifndef IFPACK2_RBILUK_INITIAL - for (LO i = 0; i < blockSize_; ++i) - for (LO j = 0; j < blockSize_; ++j){ - { - diagModBlock(i,j) = 0; + for (LO i = 0; i < blockSize_; ++i) + for (LO j = 0; j < blockSize_; ++j){ + { + diagModBlock(i,j) = 0; + } } - } #else - scalar_type diagmod = STM::zero (); // Off-diagonal accumulator - Kokkos::deep_copy (diagModBlock, diagmod); + scalar_type diagmod = STM::zero (); // Off-diagonal accumulator + Kokkos::deep_copy (diagModBlock, diagmod); #endif - for (LO jj = 0; jj < NumL; ++jj) { - LO j = InI[jj]; - little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++; - //multiplier.assign(currentVal); - Tpetra::COPY (currentVal, multiplier); + for (LO jj = 0; jj < NumL; ++jj) { + LO j = InI[jj]; + little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++; + //multiplier.assign(currentVal); + Tpetra::COPY (currentVal, multiplier); - const little_block_host_type dmatInverse = D_block_->getLocalBlockHostNonConst(j,j); - // alpha = 1, beta = 0 + const little_block_host_type dmatInverse = D_block_->getLocalBlockHostNonConst(j,j); + // alpha = 1, beta = 0 #ifndef IFPACK2_RBILUK_INITIAL_NOKK - KokkosBatched::Experimental::SerialGemm - ::invoke - (STS::one (), currentVal, dmatInverse, STS::zero (), matTmp); + KokkosBatched::SerialGemm + ::invoke + (STS::one (), currentVal, dmatInverse, STS::zero (), matTmp); #else - Tpetra::GEMM ("N", "N", STS::one (), currentVal, dmatInverse, - STS::zero (), matTmp); + Tpetra::GEMM ("N", "N", STS::one (), currentVal, dmatInverse, + STS::zero (), matTmp); #endif - //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast (currentVal.data ()), reinterpret_cast (dmatInverse.data ()), reinterpret_cast (matTmp.data ()), blockSize_); - //currentVal.assign(matTmp); - Tpetra::COPY (matTmp, currentVal); - local_inds_host_view_type UUI; - values_host_view_type UUV; - - U_block_->getLocalRowView(j, UUI, UUV); - NumUU = (LO) UUI.size(); - - if (this->RelaxValue_ == STM::zero ()) { - for (LO k = 0; k < NumUU; ++k) { - if (!(UUI[k] < numLocalRows)) continue; - const int kk = colflag[UUI[k]]; - if (kk > -1) { - little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); - little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); + //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast (currentVal.data ()), reinterpret_cast (dmatInverse.data ()), reinterpret_cast (matTmp.data ()), blockSize_); + //currentVal.assign(matTmp); + Tpetra::COPY (matTmp, currentVal); + local_inds_host_view_type UUI; + values_host_view_type UUV; + + U_block_->getLocalRowView(j, UUI, UUV); + NumUU = (LO) UUI.size(); + + if (this->RelaxValue_ == STM::zero ()) { + for (LO k = 0; k < NumUU; ++k) { + if (!(UUI[k] < numLocalRows)) continue; + const int kk = colflag[UUI[k]]; + if (kk > -1) { + little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); + little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); #ifndef IFPACK2_RBILUK_INITIAL_NOKK - KokkosBatched::Experimental::SerialGemm - ::invoke - ( magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval); + KokkosBatched::SerialGemm + ::invoke + ( magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval); #else - Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, - STM::one (), kkval); + Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, + STM::one (), kkval); #endif - //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast (multiplier.data ()), reinterpret_cast (uumat.data ()), reinterpret_cast (kkval.data ()), blockSize_, -STM::one(), STM::one()); + //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast (multiplier.data ()), reinterpret_cast (uumat.data ()), reinterpret_cast (kkval.data ()), blockSize_, -STM::one(), STM::one()); + } } } - } - else { - for (LO k = 0; k < NumUU; ++k) { - if (!(UUI[k] < numLocalRows)) continue; - const int kk = colflag[UUI[k]]; - little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); - if (kk > -1) { - little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); + else { + for (LO k = 0; k < NumUU; ++k) { + if (!(UUI[k] < numLocalRows)) continue; + const int kk = colflag[UUI[k]]; + little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride); + if (kk > -1) { + little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride); #ifndef IFPACK2_RBILUK_INITIAL_NOKK - KokkosBatched::Experimental::SerialGemm - ::invoke - (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval); + KokkosBatched::SerialGemm + ::invoke + (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval); #else - Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, - STM::one (), kkval); + Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, + STM::one (), kkval); #endif - //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(multiplier.data ()), reinterpret_cast(uumat.data ()), reinterpret_cast(kkval.data ()), blockSize_, -STM::one(), STM::one()); - } - else { + //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(multiplier.data ()), reinterpret_cast(uumat.data ()), reinterpret_cast(kkval.data ()), blockSize_, -STM::one(), STM::one()); + } + else { #ifndef IFPACK2_RBILUK_INITIAL_NOKK - KokkosBatched::Experimental::SerialGemm - ::invoke - (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), diagModBlock); + KokkosBatched::SerialGemm + ::invoke + (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), diagModBlock); #else - Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, - STM::one (), diagModBlock); + Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat, + STM::one (), diagModBlock); #endif - //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(multiplier.data ()), reinterpret_cast(uumat.data ()), reinterpret_cast(diagModBlock.data ()), blockSize_, -STM::one(), STM::one()); + //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(multiplier.data ()), reinterpret_cast(uumat.data ()), reinterpret_cast(diagModBlock.data ()), blockSize_, -STM::one(), STM::one()); + } } } } - } - if (NumL) { - // Replace current row of L - L_block_->replaceLocalValues (local_row, InI.getRawPtr (), InV.getRawPtr (), NumL); - } + if (NumL) { + // Replace current row of L + L_block_->replaceLocalValues (local_row, InI.getRawPtr (), InV.getRawPtr (), NumL); + } - // dmat.assign(dmatV); - Tpetra::COPY (dmatV, dmat); + // dmat.assign(dmatV); + Tpetra::COPY (dmatV, dmat); - if (this->RelaxValue_ != STM::zero ()) { - //dmat.update(this->RelaxValue_, diagModBlock); - Tpetra::AXPY (this->RelaxValue_, diagModBlock, dmat); - } + if (this->RelaxValue_ != STM::zero ()) { + //dmat.update(this->RelaxValue_, diagModBlock); + Tpetra::AXPY (this->RelaxValue_, diagModBlock, dmat); + } // if (STS::magnitude (DV[i]) > STS::magnitude (MaxDiagonalValue)) { // if (STS::real (DV[i]) < STM::zero ()) { @@ -836,61 +857,130 @@ void RBILUK::compute () // } // } // else - { - int lapackInfo = 0; - for (int k = 0; k < blockSize_; ++k) { - ipiv[k] = 0; - } - - Tpetra::GETF2 (dmat, ipiv, lapackInfo); - //lapack.GETRF(blockSize_, blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), &lapackInfo); - TEUCHOS_TEST_FOR_EXCEPTION( - lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: " - "lapackInfo = " << lapackInfo << " which indicates an error in the factorization GETRF."); + { + int lapackInfo = 0; + for (int k = 0; k < blockSize_; ++k) { + ipiv[k] = 0; + } - Tpetra::GETRI (dmat, ipiv, work, lapackInfo); - //lapack.GETRI(blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), work.getRawPtr(), lwork, &lapackInfo); - TEUCHOS_TEST_FOR_EXCEPTION( - lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: " - "lapackInfo = " << lapackInfo << " which indicates an error in the matrix inverse GETRI."); - } + Tpetra::GETF2 (dmat, ipiv, lapackInfo); + //lapack.GETRF(blockSize_, blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), &lapackInfo); + TEUCHOS_TEST_FOR_EXCEPTION( + lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: " + "lapackInfo = " << lapackInfo << " which indicates an error in the factorization GETRF."); + + Tpetra::GETRI (dmat, ipiv, work, lapackInfo); + //lapack.GETRI(blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), work.getRawPtr(), lwork, &lapackInfo); + TEUCHOS_TEST_FOR_EXCEPTION( + lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: " + "lapackInfo = " << lapackInfo << " which indicates an error in the matrix inverse GETRI."); + } - for (LO j = 0; j < NumU; ++j) { - little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++; - // scale U by the diagonal inverse + for (LO j = 0; j < NumU; ++j) { + little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++; + // scale U by the diagonal inverse #ifndef IFPACK2_RBILUK_INITIAL_NOKK - KokkosBatched::Experimental::SerialGemm - ::invoke - (STS::one (), dmat, currentVal, STS::zero (), matTmp); + KokkosBatched::SerialGemm + ::invoke + (STS::one (), dmat, currentVal, STS::zero (), matTmp); #else - Tpetra::GEMM ("N", "N", STS::one (), dmat, currentVal, - STS::zero (), matTmp); + Tpetra::GEMM ("N", "N", STS::one (), dmat, currentVal, + STS::zero (), matTmp); #endif - //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(dmat.data ()), reinterpret_cast(currentVal.data ()), reinterpret_cast(matTmp.data ()), blockSize_); - //currentVal.assign(matTmp); - Tpetra::COPY (matTmp, currentVal); - } + //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast(dmat.data ()), reinterpret_cast(currentVal.data ()), reinterpret_cast(matTmp.data ()), blockSize_); + //currentVal.assign(matTmp); + Tpetra::COPY (matTmp, currentVal); + } - if (NumU) { - // Replace current row of L and U - U_block_->replaceLocalValues (local_row, &InI[NumL+1], &InV[blockMatSize*(NumL+1)], NumU); - } + if (NumU) { + // Replace current row of L and U + U_block_->replaceLocalValues (local_row, &InI[NumL+1], &InV[blockMatSize*(NumL+1)], NumU); + } #ifndef IFPACK2_RBILUK_INITIAL - // Reset column flags - for (size_t j = 0; j < NumIn; ++j) { - colflag[InI[j]] = -1; - } + // Reset column flags + for (size_t j = 0; j < NumIn; ++j) { + colflag[InI[j]] = -1; + } #else - // Reset column flags - for (size_t j = 0; j < num_cols; ++j) { - colflag[j] = -1; - } + // Reset column flags + for (size_t j = 0; j < num_cols; ++j) { + colflag[j] = -1; + } #endif - } + } + } // !this->isKokkosKernelsSpiluk_ + else { + RCP A_local_bcrs = Details::getBcrsMatrix(this->A_local_); + RCP A_local_crs = Details::getCrsMatrix(this->A_local_); + if (A_local_bcrs.is_null()) { + if (A_local_crs.is_null()) { + local_ordinal_type numRows = this->A_local_->getLocalNumRows(); + Array entriesPerRow(numRows); + for(local_ordinal_type i = 0; i < numRows; i++) { + entriesPerRow[i] = this->A_local_->getNumEntriesInLocalRow(i); + } + RCP A_local_crs_nc = + rcp (new crs_matrix_type (this->A_local_->getRowMap (), + this->A_local_->getColMap (), + entriesPerRow())); + // copy entries into A_local_crs + nonconst_local_inds_host_view_type indices("indices",this->A_local_->getLocalMaxNumRowEntries()); + nonconst_values_host_view_type values("values",this->A_local_->getLocalMaxNumRowEntries()); + for(local_ordinal_type i = 0; i < numRows; i++) { + size_t numEntries = 0; + this->A_local_->getLocalRowCopy(i, indices, values, numEntries); + A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast(values.data()),indices.data()); + } + A_local_crs_nc->fillComplete (this->A_local_->getDomainMap (), this->A_local_->getRangeMap ()); + A_local_crs = Teuchos::rcp_const_cast (A_local_crs_nc); + } + // Create bcrs from crs + // We can skip fillLogicalBlocks if we know the input is already filled + if (blockSize_ > 1) { + auto crs_matrix_block_filled = Tpetra::fillLogicalBlocks(*A_local_crs, blockSize_); + A_local_bcrs = Tpetra::convertToBlockCrsMatrix(*crs_matrix_block_filled, blockSize_); + } + else { + A_local_bcrs = Tpetra::convertToBlockCrsMatrix(*A_local_crs, blockSize_); + } + } + + TEUCHOS_TEST_FOR_EXCEPTION( + this->isKokkosKernelsStream_, std::runtime_error, "Ifpack2::RBILUK::compute: " + "streams are not yet supported."); + auto lclMtx = A_local_bcrs->getLocalMatrixDevice(); + this->A_local_rowmap_ = lclMtx.graph.row_map; + this->A_local_entries_ = lclMtx.graph.entries; + this->A_local_values_ = lclMtx.values; + + // L_block_->resumeFill (); + // U_block_->resumeFill (); + + if (L_block_->isLocallyIndexed ()) { + L_block_->setAllToScalar (STS::zero ()); // Zero out L and U matrices + U_block_->setAllToScalar (STS::zero ()); + } + + using row_map_type = typename local_matrix_device_type::row_map_type; + + auto lclL = L_block_->getLocalMatrixDevice(); + row_map_type L_rowmap = lclL.graph.row_map; + auto L_entries = lclL.graph.entries; + auto L_values = lclL.values; + + auto lclU = U_block_->getLocalMatrixDevice(); + row_map_type U_rowmap = lclU.graph.row_map; + auto U_entries = lclU.graph.entries; + auto U_values = lclU.values; + + KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), this->LevelOfFill_, + this->A_local_rowmap_, this->A_local_entries_, this->A_local_values_, + L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values ); + } } // Stop timing // Sync everything back to device, for efficient solves. @@ -962,104 +1052,147 @@ apply (const Tpetra::MultiVectorGraph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors); - BMV rBlock (* (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors); - for (LO imv = 0; imv < numVectors; ++imv) - { - for (size_t i = 0; i < D_block_->getLocalNumRows(); ++i) + if (!this->isKokkosKernelsSpiluk_) { + if (alpha == one && beta == zero) { + if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y. + // Start by solving L C = X for C. C must have the same Map + // as D. We have to use a temp multivector, since our + // implementation of triangular solves does not allow its + // input and output to alias one another. + // + // FIXME (mfh 24 Jan 2014) Cache this temp multivector. + const LO numVectors = xBlock.getNumVectors(); + BMV cBlock (* (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors); + BMV rBlock (* (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors); + for (LO imv = 0; imv < numVectors; ++imv) { - LO local_row = i; - const_host_little_vec_type xval = - xBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); - little_host_vec_type cval = - cBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll); - //cval.assign(xval); - Tpetra::COPY (xval, cval); - - local_inds_host_view_type colValsL; - values_host_view_type valsL; - L_block_->getLocalRowView(local_row, colValsL, valsL); - LO NumL = (LO) colValsL.size(); - - for (LO j = 0; j < NumL; ++j) + for (size_t i = 0; i < D_block_->getLocalNumRows(); ++i) { - LO col = colValsL[j]; - const_host_little_vec_type prevVal = - cBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); - - const LO matOffset = blockMatSize*j; - little_block_host_type lij((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); - - //cval.matvecUpdate(-one, lij, prevVal); - Tpetra::GEMV (-one, lij, prevVal, cval); + LO local_row = i; + const_host_little_vec_type xval = + xBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); + little_host_vec_type cval = + cBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll); + //cval.assign(xval); + Tpetra::COPY (xval, cval); + + local_inds_host_view_type colValsL; + values_host_view_type valsL; + L_block_->getLocalRowView(local_row, colValsL, valsL); + LO NumL = (LO) colValsL.size(); + + for (LO j = 0; j < NumL; ++j) + { + LO col = colValsL[j]; + const_host_little_vec_type prevVal = + cBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); + + const LO matOffset = blockMatSize*j; + little_block_host_type lij((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride); + + //cval.matvecUpdate(-one, lij, prevVal); + Tpetra::GEMV (-one, lij, prevVal, cval); + } } } - } - // Solve D R = C. Note that D has been replaced by D^{-1} at this point. - D_block_->applyBlock(cBlock, rBlock); + // Solve D R = C. Note that D has been replaced by D^{-1} at this point. + D_block_->applyBlock(cBlock, rBlock); - // Solve U Y = R. - for (LO imv = 0; imv < numVectors; ++imv) - { - const LO numRows = D_block_->getLocalNumRows(); - for (LO i = 0; i < numRows; ++i) + // Solve U Y = R. + for (LO imv = 0; imv < numVectors; ++imv) { - LO local_row = (numRows-1)-i; - const_host_little_vec_type rval = - rBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); - little_host_vec_type yval = - yBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll); - //yval.assign(rval); - Tpetra::COPY (rval, yval); - - local_inds_host_view_type colValsU; - values_host_view_type valsU; - U_block_->getLocalRowView(local_row, colValsU, valsU); - LO NumU = (LO) colValsU.size(); - - for (LO j = 0; j < NumU; ++j) + const LO numRows = D_block_->getLocalNumRows(); + for (LO i = 0; i < numRows; ++i) { - LO col = colValsU[NumU-1-j]; - const_host_little_vec_type prevVal = - yBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); - - const LO matOffset = blockMatSize*(NumU-1-j); - little_block_host_type uij((typename little_block_host_type::value_type*) &valsU[matOffset], blockSize_, rowStride); - - //yval.matvecUpdate(-one, uij, prevVal); - Tpetra::GEMV (-one, uij, prevVal, yval); + LO local_row = (numRows-1)-i; + const_host_little_vec_type rval = + rBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly); + little_host_vec_type yval = + yBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll); + //yval.assign(rval); + Tpetra::COPY (rval, yval); + + local_inds_host_view_type colValsU; + values_host_view_type valsU; + U_block_->getLocalRowView(local_row, colValsU, valsU); + LO NumU = (LO) colValsU.size(); + + for (LO j = 0; j < NumU; ++j) + { + LO col = colValsU[NumU-1-j]; + const_host_little_vec_type prevVal = + yBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly); + + const LO matOffset = blockMatSize*(NumU-1-j); + little_block_host_type uij((typename little_block_host_type::value_type*) &valsU[matOffset], blockSize_, rowStride); + + //yval.matvecUpdate(-one, uij, prevVal); + Tpetra::GEMV (-one, uij, prevVal, yval); + } } } } + else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::runtime_error, + "Ifpack2::Experimental::RBILUK::apply: transpose apply is not implemented for the block algorithm without KokkosKernels. "); + } } - else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::runtime_error, - "Ifpack2::Experimental::RBILUK::apply: transpose apply is not implemented for the block algorithm. "); + else { // alpha != 1 or beta != 0 + if (alpha == zero) { + if (beta == zero) { + Y.putScalar (zero); + } else { + Y.scale (beta); + } + } else { // alpha != zero + MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + apply (X, Y_tmp, mode); + Y.update (alpha, Y_tmp, beta); + } } } - else { // alpha != 1 or beta != 0 - if (alpha == zero) { - if (beta == zero) { - Y.putScalar (zero); - } else { - Y.scale (beta); - } - } else { // alpha != zero - MV Y_tmp (Y.getMap (), Y.getNumVectors ()); - apply (X, Y_tmp, mode); - Y.update (alpha, Y_tmp, beta); + else { + // Kokkos kernels impl. For now, the only block trsv available is Sequential + // and must be done on host. + using row_map_type = typename local_matrix_host_type::row_map_type; + using index_type = typename local_matrix_host_type::index_type; + using values_type = typename local_matrix_host_type::values_type; + + auto X_view = X.getLocalViewHost(Tpetra::Access::ReadOnly); + auto Y_view = Y.getLocalViewHost(Tpetra::Access::ReadWrite); + + auto L_row_ptrs_host = L_block_->getCrsGraph().getLocalRowPtrsHost(); + auto L_entries_host = L_block_->getCrsGraph().getLocalIndicesHost(); + auto U_row_ptrs_host = U_block_->getCrsGraph().getLocalRowPtrsHost(); + auto U_entries_host = U_block_->getCrsGraph().getLocalIndicesHost(); + auto L_values_host = L_block_->getValuesHost(); + auto U_values_host = U_block_->getValuesHost(); + + row_map_type* L_row_ptrs_host_ri = reinterpret_cast(&L_row_ptrs_host); + index_type* L_entries_host_ri = reinterpret_cast(&L_entries_host); + row_map_type* U_row_ptrs_host_ri = reinterpret_cast(&U_row_ptrs_host); + index_type* U_entries_host_ri = reinterpret_cast(&U_entries_host); + values_type* L_values_host_ri = reinterpret_cast(&L_values_host); + values_type* U_values_host_ri = reinterpret_cast(&U_values_host); + + const auto numRows = L_block_->getLocalNumRows(); + local_matrix_host_type L_block_local_host("L_block_local_host", numRows, numRows, L_entries_host.size(), *L_values_host_ri, *L_row_ptrs_host_ri, *L_entries_host_ri, blockSize_); + local_matrix_host_type U_block_local_host("U_block_local_host", numRows, numRows, U_entries_host.size(), *U_values_host_ri, *U_row_ptrs_host_ri, *U_entries_host_ri, blockSize_); + + if (mode == Teuchos::NO_TRANS) { + KokkosSparse::trsv("L", "N", "N", L_block_local_host, X_view, Y_view); + KokkosSparse::trsv("U", "N", "N", U_block_local_host, Y_view, Y_view); + KokkosBlas::axpby(alpha, Y_view, beta, Y_view); } + else { + KokkosSparse::trsv("U", "T", "N", U_block_local_host, X_view, Y_view); + KokkosSparse::trsv("L", "T", "N", L_block_local_host, Y_view, Y_view); + KokkosBlas::axpby(alpha, Y_view, beta, Y_view); + } + + //Y.getWrappedDualView().sync(); } } // Stop timing diff --git a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp index 6d293c003b08..c9bf970d322d 100644 --- a/packages/ifpack2/src/Ifpack2_IlukGraph.hpp +++ b/packages/ifpack2/src/Ifpack2_IlukGraph.hpp @@ -189,7 +189,7 @@ class IlukGraph : public Teuchos::Describable { size_t getNumGlobalDiagonals() const { return NumGlobalDiagonals_; } private: - typedef typename GraphType::map_type map_type; + typedef typename GraphType::map_type map_type; /// \brief Copy constructor (UNIMPLEMENTED; DO NOT USE). /// @@ -313,7 +313,7 @@ void IlukGraph::initialize() : Kokkos::ceil(static_cast(RowMaxNumIndices) * Kokkos::pow(overalloc, levelfill)); }); - + }; bool insertError; // No error found yet while inserting entries @@ -415,7 +415,7 @@ void IlukGraph::initialize() L_Graph_->getLocalRowCopy(i, CurrentRow_view, LenL); // Get L Indices CurrentRow[LenL] = i; // Put in Diagonal if (LenU > 0) { - ArrayView URowView = CurrentRow.view (LenL+1,LenU); + ArrayView URowView = CurrentRow.view (LenL+1,LenU); nonconst_local_inds_host_view_type URowView_v(URowView.data(),URowView.size()); // Get U Indices @@ -584,7 +584,7 @@ void IlukGraph::initialize(const Teuchos::RCP lno_row_view_t; typedef typename Kokkos::View lno_nonzero_view_t; - + constructOverlapGraph(); // FIXME (mfh 23 Dec 2013) Use size_t or whatever @@ -614,7 +614,7 @@ void IlukGraph::initialize(const Teuchos::RCP(Overalloc_)*L_entries.extent(0); - data_type nnzU = static_cast(Overalloc_)*U_entries.extent(0); + data_type nnzU = static_cast(Overalloc_)*U_entries.extent(0); KernelHandle->get_spiluk_handle()->reset_handle(NumMyRows, nnzL, nnzU); Kokkos::resize(L_entries, KernelHandle->get_spiluk_handle()->get_nnzL()); Kokkos::resize(U_entries, KernelHandle->get_spiluk_handle()->get_nnzU()); @@ -628,17 +628,17 @@ void IlukGraph::initialize(const Teuchos::RCPget_spiluk_handle()->get_nnzL()); Kokkos::resize(U_entries, KernelHandle->get_spiluk_handle()->get_nnzU()); - + RCP params = Teuchos::parameterList (); params->set ("Optimize Storage",false); - + L_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (), - OverlapGraph_->getRowMap (), + OverlapGraph_->getRowMap (), L_row_map, L_entries)); U_Graph_ = rcp (new crs_graph_type (OverlapGraph_->getRowMap (), - OverlapGraph_->getRowMap (), + OverlapGraph_->getRowMap (), U_row_map, U_entries)); - + RCP L_DomainMap = OverlapGraph_->getRowMap (); RCP L_RangeMap = Graph_->getRangeMap (); RCP U_DomainMap = Graph_->getDomainMap (); diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp index 850c121922df..07f95154e909 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp @@ -319,7 +319,7 @@ class RILUK: kk_handle_type; typedef Ifpack2::IlukGraph, kk_handle_type> iluk_graph_type; - + /// \brief Constructor that takes a Tpetra::RowMatrix. /// /// \param A_in [in] The input matrix. @@ -402,7 +402,7 @@ class RILUK: } //! Get a rough estimate of cost per iteration - size_t getNodeSmootherComplexity() const; + size_t getNodeSmootherComplexity() const; @@ -599,7 +599,7 @@ class RILUK: /// may be computed using a crs_matrix_type that initialize() constructs /// temporarily. Teuchos::RCP A_local_; - lno_row_view_t A_local_rowmap_; + lno_row_view_t A_local_rowmap_; lno_nonzero_view_t A_local_entries_; scalar_nonzero_view_t A_local_values_; std::vector A_local_diagblks; diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp index c42873035843..92770862324c 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp @@ -125,7 +125,7 @@ RILUK::RILUK (const Teuchos::RCP& Matrix_in) template -RILUK::~RILUK() +RILUK::~RILUK() { if (!isKokkosKernelsStream_) { if (Teuchos::nonnull (KernelHandle_)) { @@ -315,12 +315,12 @@ void RILUK::allocate_L_and_U () for (int i = 0; i < num_streams_; i++) { L_v_[i] = null; U_v_[i] = null; - + L_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getL_Graph ())); U_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getU_Graph ())); L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices U_v_[i]->setAllToScalar (STS::zero ()); - + L_v_[i]->fillComplete (); U_v_[i]->fillComplete (); } @@ -620,9 +620,9 @@ void RILUK::initialize () if (this->isKokkosKernelsSpiluk_) { if (!isKokkosKernelsStream_) { this->KernelHandle_ = Teuchos::rcp (new kk_handle_type ()); - KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, + KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, A_local_->getLocalNumRows(), - 2*A_local_->getLocalNumEntries()*(LevelOfFill_+1), + 2*A_local_->getLocalNumEntries()*(LevelOfFill_+1), 2*A_local_->getLocalNumEntries()*(LevelOfFill_+1) ); Graph_->initialize (KernelHandle_); // this calls spiluk_symbolic } @@ -630,9 +630,9 @@ void RILUK::initialize () KernelHandle_v_ = std::vector< Teuchos::RCP >(num_streams_); for (int i = 0; i < num_streams_; i++) { KernelHandle_v_[i] = Teuchos::rcp (new kk_handle_type ()); - KernelHandle_v_[i]->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, + KernelHandle_v_[i]->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, A_local_diagblks[i].numRows(), - 2*A_local_diagblks[i].nnz()*(LevelOfFill_+1), + 2*A_local_diagblks[i].nnz()*(LevelOfFill_+1), 2*A_local_diagblks[i].nnz()*(LevelOfFill_+1) ); Graph_v_[i]->initialize (KernelHandle_v_[i]); // this calls spiluk_symbolic } @@ -655,7 +655,7 @@ void RILUK::initialize () L_solver_->setMatrices (L_v_); } L_solver_->initialize (); - //NOTE (Nov-09-2022): + //NOTE (Nov-09-2022): //For Cuda >= 11.3 (using cusparseSpSV), skip trisolve computes here. //Instead, call trisolve computes within RILUK compute #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030) @@ -925,7 +925,7 @@ void RILUK::compute () nonconst_local_inds_host_view_type InI_sub(InI.data()+NumL+1,MaxNumEntries-NumL-1); nonconst_values_host_view_type InV_sub(reinterpret_cast(InV.data())+NumL+1,MaxNumEntries-NumL-1); - + U_->getLocalRowCopy (local_row, InI_sub,InV_sub, NumU); NumIn = NumL+NumU+1; @@ -939,12 +939,12 @@ void RILUK::compute () for (size_t jj = 0; jj < NumL; ++jj) { local_ordinal_type j = InI[jj]; IST multiplier = InV[jj]; // current_mults++; - + InV[jj] *= static_cast(DV(j)); - + U_->getLocalRowView(j, UUI, UUV); // View of row above NumUU = UUI.size(); - + if (RelaxValue_ == STM::zero ()) { for (size_t k = 0; k < NumUU; ++k) { const int kk = colflag[UUI[k]]; @@ -1001,7 +1001,7 @@ void RILUK::compute () } if (NumU) { - // Replace current row of L and U + // Replace current row of L and U U_->replaceLocalValues (local_row, InI (NumL+1, NumU), InV (NumL+1, NumU)); } @@ -1087,7 +1087,7 @@ void RILUK::compute () for(int i = 0; i < num_streams_; i++) { L_v_[i]->resumeFill (); U_v_[i]->resumeFill (); - + if (L_v_[i]->isStaticGraph () || L_v_[i]->isLocallyIndexed ()) { L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices U_v_[i]->setAllToScalar (STS::zero ()); @@ -1108,8 +1108,8 @@ void RILUK::compute () auto U_entries = lclU.graph.entries; auto U_values = lclU.values; - KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_, - A_local_rowmap_, A_local_entries_, A_local_values_, + KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_, + A_local_rowmap_, A_local_entries_, A_local_values_, L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values ); L_->fillComplete (L_->getColMap (), A_local_->getRangeMap ()); @@ -1117,7 +1117,7 @@ void RILUK::compute () L_solver_->setMatrix (L_); U_solver_->setMatrix (U_); - } + } else { std::vector L_rowmap_v(num_streams_); std::vector L_entries_v(num_streams_); @@ -1131,7 +1131,7 @@ void RILUK::compute () L_rowmap_v[i] = lclL.graph.row_map; L_entries_v[i] = lclL.graph.entries; L_values_v[i] = lclL.values; - + auto lclU = U_v_[i]->getLocalMatrixDevice(); U_rowmap_v[i] = lclU.graph.row_map; U_entries_v[i] = lclU.graph.entries; @@ -1210,7 +1210,7 @@ apply (const Tpetra::MultiVectorapply (Y, Y, mode); // Solve U Y = Y. #endif } - else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). + else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) //NOTE (Nov-15-2022): //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) @@ -1425,8 +1425,8 @@ std::string RILUK::description () const os << "Level-of-fill: " << getLevelOfFill() << ", "; if(isKokkosKernelsSpiluk_) os<<"KK-SPILUK, "; - if(isKokkosKernelsStream_) os<<"KK-Stream, "; - + if(isKokkosKernelsStream_) os<<"KK-Stream, "; + if (A_.is_null ()) { os << "Matrix: null"; } diff --git a/packages/ifpack2/test/belos/CMakeLists.txt b/packages/ifpack2/test/belos/CMakeLists.txt index c1bf68e37405..fc969c2b9dee 100644 --- a/packages/ifpack2/test/belos/CMakeLists.txt +++ b/packages/ifpack2/test/belos/CMakeLists.txt @@ -43,14 +43,14 @@ ENDIF() TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles DEST_FILES test_database_schwarz.xml - test_ILUT_calore1_mm.xml - test_Jacobi_calore1_mm.xml - test_Jacobi_calore1_mm_constGraph.xml + test_ILUT_calore1_mm.xml + test_Jacobi_calore1_mm.xml + test_Jacobi_calore1_mm_constGraph.xml test_BlockRelaxationAmesos2_calore1_mm.xml test_BlockRelaxationZoltan2_calore1_mm.xml test_GS_calore1_mm.xml test_MTGS_calore1_mm.xml - test_RILUK_calore1_mm.xml + test_RILUK_calore1_mm.xml test_Cheby_calore1_mm.xml test_Cheby_calore1_nospectralradius_mm.xml test_FILU_calore1_mm.xml @@ -59,41 +59,42 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles test_FIC_calore1_mm_sptrsv.xml test_FILDL_calore1_mm.xml test_FILDL_calore1_mm_sptrsv.xml - test_Jacobi_nos1_hb.xml - test_2_Jacobi_nos1_hb.xml - test_Jacobi_gcrodr_nos1_hb.xml + test_Jacobi_nos1_hb.xml + test_2_Jacobi_nos1_hb.xml + test_Jacobi_gcrodr_nos1_hb.xml nos1.rsa test_Jacobi_bcsstk14_hb.xml test_Jacobi_minres_bcsstk14_hb.xml test_Jacobi_pseudoblockcg_bcsstk14_hb.xml bcsstk14.hb - test_2_ILUT_nos1_hb.xml + test_2_ILUT_nos1_hb.xml test_2_RILUK_nos1_hb.xml + test_RBILUK_nos1_hb_block.xml test_2_RILUK_HTS_nos1_hb.xml test_2_RILUK_2streams_nos1_hb.xml test_2_RILUK_4streams_nos1_hb.xml test_2_RILUK_2streams_rcm_nos1_hb.xml test_2_RILUK_4streams_rcm_nos1_hb.xml - test_4_ILUT_nos1_hb.xml + test_4_ILUT_nos1_hb.xml test_4_RILUK_nos1_hb.xml test_4_RILUK_HTS_nos1_hb.xml test_4_RILUK_2streams_nos1_hb.xml test_4_RILUK_4streams_nos1_hb.xml test_4_RILUK_2streams_rcm_nos1_hb.xml test_4_RILUK_4streams_rcm_nos1_hb.xml - test_SGS_calore1_mm.xml + test_SGS_calore1_mm.xml test_MTSGS_calore1_mm.xml - calore1.mtx + calore1.mtx calore1_rhs.mtx test_pseudoblockcg_small_sym_mm.xml test_minres_small_sym_mm.xml - test_Jacobi_small_sym_mm.xml - small_sym.mtx + test_Jacobi_small_sym_mm.xml + small_sym.mtx test_gmres_small_sym_mm.xml test_Diagonal_tfqmr_calore1_mm.xml test_Diagonal_bicgstab_calore1_mm.xml test_Diagonal_gcrodr_calore1_mm.xml - test_GS_tfqmr_small_sym_mm.xml + test_GS_tfqmr_small_sym_mm.xml test_ILUT_tfqmr_small_sym_mm.xml test_tfqmr_small_sym_mm.xml test_FILU_tfqmr_small_sym_mm.xml @@ -106,9 +107,9 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles test_FILDL_tfqmr_small_sym_mm_sptrsv.xml test_FILDL_tfqmr_small_sym_mm_schwarz.xml test_ILUT_tfqmr_calore1_mm.xml - 5w.mtx + 5w.mtx 5w.vec - 6w.mtx + 6w.mtx 6w.vec 5w_bel_tif_ILUT.xml 5w_bel_tif_FILU.xml @@ -122,13 +123,13 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(Ifpack2BelosCopyFiles 5w_bel_tif_FILDL.xml 5w_bel_tif_FILDL_sptrsv.xml 5w_bel_tif_FILDL_schwarz.xml - 5w_missing_diag.mtx + 5w_missing_diag.mtx 5w_missing_diag_ILUT.xml - 5w_bel_tif_RILUK_0.xml + 5w_bel_tif_RILUK_0.xml 5w_bel_tif_RILUK_1.xml test_RILUK_tfqmr_small_sym_mm.xml - test_bordered_DIAG_small.xml - test_minres_bordered_DIAG_small.xml + test_bordered_DIAG_small.xml + test_minres_bordered_DIAG_small.xml test_FILU_small_sym_sing.xml test_FILU_small_sym_sing_sptrsv.xml test_FIC_small_sym_sing.xml @@ -171,7 +172,7 @@ TRIBITS_ADD_TEST( NUM_MPI_PROCS 1 STANDARD_PASS_OUTPUT ) -ENDIF() +ENDIF() IF (Tpetra_ENABLE_quadmath) TRIBITS_ADD_TEST( @@ -299,10 +300,10 @@ TRIBITS_ADD_TEST( # NAME Jacobi_bcsstk14_hb_belos # ARGS "--xml_file=test_Jacobi_bcsstk14_hb.xml" # COMM serial mpi -# NUM_MPI_PROCS 1 +# NUM_MPI_PROCS 1 # STANDARD_PASS_OUTPUT #) - + TRIBITS_ADD_TEST( tif_belos NAME Jacobi_bcsstk14_minres_hb_belos @@ -348,6 +349,15 @@ TRIBITS_ADD_TEST( STANDARD_PASS_OUTPUT ) +TRIBITS_ADD_TEST( + tif_belos + NAME RBILUK_hb_belos_block_serial + ARGS "--xml_file=test_RBILUK_nos1_hb_block.xml" + COMM serial mpi + NUM_MPI_PROCS 1 + STANDARD_PASS_OUTPUT +) + TRIBITS_ADD_TEST( tif_belos NAME RILUK_hb_belos diff --git a/packages/ifpack2/test/belos/belos_solve.cpp b/packages/ifpack2/test/belos/belos_solve.cpp index d0a83bd50a22..214501328b49 100644 --- a/packages/ifpack2/test/belos/belos_solve.cpp +++ b/packages/ifpack2/test/belos/belos_solve.cpp @@ -53,6 +53,7 @@ #include "Tpetra_Core.hpp" #include "Tpetra_Map.hpp" +#include "Tpetra_BlockCrsMatrix_Helpers_decl.hpp" #include "build_problem.hpp" #include "build_solver.hpp" @@ -95,6 +96,7 @@ int main (int argc, char* argv[]) typedef Tpetra::MultiVector TMV; typedef Tpetra::Operator TOP; typedef Tpetra::CrsMatrix crs_matrix_type; + typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; typedef Belos::LinearProblem BLinProb; typedef Belos::SolverManager BSolverMgr; using Teuchos::RCP; @@ -171,8 +173,27 @@ int main (int argc, char* argv[]) std::string prec_side("Left"); Ifpack2::getParameter (test_params, "Preconditioner Side", prec_side); if (tifpack_precond != "not specified") { - auto A = Teuchos::rcp_dynamic_cast(problem->getOperator()); - RCP precond = build_precond (test_params, A); + RCP precond; + if (tifpack_precond == "RBILUK") { + int blockSize = 0; + Teuchos::ParameterList& prec_params = test_params.sublist("Ifpack2"); + Ifpack2::getParameter (prec_params, "fact: block size", blockSize); + assert(blockSize >= 1); + auto A_crs = Teuchos::rcp_dynamic_cast(problem->getOperator()); + if (blockSize > 1) { + auto crs_matrix_block_filled = Tpetra::fillLogicalBlocks(*A_crs, blockSize); + auto A = Teuchos::rcp_const_cast(Tpetra::convertToBlockCrsMatrix(*crs_matrix_block_filled, blockSize)); + precond = build_precond (test_params, A); + } + else { + auto A = Teuchos::rcp_const_cast(Tpetra::convertToBlockCrsMatrix(*A_crs, blockSize)); + precond = build_precond (test_params, A); + } + } + else { + auto A = Teuchos::rcp_dynamic_cast(problem->getOperator()); + precond = build_precond (test_params, A); + } if (prec_side == "Left") problem->setLeftPrec (precond); else if (prec_side == "Right") diff --git a/packages/ifpack2/test/belos/build_precond.hpp b/packages/ifpack2/test/belos/build_precond.hpp index 2a09ca3f4f84..47a74a35b263 100644 --- a/packages/ifpack2/test/belos/build_precond.hpp +++ b/packages/ifpack2/test/belos/build_precond.hpp @@ -48,10 +48,10 @@ #include "Ifpack2_Factory.hpp" -template +template Teuchos::RCP > build_precond (Teuchos::ParameterList& test_params, - const Teuchos::RCP >& A) + const Teuchos::RCP& A) { using Teuchos::FancyOStream; using Teuchos::getFancyOStream; diff --git a/packages/ifpack2/test/belos/test_RBILUK_nos1_hb_block.xml b/packages/ifpack2/test/belos/test_RBILUK_nos1_hb_block.xml new file mode 100644 index 000000000000..ec07f94f0ccf --- /dev/null +++ b/packages/ifpack2/test/belos/test_RBILUK_nos1_hb_block.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + +