Skip to content

Commit

Permalink
Merge pull request kokkos#1817 from kokkos/issue-1700
Browse files Browse the repository at this point in the history
Fixes alignment issues of scratch views
  • Loading branch information
ibaned committed Sep 29, 2018
2 parents dbdc856 + bff88d5 commit c7b033c
Show file tree
Hide file tree
Showing 14 changed files with 144 additions and 45 deletions.
1 change: 1 addition & 0 deletions Makefile.kokkos
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ tmp := $(call kokkos_append_header,"/* Execution Spaces */")

ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
endif

ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
Expand Down
32 changes: 16 additions & 16 deletions core/src/Cuda/Kokkos_Cuda_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;

int blockSize=1024;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
unsigned int blockSize=1024;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
Expand All @@ -131,7 +131,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
blockSize,
sharedmem);
}
int blockSizeUpperBound = blockSize*2;
unsigned int blockSizeUpperBound = blockSize*2;
while (blockSize<blockSizeUpperBound && numBlocks>0) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
Expand All @@ -152,18 +152,18 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
int blockSize=MaxThreadsPerBlock;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
unsigned int blockSize=MaxThreadsPerBlock;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);

if(numBlocks>=MinBlocksPerSM) return blockSize;
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;

while (blockSize>32 && numBlocks<MinBlocksPerSM) {
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
Expand All @@ -174,8 +174,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
blockSize,
sharedmem);
}
int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && numBlocks>MinBlocksPerSM) {
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
Expand All @@ -186,7 +186,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
blockSize,
sharedmem);
}
if(oldNumBlocks>=MinBlocksPerSM) return blockSize - 32;
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
Expand All @@ -196,17 +196,17 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
int blockSize=MaxThreadsPerBlock;
unsigned int blockSize=MaxThreadsPerBlock;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(numBlocks>=MinBlocksPerSM) return blockSize;
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;

while (blockSize>32 && numBlocks<MinBlocksPerSM) {
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
Expand All @@ -217,8 +217,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
blockSize,
sharedmem);
}
int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && numBlocks>=MinBlocksPerSM) {
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
Expand All @@ -229,7 +229,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
blockSize,
sharedmem);
}
if(oldNumBlocks>=MinBlocksPerSM) return blockSize - 32;
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
Expand Down
2 changes: 0 additions & 2 deletions core/src/Cuda/Kokkos_Cuda_Parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<P
template<class FunctorType>
int team_size_max( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) );
return block_size/vector_length();
Expand Down Expand Up @@ -179,7 +178,6 @@ class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<P
template<class FunctorType>
int team_size_recommended( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double));
return block_size/vector_length();
Expand Down
8 changes: 4 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {

if( warp_id == 0) {
ValueInit::init( functor , &value );
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
for(unsigned int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
ValueJoin::join( functor , &value,&shared_team_buffer_element[i]);
scalar_intra_warp_reduction(functor,value,false,32,*my_global_team_buffer_element);
}
Expand All @@ -647,7 +647,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {

scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();
int num_teams_done = 0;
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
Expand Down Expand Up @@ -719,7 +719,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
__syncthreads();

if( warp_id == 0) {
const int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
const unsigned int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
if(delta<blockDim.x*blockDim.y)
*my_shared_team_buffer_element = shared_team_buffer_element[delta];
KOKKOS_IMPL_CUDA_SYNCWARP;
Expand Down Expand Up @@ -747,7 +747,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();

int num_teams_done = 0;
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
Expand Down
49 changes: 49 additions & 0 deletions core/src/Kokkos_ScratchSpace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,55 @@ class ScratchMemorySpace {
}
}


KOKKOS_INLINE_FUNCTION
void* get_shmem_aligned (const ptrdiff_t size, const ptrdiff_t alignment, int level = -1) const {
if(level == -1)
level = m_default_level;
if(level == 0) {

char* previous = m_iter_L0;
const ptrdiff_t missalign = size_t(m_iter_L0)%alignment;
if(missalign) m_iter_L0 += alignment-missalign;

void* tmp = m_iter_L0 + m_offset * size;
if (m_end_L0 < (m_iter_L0 += size * m_multiplier)) {
m_iter_L0 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L0-m_iter_L0));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;
} else {

char* previous = m_iter_L1;
const ptrdiff_t missalign = size_t(m_iter_L1)%alignment;
if(missalign) m_iter_L1 += alignment-missalign;

void* tmp = m_iter_L1 + m_offset * size;
if (m_end_L1 < (m_iter_L1 += size * m_multiplier)) {
m_iter_L1 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L1-m_iter_L1));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;

}
}

template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
Expand Down
8 changes: 4 additions & 4 deletions core/src/Kokkos_View.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2362,15 +2362,15 @@ class View : public ViewTraits< DataType , Properties ... > {
static inline
size_t shmem_size( typename traits::array_layout const& arg_layout )
{
return map_type::memory_span( arg_layout );
return map_type::memory_span( arg_layout )+sizeof(typename traits::value_type);
}

explicit KOKKOS_INLINE_FUNCTION
View( const typename traits::execution_space::scratch_memory_space & arg_space
, const typename traits::array_layout & arg_layout )
: View( Impl::ViewCtorProp<pointer_type>(
reinterpret_cast<pointer_type>(
arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )
arg_space.get_shmem_aligned( map_type::memory_span( arg_layout ), sizeof(typename traits::value_type) ) ) )
, arg_layout )
{}

Expand All @@ -2386,11 +2386,11 @@ class View : public ViewTraits< DataType , Properties ... > {
, const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG )
: View( Impl::ViewCtorProp<pointer_type>(
reinterpret_cast<pointer_type>(
arg_space.get_shmem(
arg_space.get_shmem_aligned(
map_type::memory_span(
typename traits::array_layout
( arg_N0 , arg_N1 , arg_N2 , arg_N3
, arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
, arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ), sizeof(typename traits::value_type) ) ) )
, typename traits::array_layout
( arg_N0 , arg_N1 , arg_N2 , arg_N3
, arg_N4 , arg_N5 , arg_N6 , arg_N7 )
Expand Down
29 changes: 27 additions & 2 deletions core/unit_test/TestTeam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -973,7 +973,7 @@ struct TestShmemSize {

size_t size = view_type::shmem_size( d1, d2, d3 );

ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
ASSERT_EQ( size, (d1 * d2 * d3 + 1)* sizeof( long ) );

test_layout_stride();
}
Expand Down Expand Up @@ -1080,8 +1080,33 @@ struct TestTeamBroadcast {
}
};

template<class ExecSpace>
struct TestScratchAlignment {
struct TestScalar {
double x,y,z;
};
TestScratchAlignment() {
test(true);
test(false);
}
typedef Kokkos::View<TestScalar*,typename ExecSpace::scratch_memory_space> ScratchView;
typedef Kokkos::View<int*,typename ExecSpace::scratch_memory_space> ScratchViewInt;
void test(bool allocate_small) {
int shmem_size = ScratchView::shmem_size(11);
if(allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
Kokkos::parallel_for(Kokkos::TeamPolicy<ExecSpace>(1,1).set_scratch_size(0,Kokkos::PerTeam(shmem_size)),
KOKKOS_LAMBDA (const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
if(allocate_small) ScratchViewInt p(team.team_scratch(0),1);
ScratchView a(team.team_scratch(0),11);
if(ptrdiff_t(a.data())%sizeof(TestScalar)!=0)
Kokkos::abort("Error: invalid scratch view alignment\n");
});
Kokkos::fence();
}
};

} // namespace

} // namespace Test

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
37 changes: 22 additions & 15 deletions core/unit_test/TestTeamVector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,13 @@ struct functor_team_for {

functor_team_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
typedef typename shared_int::size_type size_type;

typedef typename shmem_space::size_type size_type;
const size_type shmemSize = team.team_size() * 13;
shared_int values = shared_int( team.team_shmem(), shmemSize );

Expand Down Expand Up @@ -290,7 +289,9 @@ struct functor_team_reduce {

functor_team_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
Expand Down Expand Up @@ -333,7 +334,9 @@ struct functor_team_reduce_reducer {

functor_team_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
Expand Down Expand Up @@ -376,12 +379,12 @@ struct functor_team_vector_for {

functor_team_vector_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
typedef typename shared_int::size_type size_type;

const size_type shmemSize = team.team_size() * 13;
Expand Down Expand Up @@ -442,7 +445,9 @@ struct functor_team_vector_reduce {
Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
functor_team_vector_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
Expand Down Expand Up @@ -485,7 +490,9 @@ struct functor_team_vector_reduce_reducer {

functor_team_vector_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
Expand Down Expand Up @@ -568,12 +575,12 @@ struct functor_vec_for {

functor_vec_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}

unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }

KOKKOS_INLINE_FUNCTION
void operator()( typename policy_type::member_type team ) const {
typedef typename ExecutionSpace::scratch_memory_space shmem_space;
typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;

shared_int values = shared_int( team.team_shmem(), team.team_size() * 13 );

Expand Down
Loading

0 comments on commit c7b033c

Please sign in to comment.