Merge pull request kokkos#1817 from kokkos/issue-1700

Fixes alignment issues of scratch views
hippo91 · Sep 29, 2018 · c7b033c · c7b033c
2 parents dbdc856 + bff88d5
commit c7b033c
Show file tree

Hide file tree

Showing 14 changed files with 144 additions and 45 deletions.
diff --git a/Makefile.kokkos b/Makefile.kokkos
@@ -396,6 +396,7 @@ tmp := $(call kokkos_append_header,"/* Execution Spaces */")
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)

diff --git a/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@@ -110,8 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
                             const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int numBlocks;
 
-    int blockSize=1024;
-    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+    unsigned int blockSize=1024;
+    unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                     FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &numBlocks,
@@ -131,7 +131,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
           blockSize,
           sharedmem);
     }
-    int blockSizeUpperBound = blockSize*2;
+    unsigned int blockSizeUpperBound = blockSize*2;
     while (blockSize<blockSizeUpperBound && numBlocks>0) {
       blockSize+=32;
       sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
@@ -152,18 +152,18 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
   static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                             const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int numBlocks = 0, oldNumBlocks = 0;
-    int blockSize=MaxThreadsPerBlock;
-    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+    unsigned int blockSize=MaxThreadsPerBlock;
+    unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                     FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &numBlocks,
         cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
         blockSize,
         sharedmem);
 
-    if(numBlocks>=MinBlocksPerSM) return blockSize;
+    if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
 
-    while (blockSize>32 && numBlocks<MinBlocksPerSM) {
+    while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
       blockSize/=2;
       sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                   FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
@@ -174,8 +174,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
           blockSize,
           sharedmem);
     }
-    int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
-    while (blockSize<blockSizeUpperBound && numBlocks>MinBlocksPerSM) {
+    unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
+    while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>MinBlocksPerSM) {
       blockSize+=32;
       sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                   FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
@@ -186,7 +186,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
             blockSize,
             sharedmem);
     }
-    if(oldNumBlocks>=MinBlocksPerSM) return blockSize - 32;
+    if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
     return -1;
   }
 };
@@ -196,17 +196,17 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
   static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                             const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int numBlocks = 0, oldNumBlocks = 0;
-    int blockSize=MaxThreadsPerBlock;
+    unsigned int blockSize=MaxThreadsPerBlock;
     int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                     FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &numBlocks,
         cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
         blockSize,
         sharedmem);
-    if(numBlocks>=MinBlocksPerSM) return blockSize;
+    if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
 
-    while (blockSize>32 && numBlocks<MinBlocksPerSM) {
+    while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
       blockSize/=2;
       sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                   FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
@@ -217,8 +217,8 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
           blockSize,
           sharedmem);
     }
-    int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
-    while (blockSize<blockSizeUpperBound && numBlocks>=MinBlocksPerSM) {
+    unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
+    while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
       blockSize+=32;
       sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                   FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
@@ -229,7 +229,7 @@ struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,Mi
             blockSize,
             sharedmem);
     }
-    if(oldNumBlocks>=MinBlocksPerSM) return blockSize - 32;
+    if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
     return -1;
   }
 };

diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -138,7 +138,6 @@ class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<P
   template<class FunctorType>
   int team_size_max( const FunctorType& f, const ParallelForTag& ) const {
     typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
-    typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
     int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
         (size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) );
     return block_size/vector_length();
@@ -179,7 +178,6 @@ class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<P
   template<class FunctorType>
   int team_size_recommended( const FunctorType& f, const ParallelForTag& ) const {
     typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
-    typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
     int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
         (size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double));
     return block_size/vector_length();

diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -623,7 +623,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
 
     if( warp_id == 0) {
       ValueInit::init( functor , &value );
-      for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
+      for(unsigned int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
         ValueJoin::join( functor , &value,&shared_team_buffer_element[i]);
       scalar_intra_warp_reduction(functor,value,false,32,*my_global_team_buffer_element);
     }
@@ -647,7 +647,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
 
     scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
     __syncthreads();
-    int num_teams_done = 0;
+    unsigned int num_teams_done = 0;
     if(threadIdx.x + threadIdx.y == 0) {
       __threadfence();
       num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
@@ -719,7 +719,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
     __syncthreads();
 
     if( warp_id == 0) {
-      const int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
+      const unsigned int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
       if(delta<blockDim.x*blockDim.y)
         *my_shared_team_buffer_element = shared_team_buffer_element[delta];
       KOKKOS_IMPL_CUDA_SYNCWARP;   
@@ -747,7 +747,7 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
     scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
     __syncthreads();
 
-    int num_teams_done = 0;
+    unsigned int num_teams_done = 0;
     if(threadIdx.x + threadIdx.y == 0) {
       __threadfence();
       num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;

diff --git a/core/src/Kokkos_ScratchSpace.hpp b/core/src/Kokkos_ScratchSpace.hpp
@@ -136,6 +136,55 @@ class ScratchMemorySpace {
     }
   }
 
+
+  KOKKOS_INLINE_FUNCTION
+  void* get_shmem_aligned (const ptrdiff_t size, const ptrdiff_t alignment, int level = -1) const {
+    if(level == -1)
+      level = m_default_level;
+    if(level == 0) {
+
+      char* previous = m_iter_L0;
+      const ptrdiff_t missalign = size_t(m_iter_L0)%alignment;
+      if(missalign) m_iter_L0 += alignment-missalign;
+
+      void* tmp = m_iter_L0 + m_offset * size;
+      if (m_end_L0 < (m_iter_L0 += size * m_multiplier)) {
+        m_iter_L0 = previous; // put it back like it was
+        #ifdef KOKKOS_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L0-m_iter_L0));
+        #endif // KOKKOS_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+    } else {
+
+      char* previous = m_iter_L1;
+      const ptrdiff_t missalign =  size_t(m_iter_L1)%alignment;
+      if(missalign) m_iter_L1 += alignment-missalign;
+
+      void* tmp = m_iter_L1 + m_offset * size;
+      if (m_end_L1 < (m_iter_L1 += size * m_multiplier)) {
+        m_iter_L1 = previous; // put it back like it was
+        #ifdef KOKKOS_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L1-m_iter_L1));
+        #endif // KOKKOS_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+
+    }
+  }
+
   template< typename IntType >
   KOKKOS_INLINE_FUNCTION
   ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)

diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp
@@ -2362,15 +2362,15 @@ class View : public ViewTraits< DataType , Properties ... > {
   static inline
   size_t shmem_size( typename traits::array_layout const& arg_layout )
   {
-    return map_type::memory_span( arg_layout );
+    return map_type::memory_span( arg_layout )+sizeof(typename traits::value_type);
   }
 
   explicit KOKKOS_INLINE_FUNCTION
   View( const typename traits::execution_space::scratch_memory_space & arg_space
       , const typename traits::array_layout & arg_layout )
     : View( Impl::ViewCtorProp<pointer_type>(
               reinterpret_cast<pointer_type>(
-                arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )
+                arg_space.get_shmem_aligned( map_type::memory_span( arg_layout ), sizeof(typename traits::value_type) ) ) )
          , arg_layout )
     {}
 
@@ -2386,11 +2386,11 @@ class View : public ViewTraits< DataType , Properties ... > {
       , const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG )
     : View( Impl::ViewCtorProp<pointer_type>(
               reinterpret_cast<pointer_type>(
-                arg_space.get_shmem(
+                arg_space.get_shmem_aligned(
                   map_type::memory_span(
                     typename traits::array_layout
                      ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
+                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ), sizeof(typename traits::value_type) ) ) )
           , typename traits::array_layout
              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )

diff --git a/core/unit_test/TestTeam.hpp b/core/unit_test/TestTeam.hpp
@@ -973,7 +973,7 @@ struct TestShmemSize {
 
     size_t size = view_type::shmem_size( d1, d2, d3 );
 
-    ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
+    ASSERT_EQ( size, (d1 * d2 * d3 + 1)* sizeof( long ) );
 
     test_layout_stride();
   }
@@ -1080,8 +1080,33 @@ struct TestTeamBroadcast {
   }
 };
 
+template<class ExecSpace>
+struct TestScratchAlignment {
+  struct TestScalar {
+    double x,y,z;
+  };
+  TestScratchAlignment() {
+    test(true);
+    test(false);
+  }
+  typedef Kokkos::View<TestScalar*,typename ExecSpace::scratch_memory_space> ScratchView;
+  typedef Kokkos::View<int*,typename ExecSpace::scratch_memory_space> ScratchViewInt;
+  void test(bool allocate_small) {
+    int shmem_size = ScratchView::shmem_size(11);
+    if(allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
+    Kokkos::parallel_for(Kokkos::TeamPolicy<ExecSpace>(1,1).set_scratch_size(0,Kokkos::PerTeam(shmem_size)),
+     KOKKOS_LAMBDA (const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
+     if(allocate_small) ScratchViewInt p(team.team_scratch(0),1);
+     ScratchView a(team.team_scratch(0),11);
+     if(ptrdiff_t(a.data())%sizeof(TestScalar)!=0)
+       Kokkos::abort("Error: invalid scratch view alignment\n");
+    });
+    Kokkos::fence();
+  }
+};
+
 } // namespace
 
 } // namespace Test
 
-/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp
@@ -227,14 +227,13 @@ struct functor_team_for {
 
   functor_team_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
-    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
-    typedef typename shared_int::size_type size_type;
-
+    typedef typename shmem_space::size_type size_type;
     const size_type shmemSize = team.team_size() * 13;
     shared_int values = shared_int( team.team_shmem(), shmemSize );
 
@@ -290,7 +289,9 @@ struct functor_team_reduce {
 
   functor_team_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
@@ -333,7 +334,9 @@ struct functor_team_reduce_reducer {
 
   functor_team_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
@@ -376,12 +379,12 @@ struct functor_team_vector_for {
 
   functor_team_vector_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
-    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
     typedef typename shared_int::size_type size_type;
 
     const size_type shmemSize = team.team_size() * 13;
@@ -442,7 +445,9 @@ struct functor_team_vector_reduce {
   Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
   functor_team_vector_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
@@ -485,7 +490,9 @@ struct functor_team_vector_reduce_reducer {
 
   functor_team_vector_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
@@ -568,12 +575,12 @@ struct functor_vec_for {
 
   functor_vec_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+  typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+  typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+  unsigned team_shmem_size( int team_size ) const { return shared_int::shmem_size(team_size*13); }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( typename policy_type::member_type team ) const {
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
-    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
 
     shared_int values = shared_int( team.team_shmem(), team.team_size() * 13 );