GPU Batch 11 #1196

Merged 1 commit on Sep 16, 2024
Add device only impl
Add function for device only impl

Fix function signatures

Fix arrays

Add basic test for compilation

Allow serial implementation to be run on host under NVCC

Add verification steps

Add arrays of levels coefficient sizes

Cleanup test set

Add double test set

Add structure for the doubles support

Save space by using pointer to different size arrays rather than 2d

Separate the double precision weights into their own arrays

Remove stray call to std::abs

Add NVRTC testing

Add documentation section

Add device function signature for sinh_sinh_integrate

Add float coefficients

Add double coeffs

Add device specific impl

Add sinh_sinh CUDA testing

Add sinh_sinh NVRTC testing
mborland committed Sep 12, 2024
commit b5214b5935bea24021c719eea3451d1eec3024fb
25 changes: 25 additions & 0 deletions doc/quadrature/double_exponential.qbk
@@ -1,5 +1,6 @@
[/
Copyright (c) 2017 Nick Thompson
Copyright (c) 2024 Matt Borland
Use, modification and distribution are subject to the
Boost Software License, Version 1.0. (See accompanying file
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -538,6 +539,30 @@ This form integrates just fine over (-log([pi]/2), +[infin]) using either the `t

[endsect] [/section:de_caveats Caveats]

[section:gpu_usage GPU Usage]

``
#include <boost/math/quadrature/exp_sinh.hpp>

namespace boost{ namespace math{ namespace quadrature {

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real a, Real b, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)

}}}
``

The quadrature routines can also run on CUDA platforms (both NVCC and NVRTC).
The major difference is shown in the function signatures above: on the device these are free-standing functions rather than member functions of a class as on the host.
The tables of abscissas and weights are stored in shared read-only memory on the device instead of being initialized when the class is constructed.
An example use case is the finite element method, where assembling a stiffness matrix requires integrating many different functions.
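For illustration, a minimal kernel using the device overload might look as follows (the kernel, integrand, and launch bounds here are illustrative only, not part of the library):

``
#include <boost/math/quadrature/exp_sinh.hpp>
#include <boost/math/tools/precision.hpp>

__host__ __device__ double f(double x)
{
    return 1/(1 + x*x);
}

__global__ void integrate_kernel(double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        double error;
        double L1;
        boost::math::size_t levels;
        const double tol = boost::math::tools::root_epsilon<double>();

        // Free-standing call; no exp_sinh<double> object is constructed on the device.
        out[i] = boost::math::quadrature::exp_sinh_integrate(f, tol, &error, &L1, &levels);
    }
}
``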

[endsect] [/section:gpu_usage GPU Usage]

[section:de_refes References]

* Hidetosi Takahasi and Masatake Mori, ['Double Exponential Formulas for Numerical Integration] Publ. Res. Inst. Math. Sci., 9 (1974), pp. 721-741.
1,460 changes: 1,459 additions & 1 deletion include/boost/math/quadrature/detail/exp_sinh_detail.hpp

Large diffs are not rendered by default.

869 changes: 867 additions & 2 deletions include/boost/math/quadrature/detail/sinh_sinh_detail.hpp

Large diffs are not rendered by default.

83 changes: 81 additions & 2 deletions include/boost/math/quadrature/exp_sinh.hpp
@@ -15,11 +15,15 @@
#ifndef BOOST_MATH_QUADRATURE_EXP_SINH_HPP
#define BOOST_MATH_QUADRATURE_EXP_SINH_HPP

#include <boost/math/tools/config.hpp>
#include <boost/math/quadrature/detail/exp_sinh_detail.hpp>

#ifndef BOOST_MATH_HAS_NVRTC

#include <cmath>
#include <limits>
#include <memory>
#include <string>
#include <boost/math/quadrature/detail/exp_sinh_detail.hpp>

namespace boost{ namespace math{ namespace quadrature {

@@ -98,4 +102,79 @@ auto exp_sinh<Real, Policy>::integrate(const F& f, Real tolerance, Real* error,


}}}
#endif

#endif // BOOST_MATH_HAS_NVRTC

#ifdef BOOST_MATH_ENABLE_CUDA

#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/constants/constants.hpp>

namespace boost {
namespace math {
namespace quadrature {

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real a, Real b, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)
{
BOOST_MATH_STD_USING

using K = decltype(f(a));
static_assert(!boost::math::is_integral<K>::value,
"The return type cannot be integral, it must be either a real or complex floating point type.");

constexpr auto function = "boost::math::quadrature::exp_sinh<%1%>::integrate";

// Neither limit may be a NaN:
if((boost::math::isnan)(a) || (boost::math::isnan)(b))
{
return static_cast<K>(policies::raise_domain_error(function, "NaN supplied as one limit of integration - sorry I don't know what to do", a, Policy()));
}
// Right limit is infinite:
if ((boost::math::isfinite)(a) && (b >= boost::math::tools::max_value<Real>()))
{
// If a = 0, don't use an additional level of indirection:
if (a == static_cast<Real>(0))
{
return detail::exp_sinh_integrate_impl(f, tolerance, error, L1, levels);
}
const auto u = [&](Real t)->K { return f(t + a); };
return detail::exp_sinh_integrate_impl(u, tolerance, error, L1, levels);
}

if ((boost::math::isfinite)(b) && a <= -boost::math::tools::max_value<Real>())
{
const auto u = [&](Real t)->K { return f(b-t);};
return detail::exp_sinh_integrate_impl(u, tolerance, error, L1, levels);
}

// Infinite limits:
if ((a <= -boost::math::tools::max_value<Real>()) && (b >= boost::math::tools::max_value<Real>()))
{
return static_cast<K>(policies::raise_domain_error(function, "Use sinh_sinh quadrature for integration over the whole real line; exp_sinh is for half infinite integrals.", a, Policy()));
}
// If we get to here then both ends must necessarily be finite:
return static_cast<K>(policies::raise_domain_error(function, "Use tanh_sinh quadrature for integration over finite domains; exp_sinh is for half infinite integrals.", a, Policy()));
}

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)
{
BOOST_MATH_STD_USING
constexpr auto function = "boost::math::quadrature::exp_sinh<%1%>::integrate";
if (abs(tolerance) > 1) {
return policies::raise_domain_error(function, "The tolerance provided (%1%) is unusually large; did you confuse it with a domain bound?", tolerance, Policy());
}
return detail::exp_sinh_integrate_impl(f, tolerance, error, L1, levels);
}

} // namespace quadrature
} // namespace math
} // namespace boost

#endif // BOOST_MATH_ENABLE_CUDA

#endif // BOOST_MATH_QUADRATURE_EXP_SINH_HPP
33 changes: 31 additions & 2 deletions include/boost/math/quadrature/sinh_sinh.hpp
@@ -1,4 +1,5 @@
// Copyright Nick Thompson, 2017
// Copyright Matt Borland, 2024
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt
@@ -15,10 +16,17 @@
#ifndef BOOST_MATH_QUADRATURE_SINH_SINH_HPP
#define BOOST_MATH_QUADRATURE_SINH_SINH_HPP

#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/quadrature/detail/sinh_sinh_detail.hpp>
#include <boost/math/policies/error_handling.hpp>

#ifndef BOOST_MATH_HAS_NVRTC

#include <cmath>
#include <limits>
#include <memory>
#include <boost/math/quadrature/detail/sinh_sinh_detail.hpp>

namespace boost{ namespace math{ namespace quadrature {

@@ -40,4 +48,25 @@ class sinh_sinh
};

}}}
#endif

#endif // BOOST_MATH_HAS_NVRTC

#ifdef BOOST_MATH_ENABLE_CUDA

namespace boost {
namespace math {
namespace quadrature {

template <class F, class Real, class Policy = boost::math::policies::policy<> >
__device__ auto sinh_sinh_integrate(const F& f, Real tol = boost::math::tools::root_epsilon<Real>(), Real* error = nullptr, Real* L1 = nullptr, boost::math::size_t* levels = nullptr)
{
return detail::sinh_sinh_integrate_impl(f, tol, error, L1, levels);
}

} // namespace quadrature
} // namespace math
} // namespace boost

#endif // BOOST_MATH_ENABLE_CUDA

#endif // BOOST_MATH_QUADRATURE_SINH_SINH_HPP
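A corresponding device-side usage sketch for the new sinh_sinh_integrate overload (the kernel and integrand below are hypothetical and not taken from this PR's test files):

#include <boost/math/quadrature/sinh_sinh.hpp>
#include <boost/math/tools/precision.hpp>

__host__ __device__ double g(double x)
{
    return 1/(1 + x*x);   // integral over the whole real line is pi
}

__global__ void sinh_sinh_kernel(double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        // The tolerance is passed explicitly so Real can be deduced; the error,
        // L1, and levels pointers are left at their nullptr defaults.
        out[i] = boost::math::quadrature::sinh_sinh_integrate(g, boost::math::tools::root_epsilon<double>());
    }
}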
6 changes: 6 additions & 0 deletions test/cuda_jamfile
@@ -9,6 +9,12 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Quad
run test_exp_sinh_quad_float.cu ;
run test_exp_sinh_quad_double.cu ;
run test_sinh_sinh_quad_float.cu ;
run test_sinh_sinh_quad_double.cu ;

# Distributions
run test_arcsine_cdf_double.cu ;
run test_arcsine_cdf_float.cu ;
6 changes: 6 additions & 0 deletions test/nvrtc_jamfile
@@ -9,6 +9,12 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Quad
run test_exp_sinh_quad_nvrtc_float.cpp ;
run test_exp_sinh_quad_nvrtc_double.cpp ;
run test_sinh_sinh_quad_nvrtc_float.cpp ;
run test_sinh_sinh_quad_nvrtc_double.cpp ;

# Distributions
run test_arcsine_cdf_nvrtc_double.cpp ;
run test_arcsine_cdf_nvrtc_float.cpp ;
133 changes: 133 additions & 0 deletions test/test_exp_sinh_quad_double.cu
@@ -0,0 +1,133 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/quadrature/exp_sinh.hpp>
#include <boost/math/special_functions.hpp>
#include <boost/math/tools/precision.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef double float_type;

__host__ __device__ float_type func(float_type x)
{
BOOST_MATH_STD_USING
return 1/(1+x*x);
}

/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float_type tol = boost::math::tools::root_epsilon<float_type>();
float_type error;
float_type L1;
boost::math::size_t levels;

if (i < numElements)
{
out[i] = boost::math::quadrature::exp_sinh_integrate(func, tol, &error, &L1, &levels);
}
}

/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);

// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);

// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = M_PI * (static_cast<float_type>(i) / numElements);
}

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 512;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();

if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
float_type tol = boost::math::tools::root_epsilon<float_type>();
float_type error;
float_type L1;
boost::math::quadrature::exp_sinh<float_type> integrator;
for(int i = 0; i < numElements; ++i)
{
results.push_back(integrator.integrate(func, tol, &error, &L1));
}
double t = w.elapsed();
// check the results
int failed_count = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
if (eps > 10)
{
std::cerr << std::setprecision(std::numeric_limits<float_type>::digits10)
<< "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << "\n";
failed_count++;
}
if (failed_count > 100)
{
break;
}
}

if (failed_count != 0)
{
std::cout << "Test FAILED" << std::endl;
return EXIT_FAILURE;
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}