GPU Batch 11 #1196

Merged 1 commit on Sep 16, 2024
Add device only impl
Add function for device only impl

Fix function signatures

Fix arrays

Add basic test for compilation

Allow serial implementation to be run on host under NVCC

Add verification steps

Add arrays of levels coefficient sizes

Cleanup test set

Add double test set

Add structure for the doubles support

Save space by using pointer to different size arrays rather than 2d

Separate the double precision weights into their own arrays

Remove stray call to std::abs

Add NVRTC testing

Add documentation section

Add device function signature for sinh_sinh_integrate

Add float coefficients

Add double coeffs

Add device specific impl

Add sinh_sinh CUDA testing

Add sinh_sinh NVRTC testing
mborland committed Sep 12, 2024
commit b5214b5935bea24021c719eea3451d1eec3024fb
25 changes: 25 additions & 0 deletions doc/quadrature/double_exponential.qbk
@@ -1,5 +1,6 @@
[/
Copyright (c) 2017 Nick Thompson
Copyright (c) 2024 Matt Borland
Use, modification and distribution are subject to the
Boost Software License, Version 1.0. (See accompanying file
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -538,6 +539,30 @@ This form integrates just fine over (-log([pi]/2), +[infin]) using either the `t

[endsect] [/section:de_caveats Caveats]

[section:gpu_usage GPU Usage]

``
#include <boost/math/quadrature/exp_sinh.hpp>

namespace boost{ namespace math{ namespace quadrature {

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real a, Real b, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)

}}}
``

The quadrature routines can also run on CUDA platforms (both NVCC and NVRTC).
The major difference is shown in the function signatures above: on the device these are free-standing functions rather than member functions of a class as on the host.
The tables of abscissas and weights are stored in shared read-only memory on the device instead of being initialized when the class is constructed.
An example use case is the finite element method, where assembling a stiffness matrix requires integrating many different functions.
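For illustration, a minimal kernel using the device overload might look as follows (the kernel, integrand, and launch bounds here are illustrative only, not part of the library):

``
#include <boost/math/quadrature/exp_sinh.hpp>
#include <boost/math/tools/precision.hpp>

__host__ __device__ double f(double x)
{
    return 1/(1 + x*x);
}

__global__ void integrate_kernel(double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        double error;
        double L1;
        boost::math::size_t levels;
        const double tol = boost::math::tools::root_epsilon<double>();

        // Free-standing call; no exp_sinh<double> object is constructed on the device.
        out[i] = boost::math::quadrature::exp_sinh_integrate(f, tol, &error, &L1, &levels);
    }
}
``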

[endsect] [/section:gpu_usage GPU Usage]

[section:de_refes References]

* Hidetosi Takahasi and Masatake Mori, ['Double Exponential Formulas for Numerical Integration] Publ. Res. Inst. Math. Sci., 9 (1974), pp. 721-741.
1,460 changes: 1,459 additions & 1 deletion include/boost/math/quadrature/detail/exp_sinh_detail.hpp

Large diffs are not rendered by default.

869 changes: 867 additions & 2 deletions include/boost/math/quadrature/detail/sinh_sinh_detail.hpp

Large diffs are not rendered by default.

83 changes: 81 additions & 2 deletions include/boost/math/quadrature/exp_sinh.hpp
@@ -15,11 +15,15 @@
#ifndef BOOST_MATH_QUADRATURE_EXP_SINH_HPP
#define BOOST_MATH_QUADRATURE_EXP_SINH_HPP

#include <boost/math/tools/config.hpp>
#include <boost/math/quadrature/detail/exp_sinh_detail.hpp>

#ifndef BOOST_MATH_HAS_NVRTC

#include <cmath>
#include <limits>
#include <memory>
#include <string>
#include <boost/math/quadrature/detail/exp_sinh_detail.hpp>

namespace boost{ namespace math{ namespace quadrature {

@@ -98,4 +102,79 @@ auto exp_sinh<Real, Policy>::integrate(const F& f, Real tolerance, Real* error,


}}}
#endif

#endif // BOOST_MATH_HAS_NVRTC

#ifdef BOOST_MATH_ENABLE_CUDA

#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/constants/constants.hpp>

namespace boost {
namespace math {
namespace quadrature {

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real a, Real b, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)
{
BOOST_MATH_STD_USING

using K = decltype(f(a));
static_assert(!boost::math::is_integral<K>::value,
"The return type cannot be integral, it must be either a real or complex floating point type.");

constexpr auto function = "boost::math::quadrature::exp_sinh<%1%>::integrate";

// Neither limit may be a NaN:
if((boost::math::isnan)(a) || (boost::math::isnan)(b))
{
return static_cast<K>(policies::raise_domain_error(function, "NaN supplied as one limit of integration - sorry I don't know what to do", a, Policy()));
}
// Right limit is infinite:
if ((boost::math::isfinite)(a) && (b >= boost::math::tools::max_value<Real>()))
{
// If a = 0, don't use an additional level of indirection:
if (a == static_cast<Real>(0))
{
return detail::exp_sinh_integrate_impl(f, tolerance, error, L1, levels);
}
const auto u = [&](Real t)->K { return f(t + a); };
return detail::exp_sinh_integrate_impl(u, tolerance, error, L1, levels);
}

if ((boost::math::isfinite)(b) && a <= -boost::math::tools::max_value<Real>())
{
const auto u = [&](Real t)->K { return f(b-t);};
return detail::exp_sinh_integrate_impl(u, tolerance, error, L1, levels);
}

// Infinite limits:
if ((a <= -boost::math::tools::max_value<Real>()) && (b >= boost::math::tools::max_value<Real>()))
{
return static_cast<K>(policies::raise_domain_error(function, "Use sinh_sinh quadrature for integration over the whole real line; exp_sinh is for half infinite integrals.", a, Policy()));
}
// If we get to here then both ends must necessarily be finite:
return static_cast<K>(policies::raise_domain_error(function, "Use tanh_sinh quadrature for integration over finite domains; exp_sinh is for half infinite integrals.", a, Policy()));
}

template <class F, class Real, class Policy = policies::policy<> >
__device__ auto exp_sinh_integrate(const F& f, Real tolerance, Real* error, Real* L1, boost::math::size_t* levels)
{
BOOST_MATH_STD_USING
constexpr auto function = "boost::math::quadrature::exp_sinh<%1%>::integrate";
if (abs(tolerance) > 1) {
return policies::raise_domain_error(function, "The tolerance provided (%1%) is unusually large; did you confuse it with a domain bound?", tolerance, Policy());
}
return detail::exp_sinh_integrate_impl(f, tolerance, error, L1, levels);
}

} // namespace quadrature
} // namespace math
} // namespace boost

#endif // BOOST_MATH_ENABLE_CUDA

#endif // BOOST_MATH_QUADRATURE_EXP_SINH_HPP
33 changes: 31 additions & 2 deletions include/boost/math/quadrature/sinh_sinh.hpp
@@ -1,4 +1,5 @@
// Copyright Nick Thompson, 2017
// Copyright Matt Borland, 2024
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt
@@ -15,10 +16,17 @@
#ifndef BOOST_MATH_QUADRATURE_SINH_SINH_HPP
#define BOOST_MATH_QUADRATURE_SINH_SINH_HPP

#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/quadrature/detail/sinh_sinh_detail.hpp>
#include <boost/math/policies/error_handling.hpp>

#ifndef BOOST_MATH_HAS_NVRTC

#include <cmath>
#include <limits>
#include <memory>
#include <boost/math/quadrature/detail/sinh_sinh_detail.hpp>

namespace boost{ namespace math{ namespace quadrature {

@@ -40,4 +48,25 @@ class sinh_sinh
};

}}}
#endif

#endif // BOOST_MATH_HAS_NVRTC

#ifdef BOOST_MATH_ENABLE_CUDA

namespace boost {
namespace math {
namespace quadrature {

template <class F, class Real, class Policy = boost::math::policies::policy<> >
__device__ auto sinh_sinh_integrate(const F& f, Real tol = boost::math::tools::root_epsilon<Real>(), Real* error = nullptr, Real* L1 = nullptr, boost::math::size_t* levels = nullptr)
{
return detail::sinh_sinh_integrate_impl(f, tol, error, L1, levels);
}

} // namespace quadrature
} // namespace math
} // namespace boost

#endif // BOOST_MATH_ENABLE_CUDA

#endif // BOOST_MATH_QUADRATURE_SINH_SINH_HPP
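A corresponding device-side usage sketch for the new sinh_sinh_integrate overload (the kernel and integrand below are hypothetical and not taken from this PR's test files):

#include <boost/math/quadrature/sinh_sinh.hpp>
#include <boost/math/tools/precision.hpp>

__host__ __device__ double g(double x)
{
    return 1/(1 + x*x);   // integral over the whole real line is pi
}

__global__ void sinh_sinh_kernel(double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        // The tolerance is passed explicitly so Real can be deduced; the error,
        // L1, and levels pointers are left at their nullptr defaults.
        out[i] = boost::math::quadrature::sinh_sinh_integrate(g, boost::math::tools::root_epsilon<double>());
    }
}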
6 changes: 6 additions & 0 deletions test/cuda_jamfile
@@ -9,6 +9,12 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Quad
run test_exp_sinh_quad_float.cu ;
run test_exp_sinh_quad_double.cu ;
run test_sinh_sinh_quad_float.cu ;
run test_sinh_sinh_quad_double.cu ;

# Distributions
run test_arcsine_cdf_double.cu ;
run test_arcsine_cdf_float.cu ;
6 changes: 6 additions & 0 deletions test/nvrtc_jamfile
@@ -9,6 +9,12 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;

# Quad
run test_exp_sinh_quad_nvrtc_float.cpp ;
run test_exp_sinh_quad_nvrtc_double.cpp ;
run test_sinh_sinh_quad_nvrtc_float.cpp ;
run test_sinh_sinh_quad_nvrtc_double.cpp ;

# Distributions
run test_arcsine_cdf_nvrtc_double.cpp ;
run test_arcsine_cdf_nvrtc_float.cpp ;
133 changes: 133 additions & 0 deletions test/test_exp_sinh_quad_double.cu
@@ -0,0 +1,133 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/quadrature/exp_sinh.hpp>
#include <boost/math/special_functions.hpp>
#include <boost/math/tools/precision.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef double float_type;

__host__ __device__ float_type func(float_type x)
{
BOOST_MATH_STD_USING
return 1/(1+x*x);
}

/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
float_type tol = boost::math::tools::root_epsilon<float_type>();
float_type error;
float_type L1;
boost::math::size_t levels;

if (i < numElements)
{
out[i] = boost::math::quadrature::exp_sinh_integrate(func, tol, &error, &L1, &levels);
}
}

/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);

// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);

// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = M_PI * (static_cast<float_type>(i) / numElements);
}

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 512;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();

if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
float_type tol = boost::math::tools::root_epsilon<float_type>();
float_type error;
float_type L1;
boost::math::quadrature::exp_sinh<float_type> integrator;
for(int i = 0; i < numElements; ++i)
{
results.push_back(integrator.integrate(func, tol, &error, &L1));
}
double t = w.elapsed();
// check the results
int failed_count = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
if (eps > 10)
{
std::cerr << std::setprecision(std::numeric_limits<float_type>::digits10)
<< "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << "\n";
failed_count++;
}
if (failed_count > 100)
{
break;
}
}

if (failed_count != 0)
{
std::cout << "Test FAILED" << std::endl;
return EXIT_FAILURE;
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}