Start importing good ideas from the hierarchical branch

HadrienG2 · Dec 11, 2023 · de9254b
1 parent 3d57e6d
commit de9254b
Show file tree

Hide file tree

Showing 12 changed files with 226 additions and 130 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -33,7 +33,10 @@ edition = "2021"
 # of the join() hot code path and adding a dependency on the log library, it
 # lets the thread pool implementation emit an error log when the limit is
 # reached, which lets you know about it and take corrective action.
-detect-excessive-joins = ["dep:log"]
+detect-excessive-joins = ["log"]
+
+# Enable general-purpose logging
+log = ["dep:log"]
 
 # Enable benchmarking code (must be enabled for benchmarks to compile)
 bench = ["dep:criterion", "dep:env_logger", "dep:iterator_ilp", "dep:pessimize", "dep:rayon"]
@@ -45,7 +48,8 @@ default = ["detect-excessive-joins", "bench"]
 # Core dependencies
 atomic-wait = { version = "1.1.0", default-features = false }
 crossbeam = { version = "0.8.2", features = ["crossbeam-deque"] }
-hwlocality = { git = "https://github.com/HadrienG2/hwlocality.git", features = ["hwloc-2_2_0"] }
+hwlocality = { git = "https://github.com/HadrienG2/hwlocality.git", features = ["hwloc-2_2_0", "proptest"] }
+rand = "0.8"
 
 # Used to optionally warn about dangerous or inefficient library usage
 log = { version = "0.4", optional = true }

diff --git a/benches/fibonacci.rs b/benches/fibonacci.rs
@@ -3,7 +3,7 @@ use viscose::bench;
 
 fn criterion_benchmark(c: &mut Criterion) {
     bench::for_each_locality(
-        |rayon_name, mut make_rayon_pool, flat_name, mut make_flat_pool| {
+        |rayon_name, mut make_rayon_pool, our_name, mut make_our_pool| {
             fn bench_backend(
                 c: &mut Criterion,
                 backend_name: &str,
@@ -27,10 +27,10 @@ fn criterion_benchmark(c: &mut Criterion) {
                 });
             }
             {
-                let flat_pool = make_flat_pool();
-                bench_backend(c, flat_name, |b: &mut Bencher, size| {
-                    flat_pool
-                        .run(|scope| b.iter(|| bench::fibonacci_flat(scope, pessimize::hide(size))))
+                let our_pool = make_our_pool();
+                bench_backend(c, our_name, |b: &mut Bencher, size| {
+                    our_pool
+                        .run(|scope| b.iter(|| bench::fibonacci_ours(scope, pessimize::hide(size))))
                 })
             }
         },

diff --git a/benches/norm_sqr.rs b/benches/norm_sqr.rs
@@ -3,7 +3,7 @@ use viscose::bench::{self, bench_local_floats};
 
 fn criterion_benchmark(c: &mut Criterion) {
     bench::for_each_locality(
-        |rayon_name, mut make_rayon_pool, flat_name, mut make_flat_pool| {
+        |rayon_name, mut make_rayon_pool, our_name, mut make_our_pool| {
             macro_rules! bench_norm_sqr {
             () => {
                 // I picked these values because...
@@ -45,15 +45,15 @@ fn criterion_benchmark(c: &mut Criterion) {
                     );
                 }
                 {
-                    let flat_pool = make_flat_pool();
+                    let our_pool = make_our_pool();
                     bench_local_floats::<BLOCK_SIZE>(
                         c,
                         bench_name,
-                        flat_name,
+                        our_name,
                         |b: &mut Bencher, slice| {
-                            flat_pool.run(|scope| {
+                            our_pool.run(|scope| {
                                 b.iter(|| {
-                                    bench::norm_sqr_flat::<BLOCK_SIZE, ILP_STREAMS>(
+                                    bench::norm_sqr_ours::<BLOCK_SIZE, ILP_STREAMS>(
                                         scope,
                                         pessimize::hide(slice),
                                     )

diff --git a/benches/square.rs b/benches/square.rs
@@ -3,7 +3,7 @@ use viscose::bench::{self, bench_local_floats};
 
 fn criterion_benchmark(c: &mut Criterion) {
     bench::for_each_locality(
-        |rayon_name, mut make_rayon_pool, flat_name, mut make_flat_pool| {
+        |rayon_name, mut make_rayon_pool, our_name, mut make_our_pool| {
             macro_rules! bench_square {
             () => {
                 // I picked these values because...
@@ -22,9 +22,9 @@ fn criterion_benchmark(c: &mut Criterion) {
                         });
                     }
                     {
-                        let flat_pool = make_flat_pool();
-                        bench_local_floats::<BLOCK_SIZE>(c, "square", flat_name, |b: &mut Bencher, slice| {
-                            flat_pool.run(|scope| b.iter(|| bench::square_flat(scope, pessimize::hide(slice))))
+                        let our_pool = make_our_pool();
+                        bench_local_floats::<BLOCK_SIZE>(c, "square", our_name, |b: &mut Bencher, slice| {
+                            our_pool.run(|scope| b.iter(|| bench::square_ours(scope, pessimize::hide(slice))))
                         });
                     }
                 }

diff --git a/benches/sum.rs b/benches/sum.rs
@@ -3,7 +3,7 @@ use viscose::bench::{self, bench_local_floats};
 
 fn criterion_benchmark(c: &mut Criterion) {
     bench::for_each_locality(
-        |rayon_name, mut make_rayon_pool, flat_name, mut make_flat_pool| {
+        |rayon_name, mut make_rayon_pool, our_name, mut make_our_pool| {
             macro_rules! bench_sum {
             () => {
                 // I picked these values because...
@@ -45,15 +45,15 @@ fn criterion_benchmark(c: &mut Criterion) {
                     );
                 }
                 {
-                    let flat_pool = make_flat_pool();
+                    let our_pool = make_our_pool();
                     bench_local_floats::<BLOCK_SIZE>(
                         c,
                         bench_name,
-                        flat_name,
+                        our_name,
                         |b: &mut Bencher, slice| {
-                            flat_pool.run(|scope| {
+                            our_pool.run(|scope| {
                                 b.iter(|| {
-                                    bench::sum_flat::<BLOCK_SIZE, ILP_STREAMS>(
+                                    bench::sum_ours::<BLOCK_SIZE, ILP_STREAMS>(
                                         scope,
                                         pessimize::hide(slice),
                                     )

diff --git a/src/bench.rs b/src/bench.rs
@@ -1,14 +1,11 @@
 //! Benchmarking utilities
 
-use crate::{pool::FlatPool, worker::scope::Scope};
+use crate::{pool::ThreadPool, worker::scope::Scope};
 use criterion::{Bencher, Criterion};
 use crossbeam::utils::CachePadded;
-use hwlocality::{cpu::binding::CpuBindingFlags, object::types::ObjectType, Topology};
+use hwlocality::{cpu::binding::CpuBindingFlags, object::types::ObjectType};
 use iterator_ilp::IteratorILP;
-use std::{
-    collections::BTreeSet,
-    sync::{Arc, Once, OnceLock},
-};
+use std::{collections::BTreeSet, sync::OnceLock};
 
 /// Re-export atomic flags for benchmarking
 pub use crate::shared::flags::{bitref::BitRef, AtomicFlags};
@@ -19,11 +16,11 @@ pub fn for_each_locality(
         &str,
         Box<dyn FnMut() -> rayon::ThreadPool>,
         &str,
-        Box<dyn FnMut() -> FlatPool>,
+        Box<dyn FnMut() -> ThreadPool>,
     ),
 ) {
-    setup_logger_once();
-    let topology = topology();
+    crate::setup_logger_once();
+    let topology = crate::topology();
     let mut seen_affinities = BTreeSet::new();
     for ty in [
         ObjectType::L1Cache,
@@ -63,7 +60,7 @@ pub fn for_each_locality(
 
             // Prepare to build thread pools
             let affinity2 = affinity.clone();
-            let make_flat_pool = move || FlatPool::with_affinity(topology.clone(), &affinity2);
+            let make_our_pool = move || ThreadPool::with_affinity(topology.clone(), &affinity2);
             let make_rayon_pool = move || {
                 rayon::ThreadPoolBuilder::new()
                     .num_threads(affinity.weight().unwrap())
@@ -77,7 +74,7 @@ pub fn for_each_locality(
                 &format!("{locality_name}/rayon"),
                 Box::new(make_rayon_pool),
                 &format!("{locality_name}/flat"),
-                Box::new(make_flat_pool),
+                Box::new(make_our_pool),
             )
         }
     }
@@ -98,13 +95,13 @@ pub fn fibonacci_rayon(n: u64) -> u64 {
     }
 }
 
-/// Like `fibonacci_rayon()`, but uses a `FlatPool`
+/// Like `fibonacci_rayon()`, but uses a `ThreadPool`
 #[inline]
-pub fn fibonacci_flat(scope: &Scope<'_>, n: u64) -> u64 {
+pub fn fibonacci_ours(scope: &Scope<'_>, n: u64) -> u64 {
     if n > 1 {
         let (x, y) = scope.join(
-            || fibonacci_flat(scope, n - 1),
-            move |scope| fibonacci_flat(scope, n - 2),
+            || fibonacci_ours(scope, n - 1),
+            move |scope| fibonacci_ours(scope, n - 2),
         );
         x + y
     } else {
@@ -255,7 +252,7 @@ pub fn bench_local_floats<const BLOCK_SIZE: usize>(
 fn max_data_size_pow2() -> u32 {
     static RESULT: OnceLock<u32> = OnceLock::new();
     *RESULT.get_or_init(|| {
-        let cache_stats = topology().cpu_cache_stats().unwrap();
+        let cache_stats = crate::topology().cpu_cache_stats().unwrap();
         let total_l3_capacity = cache_stats.total_data_cache_sizes().last().unwrap();
         let mut max_size = 2 * total_l3_capacity;
         if !max_size.is_power_of_two() {
@@ -282,17 +279,17 @@ pub fn square_rayon<const BLOCK_SIZE: usize>(slice: &mut LocalFloatsSlice<'_, BL
     );
 }
 
-/// Like `square_rayon()`, but using a `FlatPool`
+/// Like `square_rayon()`, but using a `ThreadPool`
 #[inline]
-pub fn square_flat<const BLOCK_SIZE: usize>(
+pub fn square_ours<const BLOCK_SIZE: usize>(
     scope: &Scope<'_>,
     slice: &mut LocalFloatsSlice<'_, BLOCK_SIZE>,
 ) {
     slice.process(
         |[mut left, mut right]| {
             scope.join(
-                || square_flat(scope, &mut left),
-                move |scope| square_flat(scope, &mut right),
+                || square_ours(scope, &mut left),
+                move |scope| square_ours(scope, &mut right),
             );
         },
         |block, locality| {
@@ -326,17 +323,17 @@ pub fn sum_rayon<const BLOCK_SIZE: usize, const ILP_STREAMS: usize>(
     )
 }
 
-/// Like `sum_rayon()`, but uses a `FlatPool`
+/// Like `sum_rayon()`, but uses a `ThreadPool`
 #[inline]
-pub fn sum_flat<const BLOCK_SIZE: usize, const ILP_STREAMS: usize>(
+pub fn sum_ours<const BLOCK_SIZE: usize, const ILP_STREAMS: usize>(
     scope: &Scope<'_>,
     slice: &mut LocalFloatsSlice<'_, BLOCK_SIZE>,
 ) -> f32 {
     slice.process(
         |[mut left, mut right]| {
             let (left, right) = scope.join(
-                || sum_flat::<BLOCK_SIZE, ILP_STREAMS>(scope, &mut left),
-                move |scope| sum_flat::<BLOCK_SIZE, ILP_STREAMS>(scope, &mut right),
+                || sum_ours::<BLOCK_SIZE, ILP_STREAMS>(scope, &mut left),
+                move |scope| sum_ours::<BLOCK_SIZE, ILP_STREAMS>(scope, &mut right),
             );
             left + right
         },
@@ -345,7 +342,7 @@ pub fn sum_flat<const BLOCK_SIZE: usize, const ILP_STREAMS: usize>(
     )
 }
 
-/// Memory-bound recursive squared vector norm computation based on FlatPool
+/// Memory-bound recursive squared vector norm computation based on ThreadPool
 ///
 /// This computation is not written for optimal efficiency (a single-pass
 /// algorithm would be more efficient), but to highlight the importance of NUMA
@@ -380,21 +377,21 @@ pub fn norm_sqr_rayon<const BLOCK_SIZE: usize, const REDUCE_ILP_STREAMS: usize>(
     )
 }
 
-/// Like `norm_sqr_rayon()`, but uses a `FlatPool`
+/// Like `norm_sqr_rayon()`, but uses a `ThreadPool`
 #[inline]
-pub fn norm_sqr_flat<const BLOCK_SIZE: usize, const REDUCE_ILP_STREAMS: usize>(
+pub fn norm_sqr_ours<const BLOCK_SIZE: usize, const REDUCE_ILP_STREAMS: usize>(
     scope: &Scope<'_>,
     slice: &mut LocalFloatsSlice<'_, BLOCK_SIZE>,
 ) -> f32 {
     slice.process(
         |[mut left, mut right]| {
             scope.join(
-                || square_flat(scope, &mut left),
-                |scope| square_flat(scope, &mut right),
+                || square_ours(scope, &mut left),
+                |scope| square_ours(scope, &mut right),
             );
             let (left, right) = scope.join(
-                || sum_flat::<BLOCK_SIZE, REDUCE_ILP_STREAMS>(scope, &mut left),
-                move |scope| sum_flat::<BLOCK_SIZE, REDUCE_ILP_STREAMS>(scope, &mut right),
+                || sum_ours::<BLOCK_SIZE, REDUCE_ILP_STREAMS>(scope, &mut left),
+                move |scope| sum_ours::<BLOCK_SIZE, REDUCE_ILP_STREAMS>(scope, &mut right),
             );
             left + right
         },
@@ -409,24 +406,10 @@ pub fn norm_sqr_flat<const BLOCK_SIZE: usize, const REDUCE_ILP_STREAMS: usize>(
     )
 }
 
-/// Topology instance shared by all above functions
-fn topology() -> &'static Arc<Topology> {
-    static INSTANCE: OnceLock<Arc<Topology>> = OnceLock::new();
-    INSTANCE.get_or_init(|| Arc::new(Topology::new().unwrap()))
-}
-
-/// Ensure logging to stderr is set up during benchmarking
-fn setup_logger_once() {
-    static ONCE: Once = Once::new();
-    ONCE.call_once(|| {
-        env_logger::init();
-    })
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::pool::FlatPool;
+    use crate::pool::ThreadPool;
 
     /// Reference computation of the N-th fibonacci sequence term
     fn fibonacci_ref(n: u64) -> u64 {
@@ -442,10 +425,10 @@ mod tests {
 
     #[test]
     fn fibonacci() {
-        let flat = FlatPool::new();
-        flat.run(|scope| {
+        let pool = ThreadPool::new();
+        pool.run(|scope| {
             for i in 0..=34 {
-                assert_eq!(fibonacci_flat(scope, i), fibonacci_ref(i));
+                assert_eq!(fibonacci_ours(scope, i), fibonacci_ref(i));
             }
         });
     }