    std::mt19937_64 rng_;
    std::normal_distribution<double> normal_dist_;
    std::vector<double> batch_buffer_;

public:
    // Constructor signature reconstructed from the member initializers and the call sites
    // that pass a single integer seed.
    explicit SIMDNoiseGenerator(uint64_t seed) : rng_(seed), normal_dist_(0.0, 1.0) {}
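
    // Generate `count` samples of intensity * N(0, 1), dispatching to the widest SIMD
    // path available at compile time. The signature and the small-batch thresholds below
    // are reconstructed; call sites in this file pass only a count, so intensity defaults to 1.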
    std::vector<double> generate_batch(size_t count, double intensity = 1.0) {
        std::vector<double> result;
        result.reserve(count);

#if defined(__AVX2__)
        if (count >= 8) {
            generate_batch_avx2(result, count, intensity);
        } else {
            generate_batch_scalar(result, count, intensity);
        }
#elif defined(__SSE2__)
        if (count >= 4) {
            generate_batch_sse2(result, count, intensity);
        } else {
            generate_batch_scalar(result, count, intensity);
        }
#else
        generate_batch_scalar(result, count, intensity);
#endif

        return result;
    }
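
    // Scalar fallback: draw and scale one sample at a time.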
    void generate_batch_scalar(std::vector<double>& result, size_t count, double intensity) {
        for (size_t i = 0; i < count; ++i) {
            result.push_back(intensity * normal_dist_(rng_));
        }
    }
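
    // AVX2 path: draw normals into an aligned buffer, apply the intensity scaling with
    // 256-bit multiplies (two 4-double lanes per 8-sample block), then append the block.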
    void generate_batch_avx2(std::vector<double>& result, size_t count, double intensity) {
        const size_t simd_count = (count / 8) * 8;
        const __m256d intensity_vec = _mm256_set1_pd(intensity);

        for (size_t i = 0; i < simd_count; i += 8) {
            alignas(32) double values[8];
            for (int j = 0; j < 8; ++j) {
                values[j] = normal_dist_(rng_);
            }

            __m256d lo = _mm256_mul_pd(_mm256_load_pd(&values[0]), intensity_vec);
            __m256d hi = _mm256_mul_pd(_mm256_load_pd(&values[4]), intensity_vec);
            _mm256_store_pd(&values[0], lo);
            _mm256_store_pd(&values[4], hi);

            for (int j = 0; j < 8; ++j) {
                result.push_back(values[j]);
            }
        }

        // Scalar tail for the remaining count % 8 samples.
        for (size_t i = simd_count; i < count; ++i) {
            result.push_back(intensity * normal_dist_(rng_));
        }
    }
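
    // SSE2 path: same idea with 128-bit vectors (two 2-double lanes per 4-sample block).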
    void generate_batch_sse2(std::vector<double>& result, size_t count, double intensity) {
        const size_t simd_count = (count / 4) * 4;
        const __m128d intensity_vec = _mm_set1_pd(intensity);

        for (size_t i = 0; i < simd_count; i += 4) {
            alignas(16) double values[4];
            for (int j = 0; j < 4; ++j) {
                values[j] = normal_dist_(rng_);
            }

            __m128d lo = _mm_mul_pd(_mm_load_pd(&values[0]), intensity_vec);
            __m128d hi = _mm_mul_pd(_mm_load_pd(&values[2]), intensity_vec);
            _mm_store_pd(&values[0], lo);
            _mm_store_pd(&values[2], hi);

            for (int j = 0; j < 4; ++j) {
                result.push_back(values[j]);
            }
        }

        // Scalar tail for the remaining count % 4 samples.
        for (size_t i = simd_count; i < count; ++i) {
            result.push_back(intensity * normal_dist_(rng_));
        }
    }
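
    // Per-thread resources and shared state for the multi-threaded noise system: worker
    // threads with one generator and one lock-free queue each, atomic performance counters,
    // an optional precomputed sample cache, and lifecycle/synchronization primitives.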
    std::vector<std::thread> worker_threads_;
    std::vector<std::unique_ptr<SIMDNoiseGenerator>> generators_;
    std::vector<std::unique_ptr<LockFreeNoiseQueue<T>>> noise_queues_;

    std::atomic<size_t> total_noise_generated_{0};
    std::atomic<size_t> total_noise_consumed_{0};
    std::atomic<size_t> cache_hits_{0};
    std::atomic<size_t> cache_misses_{0};

    std::vector<std::vector<double>> precomputed_cache_;
    std::atomic<size_t> cache_index_{0};
    std::mutex cache_mutex_;

    std::atomic<bool> running_{false};
    std::condition_variable worker_cv_;
    std::mutex worker_mutex_;
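
    // Constructor initializer: take ownership of the runtime configuration.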
        : config_(std::move(config)) {
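
    // Single-request entry point (name and signature reconstructed from the per-mode
    // helpers it forwards to): dispatch according to the configured threading mode.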
    NoiseData<T> get_noise(T current_time, T dt, size_t dimensions) {
        total_noise_consumed_++;

        switch (config_.threading_mode) {
            case SDEThreadingMode::VECTORIZED:
                return get_vectorized_noise(current_time, dt, dimensions);
            case SDEThreadingMode::LOCK_FREE:
                return get_lockfree_noise(current_time, dt, dimensions);
            case SDEThreadingMode::MULTI_THREAD:
                return get_multithreaded_noise(current_time, dt, dimensions);
            default:
                return get_cached_noise(current_time, dt, dimensions);
        }
    }
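
    // Batch entry point (name reconstructed): route large requests to the parallel SIMD
    // batch generator and everything else to the per-simulation fallback.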
    std::vector<NoiseData<T>> generate_noise_batch(T current_time, T dt,
                                                   size_t dimensions,
                                                   size_t num_simulations) {
        if (config_.enable_simd && num_simulations >= config_.batch_size) {
            return generate_vectorized_batch(current_time, dt, dimensions, num_simulations);
        }
        return generate_standard_batch(current_time, dt, dimensions, num_simulations);
    }
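
    // Monte Carlo driver (signature partly reconstructed): split the simulations across
    // worker threads, run each block asynchronously, and merge the per-thread results.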
    template<typename Integrator, typename InitialCondition>
    std::vector<S> run_monte_carlo(std::function<std::unique_ptr<Integrator>()> integrator_factory,
                                   std::function<S()> initial_condition_generator,
                                   T dt, T end_time, size_t num_simulations) {
        std::vector<S> final_states;
        final_states.reserve(num_simulations);

        const size_t num_threads = config_.num_threads;
        const size_t sims_per_thread = num_simulations / num_threads;

        std::vector<std::future<std::vector<S>>> futures;

        for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
            size_t start_sim = thread_id * sims_per_thread;
            // The last thread also takes the remainder when num_simulations is not divisible.
            size_t end_sim = (thread_id == num_threads - 1) ? num_simulations
                                                            : (thread_id + 1) * sims_per_thread;

            futures.emplace_back(std::async(std::launch::async, [=, this]() {
                return run_thread_simulations(integrator_factory, initial_condition_generator,
                                              dt, end_time, start_sim, end_sim, thread_id);
            }));
        }

        for (auto& future : futures) {
            auto thread_results = future.get();
            final_states.insert(final_states.end(), thread_results.begin(), thread_results.end());
        }

        return final_states;
    }
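
    // Aggregated performance counters plus derived metrics. The struct and accessor names
    // are reconstructed around the fields and bodies present in the source.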
    struct PerformanceStats {
        size_t noise_generated;
        size_t noise_consumed;
        size_t cache_hits;
        size_t cache_misses;

        double cache_hit_rate() const {
            size_t total = cache_hits + cache_misses;
            return total > 0 ? static_cast<double>(cache_hits) / total : 0.0;
        }

        double throughput_msamples_per_sec(std::chrono::milliseconds elapsed_time) const {
            // samples / (ms * 1000) = millions of samples per second; guard against zero elapsed time.
            return elapsed_time.count() > 0 ? noise_generated / (elapsed_time.count() * 1000.0) : 0.0;
        }
    };

    PerformanceStats get_statistics() const {
        return {total_noise_generated_.load(), total_noise_consumed_.load(),
                cache_hits_.load(), cache_misses_.load()};
    }

    void reset_statistics() {
        total_noise_generated_ = 0;
        total_noise_consumed_ = 0;
        cache_hits_ = 0;
        cache_misses_ = 0;
    }
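
    // Exercise the configured noise path before timing production runs and report the
    // observed warmup throughput.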
    void warmup(size_t warmup_samples = 100000) {
        std::cout << "Warming up high-performance SDE system...\n";

        auto start = std::chrono::high_resolution_clock::now();

        for (size_t i = 0; i < warmup_samples; ++i) {
            // Loop body reconstructed: draw one single-dimension sample per iteration to
            // exercise the configured noise path.
            get_noise(static_cast<T>(0), static_cast<T>(0.01), 1);
        }

        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        std::cout << "Warmup completed: " << warmup_samples << " samples in "
                  << duration.count() << "ms\n";
        std::cout << "Throughput: " << (warmup_samples / (duration.count() * 1000.0))
                  << " M samples/sec\n";
    }
    void initialize_system() {
        generators_.resize(config_.num_threads);
        noise_queues_.resize(config_.num_threads);

        for (size_t i = 0; i < config_.num_threads; ++i) {
            generators_[i] = std::make_unique<SIMDNoiseGenerator>(12345 + i);
            noise_queues_[i] = std::make_unique<LockFreeNoiseQueue<T>>();
        }

        if (config_.enable_precomputation) {
            initialize_precomputed_cache();
        }

        if (config_.threading_mode != SDEThreadingMode::SINGLE_THREAD) {
            start_worker_threads();
        }
    }
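
    // Fill each thread's precomputed sample cache in parallel and report the total count.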
    void initialize_precomputed_cache() {
        precomputed_cache_.resize(config_.num_threads);

        std::vector<std::future<void>> futures;

        for (size_t i = 0; i < config_.num_threads; ++i) {
            futures.emplace_back(std::async(std::launch::async, [this, i]() {
                auto& cache = precomputed_cache_[i];
                cache.reserve(config_.precompute_buffer_size);

                for (size_t j = 0; j < config_.precompute_buffer_size; ++j) {
                    cache.push_back(generators_[i]->generate_batch(1)[0]);
                }
            }));
        }

        for (auto& future : futures) {
            future.wait();
        }

        std::cout << "Precomputed " << (config_.num_threads * config_.precompute_buffer_size)
                  << " noise samples\n";
    }
    void start_worker_threads() {
        running_ = true;
        worker_threads_.resize(config_.num_threads);

        for (size_t i = 0; i < config_.num_threads; ++i) {
            worker_threads_[i] = std::thread([this, i]() {
                worker_thread_function(i);
            });

            if (config_.pin_threads) {
                cpu_set_t cpuset;
                CPU_ZERO(&cpuset);
                CPU_SET(i % std::thread::hardware_concurrency(), &cpuset);
                pthread_setaffinity_np(worker_threads_[i].native_handle(),
                                       sizeof(cpu_set_t), &cpuset);
            }
        }
    }
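
    // Producer loop: while the system is running, keep this thread's queue topped up with
    // batches of Wiener increments, backing off briefly whenever the queue is full enough.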
    void worker_thread_function(size_t thread_id) {
        while (running_.load()) {
            if (noise_queues_[thread_id]->size() < config_.queue_size / 4) {
                auto noise_batch = generators_[thread_id]->generate_batch(config_.batch_size);

                for (size_t i = 0; i < noise_batch.size(); ++i) {
                    NoiseData<T> data(static_cast<T>(i * 0.01), {noise_batch[i]},
                                      NoiseProcessType::WIENER);
                    noise_queues_[thread_id]->push(data);
                }

                total_noise_generated_ += noise_batch.size();
            } else {
                std::this_thread::sleep_for(std::chrono::microseconds(10));
            }
        }
    }
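
    // Generate the requested increments on demand with the calling thread's generator.
    // Note that dt is not applied to the increments here.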
    NoiseData<T> get_vectorized_noise(T current_time, T dt, size_t dimensions) {
        size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;
        auto noise_values = generators_[thread_id]->generate_batch(dimensions);

        total_noise_generated_ += dimensions;
        return NoiseData<T>(current_time, std::move(noise_values), NoiseProcessType::WIENER);
    }
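
    // Try to satisfy the request from this thread's pre-filled lock-free queue; fall back
    // to on-demand generation when the queue is empty.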
    NoiseData<T> get_lockfree_noise(T current_time, T dt, size_t dimensions) {
        size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;

        NoiseData<T> result;
        if (noise_queues_[thread_id]->pop(result)) {
            return result;
        }

        return get_vectorized_noise(current_time, dt, dimensions);
    }
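
    // MULTI_THREAD mode currently delegates to the vectorized path.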
    NoiseData<T> get_multithreaded_noise(T current_time, T dt, size_t dimensions) {
        return get_vectorized_noise(current_time, dt, dimensions);
    }
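
    // Serve increments from the precomputed per-thread cache, advancing a shared atomic
    // cursor; fall back to on-demand generation when precomputation is disabled.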
    NoiseData<T> get_cached_noise(T current_time, T dt, size_t dimensions) {
        if (!config_.enable_precomputation || precomputed_cache_.empty()) {
            return get_vectorized_noise(current_time, dt, dimensions);
        }

        size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;
        size_t index = cache_index_++ % config_.precompute_buffer_size;

        std::vector<double> values;
        values.reserve(dimensions);

        for (size_t i = 0; i < dimensions; ++i) {
            size_t cache_idx = (index + i) % precomputed_cache_[thread_id].size();
            values.push_back(precomputed_cache_[thread_id][cache_idx]);
        }

        return NoiseData<T>(current_time, std::move(values), NoiseProcessType::WIENER);
    }
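
    // Generate all samples for a batch request in parallel chunks of batch_size, then
    // scatter the flat sample stream back into one NoiseData record per simulation.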
    std::vector<NoiseData<T>> generate_vectorized_batch(T current_time, T dt,
                                                        size_t dimensions,
                                                        size_t num_simulations) {
        std::vector<NoiseData<T>> results;
        results.reserve(num_simulations);

        const size_t total_samples = dimensions * num_simulations;
        const size_t batches = (total_samples + config_.batch_size - 1) / config_.batch_size;

        std::vector<std::future<std::vector<double>>> futures;

        for (size_t batch = 0; batch < batches; ++batch) {
            size_t batch_start = batch * config_.batch_size;
            size_t batch_end = std::min(batch_start + config_.batch_size, total_samples);
            size_t batch_size = batch_end - batch_start;

            // Round-robin batches over the per-thread generators; note that batches mapped
            // to the same generator may run concurrently and are not synchronized here.
            size_t thread_id = batch % config_.num_threads;

            futures.emplace_back(std::async(std::launch::async, [this, thread_id, batch_size]() {
                return generators_[thread_id]->generate_batch(batch_size);
            }));
        }

        size_t sample_idx = 0;
        for (auto& future : futures) {
            auto batch_samples = future.get();

            for (double sample : batch_samples) {
                size_t sim_id = sample_idx / dimensions;
                size_t dim_id = sample_idx % dimensions;

                // Start a new NoiseData record when the first dimension of a simulation arrives.
                if (dim_id == 0) {
                    results.emplace_back(current_time + sim_id * dt, std::vector<double>{},
                                         NoiseProcessType::WIENER);
                    results.back().increments.reserve(dimensions);
                }

                results[sim_id].increments.push_back(sample);
                ++sample_idx;
            }
        }

        total_noise_generated_ += total_samples;
        return results;
    }
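
    // Per-simulation fallback used for small batches or when SIMD is disabled.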
    std::vector<NoiseData<T>> generate_standard_batch(T current_time, T dt,
                                                      size_t dimensions,
                                                      size_t num_simulations) {
        std::vector<NoiseData<T>> results;
        results.reserve(num_simulations);

        for (size_t i = 0; i < num_simulations; ++i) {
            // Loop body reconstructed: one single-request draw per simulation.
            results.push_back(get_noise(current_time + i * dt, dt, dimensions));
        }

        return results;
    }
    template<typename Integrator, typename InitialCondition>
    std::vector<S> run_thread_simulations(std::function<std::unique_ptr<Integrator>()> integrator_factory,
                                          std::function<S()> initial_condition_generator,
                                          T dt, T end_time,
                                          size_t start_sim, size_t end_sim,
                                          size_t thread_id) {
        std::vector<S> results;
        results.reserve(end_sim - start_sim);

        for (size_t sim = start_sim; sim < end_sim; ++sim) {
            auto integrator = integrator_factory();
            S state = initial_condition_generator();

            integrator->integrate(state, dt, end_time);
            results.push_back(state);
        }

        return results;
    }
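
    // Shutdown: join every worker thread that is still joinable, then release them.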
        for (auto& thread : worker_threads_) {
            if (thread.joinable()) {
                thread.join();
            }
        }

        worker_threads_.clear();