DiffEq - Modern C++ ODE Integration Library 1.0.0
High-performance C++ library for solving ODEs with async signal processing
Loading...
Searching...
No Matches
sde_multithreading.hpp
1#pragma once
2
3#include "sde_synchronization.hpp"
4#include <atomic>
5#include <thread>
6#include <mutex>
7#include <condition_variable>
8#include <memory>
9#include <vector>
10#include <queue>
11#include <functional>
12#include <future>
13#include <random>
14#include <algorithm>
15#include <numeric>
16#include <immintrin.h> // For SIMD operations
17
18#ifdef __cpp_impl_coroutine
19#include <coroutine>
20#endif
21
22// Lock-free data structures
23#include <memory>
24#if defined(__has_include) && __has_include(<boost/lockfree/queue.hpp>)
25#include <boost/lockfree/queue.hpp>
26#include <boost/lockfree/spsc_queue.hpp>
27#define HAVE_BOOST_LOCKFREE 1
28#else
29#define HAVE_BOOST_LOCKFREE 0
30#endif
31
32namespace diffeq::core::composable {
33
/**
 * @brief Threading strategy selectable for SDE noise generation.
 */
enum class SDEThreadingMode {
    /// Traditional single-threaded
    SINGLE_THREAD,
    /// Standard multi-threading with mutexes
    MULTI_THREAD,
    /// Lock-free data structures
    LOCK_FREE,
    /// Fiber/coroutine-based (if available)
    FIBER_BASED,
    /// NUMA-topology aware threading
    NUMA_AWARE,
    /// SIMD-vectorized batch processing
    VECTORIZED
};
45
/**
 * @brief Memory allocation strategy selectable for the SDE threading system.
 */
enum class MemoryStrategy {
    /// Standard allocator
    STANDARD,
    /// Memory pool allocation
    POOL_ALLOCATED,
    /// NUMA-local allocation
    NUMA_LOCAL,
    /// Cache-line aligned allocation
    CACHE_ALIGNED,
    /// Large page allocation (Linux)
    HUGE_PAGES
};
56
61 SDEThreadingMode threading_mode{SDEThreadingMode::MULTI_THREAD};
62 MemoryStrategy memory_strategy{MemoryStrategy::CACHE_ALIGNED};
63
64 // Threading parameters
65 size_t num_threads{0}; // 0 = auto-detect
66 size_t num_fibers{1000}; // Number of fibers for fiber mode
67 size_t batch_size{1000}; // Batch size for vectorized operations
68 size_t queue_size{10000}; // Lock-free queue size
69
70 // Performance tuning
71 bool enable_simd{true}; // Enable SIMD vectorization
72 bool enable_prefetching{true}; // Enable memory prefetching
73 bool pin_threads{false}; // Pin threads to CPU cores
74 bool use_huge_pages{false}; // Use huge pages if available
75
76 // NUMA configuration
77 bool numa_aware{false}; // Enable NUMA awareness
78 std::vector<int> numa_nodes; // Preferred NUMA nodes
79
80 // Academic/research optimizations
81 bool enable_batch_generation{true}; // Generate noise in batches
82 bool enable_precomputation{true}; // Precompute common operations
83 size_t precompute_buffer_size{100000}; // Size of precomputed buffer
84
88 void validate() const {
89 if (num_fibers == 0) {
90 throw std::invalid_argument("num_fibers must be positive");
91 }
92
93 if (batch_size == 0) {
94 throw std::invalid_argument("batch_size must be positive");
95 }
96
97 if (queue_size == 0) {
98 throw std::invalid_argument("queue_size must be positive");
99 }
100
101 if (precompute_buffer_size == 0) {
102 throw std::invalid_argument("precompute_buffer_size must be positive");
103 }
104 }
105
110 SDEThreadingConfig config;
111
112 // Auto-detect number of threads
113 config.num_threads = std::thread::hardware_concurrency();
114 if (config.num_threads == 0) config.num_threads = 4;
115
116 // Choose threading mode based on available features
117#if HAVE_BOOST_LOCKFREE
118 config.threading_mode = SDEThreadingMode::LOCK_FREE;
119#else
120 config.threading_mode = SDEThreadingMode::MULTI_THREAD;
121#endif
122
123 // Enable SIMD if supported
124#ifdef __AVX2__
125 config.enable_simd = true;
126 config.batch_size = 8; // AVX2 can process 8 doubles
127#elif defined(__SSE2__)
128 config.enable_simd = true;
129 config.batch_size = 4; // SSE2 can process 4 doubles
130#else
131 config.enable_simd = false;
132 config.batch_size = 1000;
133#endif
134
135 return config;
136 }
137};
138
142template<typename T>
144private:
145#if HAVE_BOOST_LOCKFREE
146 boost::lockfree::spsc_queue<NoiseData<T>, boost::lockfree::capacity<10000>> queue_;
147#else
148 std::queue<NoiseData<T>> queue_;
149 std::mutex mutex_;
150#endif
151 std::atomic<size_t> size_{0};
152
153public:
154 bool push(const NoiseData<T>& data) {
155#if HAVE_BOOST_LOCKFREE
156 bool success = queue_.push(data);
157 if (success) size_++;
158 return success;
159#else
160 std::lock_guard<std::mutex> lock(mutex_);
161 queue_.push(data);
162 size_++;
163 return true;
164#endif
165 }
166
167 bool pop(NoiseData<T>& data) {
168#if HAVE_BOOST_LOCKFREE
169 bool success = queue_.pop(data);
170 if (success) size_--;
171 return success;
172#else
173 std::lock_guard<std::mutex> lock(mutex_);
174 if (queue_.empty()) return false;
175 data = queue_.front();
176 queue_.pop();
177 size_--;
178 return true;
179#endif
180 }
181
182 size_t size() const { return size_.load(); }
183 bool empty() const { return size() == 0; }
184};
185
/**
 * @brief Batch generator of normally distributed noise samples.
 *
 * Samples are drawn from a standard normal distribution driven by a
 * std::mt19937_64 engine and then scaled by an intensity factor. The scaling
 * step uses AVX or SSE2 vector multiplies when the corresponding macros are
 * defined and falls back to scalar code otherwise; the produced values are
 * the same in every case.
 *
 * Instances hold mutable engine state, so calls on one instance must not run
 * concurrently.
 */
class SIMDNoiseGenerator {
private:
    std::mt19937_64 rng_;
    std::normal_distribution<double> normal_dist_;

public:
    /**
     * @brief Create a generator with a deterministic seed.
     * @param seed Seed for the underlying 64-bit Mersenne Twister.
     */
    explicit SIMDNoiseGenerator(uint64_t seed = 12345)
        : rng_(seed), normal_dist_(0.0, 1.0) {}

    /**
     * @brief Generate batch of noise using SIMD when possible.
     * @param count     Number of samples to produce (0 yields an empty vector).
     * @param intensity Factor every standard-normal sample is multiplied by.
     * @return Vector of @p count samples in draw order.
     */
    std::vector<double> generate_batch(size_t count, double intensity = 1.0) {
        std::vector<double> result(count);

        // Random draws are inherently sequential (the engine carries state),
        // so only the scaling below is vectorized.
        for (double& sample : result) {
            sample = normal_dist_(rng_);
        }

        scale_in_place(result.data(), result.size(), intensity);
        return result;
    }

private:
    /// Multiply @p count doubles starting at @p data by @p intensity.
    static void scale_in_place(double* data, size_t count, double intensity) {
        size_t i = 0;

#if defined(__AVX2__)
        // 256-bit registers: 4 doubles per multiply. Unaligned load/store
        // because std::vector storage is not guaranteed 32-byte aligned.
        const __m256d factor = _mm256_set1_pd(intensity);
        for (; i + 4 <= count; i += 4) {
            const __m256d samples = _mm256_loadu_pd(data + i);
            _mm256_storeu_pd(data + i, _mm256_mul_pd(samples, factor));
        }
#elif defined(__SSE2__)
        // 128-bit registers: 2 doubles per multiply.
        const __m128d factor = _mm_set1_pd(intensity);
        for (; i + 2 <= count; i += 2) {
            const __m128d samples = _mm_loadu_pd(data + i);
            _mm_storeu_pd(data + i, _mm_mul_pd(samples, factor));
        }
#endif

        // Scalar tail (and the whole range when no SIMD is available).
        for (; i < count; ++i) {
            data[i] *= intensity;
        }
    }
};
288
304template<system_state S, can_be_time T = double>
306private:
307 SDEThreadingConfig config_;
308 std::vector<std::thread> worker_threads_;
309 std::vector<std::unique_ptr<SIMDNoiseGenerator>> generators_;
310 std::vector<std::unique_ptr<LockFreeNoiseQueue<T>>> noise_queues_;
311
312 // Statistics and monitoring
313 std::atomic<size_t> total_noise_generated_{0};
314 std::atomic<size_t> total_noise_consumed_{0};
315 std::atomic<size_t> cache_hits_{0};
316 std::atomic<size_t> cache_misses_{0};
317
318 // Precomputed noise cache
319 std::vector<std::vector<double>> precomputed_cache_;
320 std::atomic<size_t> cache_index_{0};
321 std::mutex cache_mutex_;
322
323 // Thread management
324 std::atomic<bool> running_{false};
325 std::condition_variable worker_cv_;
326 std::mutex worker_mutex_;
327
328public:
333 : config_(std::move(config)) {
334
335 config_.validate();
336 initialize_system();
337 }
338
340 shutdown();
341 }
342
346 NoiseData<T> get_noise_increment_fast(T current_time, T dt, size_t dimensions = 1) {
347 total_noise_consumed_++;
348
349 switch (config_.threading_mode) {
350 case SDEThreadingMode::VECTORIZED:
351 return get_vectorized_noise(current_time, dt, dimensions);
352 case SDEThreadingMode::LOCK_FREE:
353 return get_lockfree_noise(current_time, dt, dimensions);
354 case SDEThreadingMode::MULTI_THREAD:
355 return get_multithreaded_noise(current_time, dt, dimensions);
356 default:
357 return get_cached_noise(current_time, dt, dimensions);
358 }
359 }
360
364 std::vector<NoiseData<T>> generate_monte_carlo_batch(T current_time, T dt,
365 size_t dimensions, size_t num_simulations) {
366 std::vector<NoiseData<T>> results;
367 results.reserve(num_simulations);
368
369 if (config_.enable_simd && num_simulations >= config_.batch_size) {
370 return generate_vectorized_batch(current_time, dt, dimensions, num_simulations);
371 } else {
372 return generate_standard_batch(current_time, dt, dimensions, num_simulations);
373 }
374 }
375
379 template<typename Integrator, typename InitialCondition>
380 auto monte_carlo_integrate(std::function<std::unique_ptr<Integrator>()> integrator_factory,
381 std::function<S()> initial_condition_generator,
382 T dt, T end_time, size_t num_simulations) {
383
384 std::vector<S> final_states;
385 final_states.reserve(num_simulations);
386
387 const size_t num_threads = config_.num_threads;
388 const size_t sims_per_thread = num_simulations / num_threads;
389
390 std::vector<std::future<std::vector<S>>> futures;
391
392 for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
393 size_t start_sim = thread_id * sims_per_thread;
394 size_t end_sim = (thread_id == num_threads - 1) ? num_simulations : (thread_id + 1) * sims_per_thread;
395
396 futures.emplace_back(std::async(std::launch::async, [=, this]() {
397 return run_thread_simulations(integrator_factory, initial_condition_generator,
398 dt, end_time, start_sim, end_sim, thread_id);
399 }));
400 }
401
402 // Collect results
403 for (auto& future : futures) {
404 auto thread_results = future.get();
405 final_states.insert(final_states.end(), thread_results.begin(), thread_results.end());
406 }
407
408 return final_states;
409 }
410
/**
 * @brief Snapshot of the synchronizer's performance counters.
 */
struct PerformanceStats {
    size_t noise_generated;  ///< Samples generated so far
    size_t noise_consumed;   ///< Noise requests served so far
    size_t cache_hits;       ///< Requests served from a queue or cache
    size_t cache_misses;     ///< Requests that needed on-demand generation

    /**
     * @brief Fraction of hits among all hit/miss events.
     * @return cache_hits / (cache_hits + cache_misses), or 0.0 when no event
     *         has been recorded.
     */
    double cache_hit_rate() const {
        const size_t total = cache_hits + cache_misses;
        return total > 0 ? static_cast<double>(cache_hits) / total : 0.0;
    }

    /**
     * @brief Generation throughput in millions of samples per second.
     * @param elapsed_time Wall-clock time the counters were collected over.
     * @return noise_generated / (milliseconds * 1000), or 0.0 when
     *         @p elapsed_time is not positive (avoids dividing by zero).
     */
    double throughput_msamples_per_sec(std::chrono::milliseconds elapsed_time) const {
        const auto elapsed_ms = elapsed_time.count();
        if (elapsed_ms <= 0) {
            return 0.0;
        }
        return static_cast<double>(noise_generated) / (static_cast<double>(elapsed_ms) * 1000.0);
    }
};
427
428 PerformanceStats get_statistics() const {
429 return {
430 total_noise_generated_.load(),
431 total_noise_consumed_.load(),
432 cache_hits_.load(),
433 cache_misses_.load()
434 };
435 }
436
441 total_noise_generated_ = 0;
442 total_noise_consumed_ = 0;
443 cache_hits_ = 0;
444 cache_misses_ = 0;
445 }
446
450 void warmup(size_t warmup_samples = 100000) {
451 std::cout << "Warming up high-performance SDE system...\n";
452
453 auto start = std::chrono::high_resolution_clock::now();
454
455 // Generate warmup noise to populate caches
456 for (size_t i = 0; i < warmup_samples; ++i) {
457 get_noise_increment_fast(static_cast<T>(i * 0.01), 0.01, 1);
458 }
459
460 auto end = std::chrono::high_resolution_clock::now();
461 auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
462
463 std::cout << "Warmup completed: " << warmup_samples << " samples in "
464 << duration.count() << "ms\n";
465 std::cout << "Throughput: " << (warmup_samples / (duration.count() * 1000.0))
466 << " M samples/sec\n";
467 }
468
469private:
470 void initialize_system() {
471 // Initialize generators per thread
472 generators_.resize(config_.num_threads);
473 noise_queues_.resize(config_.num_threads);
474
475 for (size_t i = 0; i < config_.num_threads; ++i) {
476 generators_[i] = std::make_unique<SIMDNoiseGenerator>(12345 + i);
477 noise_queues_[i] = std::make_unique<LockFreeNoiseQueue<T>>();
478 }
479
480 // Initialize precomputed cache
481 if (config_.enable_precomputation) {
482 initialize_precomputed_cache();
483 }
484
485 // Start worker threads
486 if (config_.threading_mode != SDEThreadingMode::SINGLE_THREAD) {
487 start_worker_threads();
488 }
489 }
490
491 void initialize_precomputed_cache() {
492 precomputed_cache_.resize(config_.num_threads);
493
494 std::vector<std::future<void>> futures;
495
496 for (size_t i = 0; i < config_.num_threads; ++i) {
497 futures.emplace_back(std::async(std::launch::async, [this, i]() {
498 auto& cache = precomputed_cache_[i];
499 cache.reserve(config_.precompute_buffer_size);
500
501 // Pre-generate noise samples
502 for (size_t j = 0; j < config_.precompute_buffer_size; ++j) {
503 cache.push_back(generators_[i]->generate_batch(1)[0]);
504 }
505 }));
506 }
507
508 // Wait for all threads to complete
509 for (auto& future : futures) {
510 future.wait();
511 }
512
513 std::cout << "Precomputed " << (config_.num_threads * config_.precompute_buffer_size)
514 << " noise samples\n";
515 }
516
517 void start_worker_threads() {
518 running_ = true;
519 worker_threads_.resize(config_.num_threads);
520
521 for (size_t i = 0; i < config_.num_threads; ++i) {
522 worker_threads_[i] = std::thread([this, i]() {
523 worker_thread_function(i);
524 });
525
526 // Pin threads to cores if requested
527 if (config_.pin_threads) {
528#ifdef __linux__
529 cpu_set_t cpuset;
530 CPU_ZERO(&cpuset);
531 CPU_SET(i % std::thread::hardware_concurrency(), &cpuset);
532 pthread_setaffinity_np(worker_threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
533#endif
534 }
535 }
536 }
537
538 void worker_thread_function(size_t thread_id) {
539 while (running_) {
540 // Generate noise in batches when queues get low
541 if (noise_queues_[thread_id]->size() < config_.queue_size / 4) {
542 auto noise_batch = generators_[thread_id]->generate_batch(config_.batch_size);
543
544 for (size_t i = 0; i < noise_batch.size(); ++i) {
545 NoiseData<T> data(static_cast<T>(i * 0.01), {noise_batch[i]}, NoiseProcessType::WIENER);
546 noise_queues_[thread_id]->push(data);
547 }
548
549 total_noise_generated_ += noise_batch.size();
550 }
551
552 // Brief sleep to prevent spinning
553 std::this_thread::sleep_for(std::chrono::microseconds(10));
554 }
555 }
556
557 NoiseData<T> get_vectorized_noise(T current_time, T dt, size_t dimensions) {
558 size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;
559 auto noise_values = generators_[thread_id]->generate_batch(dimensions);
560
561 total_noise_generated_ += dimensions;
562 return NoiseData<T>(current_time, std::move(noise_values), NoiseProcessType::WIENER);
563 }
564
565 NoiseData<T> get_lockfree_noise(T current_time, T dt, size_t dimensions) {
566 size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;
567
568 NoiseData<T> result;
569 if (noise_queues_[thread_id]->pop(result)) {
570 cache_hits_++;
571 return result;
572 }
573
574 // Cache miss - generate immediately
575 cache_misses_++;
576 return get_vectorized_noise(current_time, dt, dimensions);
577 }
578
579 NoiseData<T> get_multithreaded_noise(T current_time, T dt, size_t dimensions) {
580 return get_vectorized_noise(current_time, dt, dimensions);
581 }
582
583 NoiseData<T> get_cached_noise(T current_time, T dt, size_t dimensions) {
584 if (!config_.enable_precomputation || precomputed_cache_.empty()) {
585 return get_vectorized_noise(current_time, dt, dimensions);
586 }
587
588 size_t thread_id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % config_.num_threads;
589 size_t index = cache_index_++ % config_.precompute_buffer_size;
590
591 std::vector<double> values;
592 values.reserve(dimensions);
593
594 for (size_t i = 0; i < dimensions; ++i) {
595 size_t cache_idx = (index + i) % precomputed_cache_[thread_id].size();
596 values.push_back(precomputed_cache_[thread_id][cache_idx]);
597 }
598
599 cache_hits_++;
600 return NoiseData<T>(current_time, std::move(values), NoiseProcessType::WIENER);
601 }
602
603 std::vector<NoiseData<T>> generate_vectorized_batch(T current_time, T dt,
604 size_t dimensions, size_t num_simulations) {
605 std::vector<NoiseData<T>> results;
606 results.reserve(num_simulations);
607
608 const size_t total_samples = dimensions * num_simulations;
609 const size_t batches = (total_samples + config_.batch_size - 1) / config_.batch_size;
610
611 std::vector<std::future<std::vector<double>>> futures;
612
613 for (size_t batch = 0; batch < batches; ++batch) {
614 size_t batch_start = batch * config_.batch_size;
615 size_t batch_end = std::min(batch_start + config_.batch_size, total_samples);
616 size_t batch_size = batch_end - batch_start;
617
618 size_t thread_id = batch % config_.num_threads;
619
620 futures.emplace_back(std::async(std::launch::async, [this, thread_id, batch_size]() {
621 return generators_[thread_id]->generate_batch(batch_size);
622 }));
623 }
624
625 // Collect and organize results
626 size_t sample_idx = 0;
627 for (auto& future : futures) {
628 auto batch_samples = future.get();
629
630 for (double sample : batch_samples) {
631 size_t sim_id = sample_idx / dimensions;
632 size_t dim_id = sample_idx % dimensions;
633
634 if (dim_id == 0) {
635 results.emplace_back(current_time + sim_id * dt, std::vector<double>{}, NoiseProcessType::WIENER);
636 results.back().increments.reserve(dimensions);
637 }
638
639 results[sim_id].increments.push_back(sample);
640 sample_idx++;
641 }
642 }
643
644 total_noise_generated_ += total_samples;
645 return results;
646 }
647
648 std::vector<NoiseData<T>> generate_standard_batch(T current_time, T dt,
649 size_t dimensions, size_t num_simulations) {
650 std::vector<NoiseData<T>> results;
651 results.reserve(num_simulations);
652
653 for (size_t i = 0; i < num_simulations; ++i) {
654 results.push_back(get_noise_increment_fast(current_time + i * dt, dt, dimensions));
655 }
656
657 return results;
658 }
659
660 template<typename Integrator, typename InitialCondition>
661 std::vector<S> run_thread_simulations(std::function<std::unique_ptr<Integrator>()> integrator_factory,
662 std::function<S()> initial_condition_generator,
663 T dt, T end_time, size_t start_sim, size_t end_sim,
664 size_t thread_id) {
665 std::vector<S> results;
666 results.reserve(end_sim - start_sim);
667
668 for (size_t sim = start_sim; sim < end_sim; ++sim) {
669 auto integrator = integrator_factory();
670 S state = initial_condition_generator();
671
672 // Configure integrator to use this synchronizer
673 // This would require integrator to accept noise source
674
675 integrator->integrate(state, dt, end_time);
676 results.push_back(state);
677 }
678
679 return results;
680 }
681
682 void shutdown() {
683 running_ = false;
684
685 for (auto& thread : worker_threads_) {
686 if (thread.joinable()) {
687 thread.join();
688 }
689 }
690
691 worker_threads_.clear();
692 }
693};
694
695// ============================================================================
696// FIBER/COROUTINE SUPPORT (C++20)
697// ============================================================================
698
699#ifdef __cpp_impl_coroutine
700
704template<system_state S, can_be_time T = double>
705class FiberSDESynchronizer {
706public:
707 struct NoiseAwaitable {
708 T time;
709 T dt;
710 size_t dimensions;
711
712 bool await_ready() const noexcept { return false; }
713
714 void await_suspend(std::coroutine_handle<> handle) const {
715 // Schedule noise generation
716 }
717
718 NoiseData<T> await_resume() const {
719 // Return generated noise
720 return NoiseData<T>(time, std::vector<double>(dimensions, 0.1), NoiseProcessType::WIENER);
721 }
722 };
723
724 NoiseAwaitable get_noise_async(T time, T dt, size_t dimensions = 1) {
725 return {time, dt, dimensions};
726 }
727};
728
729#endif
730
731// ============================================================================
732// CONVENIENCE FUNCTIONS
733// ============================================================================
734
738template<system_state S, can_be_time T = double>
739auto create_monte_carlo_system(size_t num_simulations, size_t num_threads = 0) {
740 SDEThreadingConfig config = SDEThreadingConfig::auto_detect();
741
742 if (num_threads > 0) {
743 config.num_threads = num_threads;
744 }
745
746 // Optimize for Monte Carlo
747 config.threading_mode = SDEThreadingMode::VECTORIZED;
748 config.enable_precomputation = true;
749 config.enable_simd = true;
750 config.batch_size = std::max(size_t(1000), num_simulations / config.num_threads);
751
752 return std::make_unique<HighPerformanceSDESynchronizer<S, T>>(config);
753}
754
758template<system_state S, can_be_time T = double>
759auto create_realtime_system() {
760 SDEThreadingConfig config;
761 config.threading_mode = SDEThreadingMode::LOCK_FREE;
762 config.memory_strategy = MemoryStrategy::CACHE_ALIGNED;
763 config.enable_precomputation = true;
764 config.pin_threads = true;
765 config.batch_size = 1; // Minimal latency
766
767 return std::make_unique<HighPerformanceSDESynchronizer<S, T>>(config);
768}
769
773template<system_state S, can_be_time T = double>
774auto create_numa_system(const std::vector<int>& numa_nodes = {}) {
775 SDEThreadingConfig config = SDEThreadingConfig::auto_detect();
776 config.threading_mode = SDEThreadingMode::NUMA_AWARE;
777 config.memory_strategy = MemoryStrategy::NUMA_LOCAL;
778 config.numa_aware = true;
779 config.numa_nodes = numa_nodes;
780 config.use_huge_pages = true;
781
782 return std::make_unique<HighPerformanceSDESynchronizer<S, T>>(config);
783}
784
785} // namespace diffeq::core::composable
High-performance multi-threaded SDE synchronizer.
HighPerformanceSDESynchronizer(SDEThreadingConfig config=SDEThreadingConfig::auto_detect())
Construct high-performance SDE synchronizer.
void warmup(size_t warmup_samples=100000)
Warmup system for optimal performance.
NoiseData< T > get_noise_increment_fast(T current_time, T dt, size_t dimensions=1)
Get noise increment with ultra-low latency.
std::vector< NoiseData< T > > generate_monte_carlo_batch(T current_time, T dt, size_t dimensions, size_t num_simulations)
Generate batch of noise for Monte Carlo simulations.
auto monte_carlo_integrate(std::function< std::unique_ptr< Integrator >()> integrator_factory, std::function< S()> initial_condition_generator, T dt, T end_time, size_t num_simulations)
Monte Carlo integration with automatic parallelization.
Lock-free noise data queue for high-performance scenarios.
std::vector< double > generate_batch(size_t count, double intensity=1.0)
Generate batch of noise using SIMD when possible.
Configuration for high-performance SDE threading.
static SDEThreadingConfig auto_detect()
Auto-detect optimal configuration.
void validate() const
Validate configuration.