/*
    Copyright 2005-2013 Intel Corporation.  All Rights Reserved.

    This file is part of Threading Building Blocks.

    Threading Building Blocks is free software; you can redistribute it
    and/or modify it under the terms of the GNU General Public License
    version 2 as published by the Free Software Foundation.

    Threading Building Blocks is distributed in the hope that it will be
    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Threading Building Blocks; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    As a special exception, you may use this file as part of a free software
    library without restriction.  Specifically, if other files instantiate
    templates or use macros or inline functions from this file, or you compile
    this file and link it with other files to produce an executable, this
    file does not by itself cause the resulting executable to be covered by
    the GNU General Public License.  This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

#include "perf.h"

#include <cmath>

#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"

// Task-tree shape used by the task-based tests.
#define NUM_CHILD_TASKS 2096
#define NUM_ROOT_TASKS 256

// Largest iteration space used by the algorithm-based tests.
#define N 100000000

// Grain sizes used to build the algorithm workloads.
#define FINEST_GRAIN 10
#define FINE_GRAIN 50
#define MED_GRAIN 200
#define COARSE_GRAIN 1000

typedef int count_t;

// Smaller iteration spaces derived from N; see Test_Algs::algRanges.
const count_t N_finest = (count_t)(N/log((double)N)/10);
const count_t N_fine = N_finest * 20;
const count_t N_med = N_fine * (count_t)log((double)N) / 5;

//! Holds a pointer to a statically constructed leaf task.
class StaticTaskHolder {
public:
    tbb::task *my_leafTaskPtr;
    StaticTaskHolder ();
};

static StaticTaskHolder s_tasks;

// Current workload parameters; written by the SetWorkload() methods below and
// read by the task bodies.
static count_t NumIterations;
static count_t NumLeafTasks;
static count_t NumRootTasks;

//! Common base of the leaf tasks; carries the ID consumed by skewed workloads.
class LeafTaskBase : public tbb::task {
public:
    count_t my_ID;

    // Zero-initialize so that the ID is never read as an indeterminate value.
    LeafTaskBase () : my_ID(0) {}
    LeafTaskBase ( count_t id ) : my_ID(id) {}
};

//! Leaf task whose body spins for NumIterations iterations.
class SimpleLeafTask : public LeafTaskBase {
    task* execute () {
        // volatile keeps the compiler from eliminating the busy loop
        volatile count_t anchor = 0;
        for ( count_t i=0; i < NumIterations; ++i )
            anchor += i;
        return NULL;
    }
public:
    //! The ID argument is accepted for interface parity with LeafTaskSkewed and ignored.
    SimpleLeafTask ( count_t ) {}
};

StaticTaskHolder::StaticTaskHolder () {
    static SimpleLeafTask s_t1(0);
    my_leafTaskPtr = &s_t1;
}

//! One root task spawns all the leaf tasks itself.
class Test_SPMC : public Perf::Test {
protected:
    static const int numWorkloads = 4;
    static const count_t workloads[numWorkloads];

    //! Task object whose execute() is called directly by the serial baseline.
    LeafTaskBase* my_leafTaskPtr;

    const char* Name () { return "SPMC"; }

    int NumWorkloads () { return numWorkloads; }

    void SetWorkload ( int idx ) {
        NumRootTasks = 1;
        NumIterations = workloads[idx];
        // Fewer leaves are used when each leaf does more than 1000 iterations
        NumLeafTasks = NUM_CHILD_TASKS * NUM_ROOT_TASKS / (NumIterations > 1000 ? 32 : 8);
        Perf::SetWorkloadName( "%d x %d", NumLeafTasks, NumIterations );
    }

    void Run ( ThreadInfo& ) {
        tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;
        // +1 accounts for the wait_for_all() below
        r.set_ref_count( NumLeafTasks + 1 );
        for ( count_t i = 0; i < NumLeafTasks; ++i )
            r.spawn( *new(r.allocate_child()) SimpleLeafTask(0) );
        r.wait_for_all();
        tbb::task::destroy(r);
    }

    void RunSerial ( ThreadInfo& ) {
        const count_t n = NumLeafTasks * NumRootTasks;
        for ( count_t i=0; i < n; ++i ) {
            my_leafTaskPtr->my_ID = i;
            my_leafTaskPtr->execute();
        }
    }

public:
    //! Uses the supplied task for the serial baseline, or a static SimpleLeafTask if none is given.
    Test_SPMC ( LeafTaskBase* leafTaskPtr = NULL ) {
        static SimpleLeafTask t(0);
        my_leafTaskPtr = leafTaskPtr ? leafTaskPtr : &t;
    }
}; // class Test_SPMC

//! Per-leaf iteration counts, one per workload.
const count_t Test_SPMC::workloads[Test_SPMC::numWorkloads] = { 1, 50, 500, 5000 };

//! Second-level task that spawns NumLeafTasks leaves of type LeafTask and waits for them.
template<class LeafTask>
class LeavesLauncherTask : public tbb::task {
    count_t my_groupId;

    task* execute () {
        // Leaf IDs are unique across groups: group g owns [g*NumLeafTasks, (g+1)*NumLeafTasks)
        count_t base = my_groupId * NumLeafTasks;
        set_ref_count(NumLeafTasks + 1);
        for ( count_t i = 0; i < NumLeafTasks; ++i )
            spawn( *new(allocate_child()) LeafTask(base + i) );
        wait_for_all();
        return NULL;
    }
public:
    LeavesLauncherTask ( count_t groupId ) : my_groupId(groupId) {}
};

//! Builds a two-level tree: a root spawns NumRootTasks launchers, each spawning NumLeafTasks leaves.
template<class LeafTask>
void RunShallowTree () {
    tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;
    r.set_ref_count( NumRootTasks + 1 );
    for ( count_t i = 0; i < NumRootTasks; ++i )
        r.spawn( *new(r.allocate_child()) LeavesLauncherTask<LeafTask>(i) );
    r.wait_for_all();
    tbb::task::destroy(r);
}

//! Two-level task tree with uniform leaves.
class Test_ShallowTree : public Test_SPMC {
    const char* Name () { return "ShallowTree"; }

    void SetWorkload ( int idx ) {
        NumRootTasks = NUM_ROOT_TASKS;
        NumIterations = workloads[idx];
        // The heavier each leaf is, the fewer leaves each launcher spawns
        NumLeafTasks = NumIterations > 200 ? NUM_CHILD_TASKS / 10 :
                            (NumIterations > 50 ? NUM_CHILD_TASKS / 2 : NUM_CHILD_TASKS * 2);
        Perf::SetWorkloadName( "%d x %d", NumRootTasks * NumLeafTasks, NumIterations );
    }

    void Run ( ThreadInfo& ) {
        RunShallowTree<SimpleLeafTask>();
    }
}; // class Test_ShallowTree

//! Leaf task whose amount of work grows with its ID.
class LeafTaskSkewed : public LeafTaskBase {
    task* execute () {
        volatile count_t anchor = 0;
        double K = (double)NumRootTasks * NumLeafTasks;
        // Iteration count is ID^2.5 / (4 * K^2), where K is the total number of leaves
        count_t n = count_t(sqrt(double(my_ID)) * double(my_ID) * my_ID / (4 * K * K));
        for ( count_t i = 0; i < n; ++i )
            anchor += i;
        return NULL;
    }
public:
    LeafTaskSkewed ( count_t id ) : LeafTaskBase(id) {}
};

//! Two-level task tree with leaves of ID-dependent weight.
class Test_ShallowTree_Skewed : public Test_SPMC {
    //! Task object handed to Test_SPMC for the serial baseline.
    static LeafTaskSkewed SerialTaskBody;

    const char* Name () { return "ShallowTree_Skewed"; }

    int NumWorkloads () { return 1; }

    void SetWorkload ( int ) {
        NumRootTasks = NUM_ROOT_TASKS;
        NumLeafTasks = NUM_CHILD_TASKS;
        Perf::SetWorkloadName( "%d", NumRootTasks * NumLeafTasks );
    }

    void Run ( ThreadInfo& ) {
        RunShallowTree<LeafTaskSkewed>();
    }

public:
    Test_ShallowTree_Skewed () : Test_SPMC(&SerialTaskBody) {}
}; // class Test_ShallowTree_Skewed

LeafTaskSkewed Test_ShallowTree_Skewed::SerialTaskBody(0);

typedef tbb::blocked_range<count_t> range_t;

// Iteration space and grain size of the current algorithm workload.
static count_t IterRange = N,
               IterGrain = 1;

enum PartitionerType {
    SimplePartitioner = 0,
    AutoPartitioner = 1
};

//! Shared workload selection and partitioner storage for the parallel algorithm tests.
class Test_Algs : public Perf::Test {
protected:
    static const int numWorkloads = 4;
    static const count_t algRanges[numWorkloads];
    static const count_t algGrains[numWorkloads];

    tbb::simple_partitioner my_simplePartitioner;
    tbb::auto_partitioner my_autoPartitioner;
    PartitionerType my_partitionerType;

    bool UseAutoPartitioner () const { return my_partitionerType == AutoPartitioner; }

    int NumWorkloads () { return UseAutoPartitioner() ? 3 : numWorkloads; }

    void SetWorkload ( int idx ) {
        if ( UseAutoPartitioner() ) {
            // Auto partitioner workloads: (smallest range, grain 1),
            // (largest range, grain 1), (largest range, coarsest grain)
            IterRange = algRanges[idx ? numWorkloads - 1 : 0];
            IterGrain = idx > 1 ? algGrains[numWorkloads - 1] : 1;
        }
        else {
            IterRange = algRanges[idx];
            IterGrain = algGrains[idx];
        }
        Perf::SetWorkloadName( "%d / %d", IterRange, IterGrain );
    }
public:
    Test_Algs ( PartitionerType pt = SimplePartitioner ) : my_partitionerType(pt) {}
}; // class Test_Algs

const count_t Test_Algs::algRanges[] = {N_finest, N_fine, N_med, N};
const count_t Test_Algs::algGrains[] = {1, FINE_GRAIN, MED_GRAIN, COARSE_GRAIN};

//! parallel_for test over [0, IterRange) with the loop body type given by Body.
template <typename Body>
class Test_PFor : public Test_Algs {
protected:
    void Run ( ThreadInfo& ) {
        if ( UseAutoPartitioner() )
            tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_autoPartitioner );
        else
            tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_simplePartitioner );
    }

    void RunSerial ( ThreadInfo& ) {
        // Serial baseline: apply the body once to the whole range
        Body body;
        body( range_t(0, IterRange, IterGrain) );
    }
public:
    Test_PFor ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}
}; // class Test_PFor

//! Loop body doing one addition per element of the range.
class SimpleForBody {
public:
    void operator()( const range_t& r ) const {
        count_t end = r.end();
        // volatile keeps the compiler from eliminating the loop
        volatile count_t anchor = 0;
        for( count_t i = r.begin(); i < end; ++i )
            anchor += i;
    }
}; // class SimpleForBody

class Test_PFor_Simple : public Test_PFor<SimpleForBody> {
protected:
    const char* Name () { return UseAutoPartitioner() ? "PFor-AP" : "PFor"; }
public:
    Test_PFor_Simple ( PartitionerType pt = SimplePartitioner ) : Test_PFor<SimpleForBody>(pt) {}
}; // class Test_PFor_Simple

//! Loop body whose work for a subrange grows with the squares of its bounds.
class SkewedForBody {
public:
    void operator()( const range_t& r ) const {
        // Iterates over [begin^2, (end+1)^2)
        count_t end = (r.end() + 1) * (r.end() + 1);
        volatile count_t anchor = 0;
        for( count_t i = r.begin() * r.begin(); i < end; ++i )
            anchor += i;
    }
}; // class SkewedForBody

class Test_PFor_Skewed : public Test_PFor<SkewedForBody> {
    typedef Test_PFor<SkewedForBody> base_type;
protected:
    const char* Name () { return UseAutoPartitioner() ? "PFor-Skewed-AP" : "PFor-Skewed"; }

    void SetWorkload ( int idx ) {
        base_type::SetWorkload(idx);
        // Shrink the range chosen by the base class, since the body's work is
        // quadratic in the range bounds; then rename the workload accordingly.
        IterRange = (count_t)(sqrt((double)IterRange) * sqrt(sqrt((double)N / IterRange)));
        Perf::SetWorkloadName( "%d", IterRange );
    }

public:
    Test_PFor_Skewed ( PartitionerType pt = SimplePartitioner ) : base_type(pt) {}
}; // class Test_PFor_Skewed

// Parameters of the nested parallel_for tests; set by Test_PFor_Nested::SetWorkload().
PartitionerType gPartitionerType;
count_t NestingRange;
count_t NestingGrain;

//! Loop body that launches an inner parallel_for for every element of its subrange.
class NestingForBody {
    count_t my_depth;
    tbb::simple_partitioner my_simplePartitioner;
    tbb::auto_partitioner my_autoPartitioner;

    template<class Partitioner>
    void run ( const range_t& r, Partitioner& p ) const {
        count_t end = r.end();
        if ( my_depth > 1 )
            // Recurse one nesting level deeper
            for ( count_t i = r.begin(); i < end; ++i )
                tbb::parallel_for( range_t(0, IterRange, IterGrain), NestingForBody(my_depth - 1), p );
        else
            // Innermost level runs the plain loop body
            for ( count_t i = r.begin(); i < end; ++i )
                tbb::parallel_for( range_t(0, IterRange, IterGrain), SimpleForBody(), p );
    }
public:
    void operator()( const range_t& r ) const {
        if ( gPartitionerType == AutoPartitioner )
            run( r, my_autoPartitioner );
        else
            run( r, my_simplePartitioner );
    }
    NestingForBody ( count_t depth = 1 ) : my_depth(depth) {}
}; // class NestingForBody

enum NestingType {
    HollowNesting,
    ShallowNesting,
    DeepNesting
};

//! Nested parallel_for test; the nesting type selects the outer/inner range configuration.
class Test_PFor_Nested : public Test_Algs {
    typedef Test_Algs base_type;

    NestingType my_nestingType;
    count_t my_nestingDepth;

protected:
    const char* Name () {
        // Indexed by nesting type (row) and partitioner type (column)
        static const char* names[] = { "PFor-HollowNested", "PFor-HollowNested-AP",
                                       "PFor-ShallowNested", "PFor-ShallowNested-AP",
                                       "PFor-DeeplyNested", "PFor-DeeplyNested-AP" };
        return names[my_nestingType * 2 + my_partitionerType];
    }

    int NumWorkloads () { return my_nestingType == ShallowNesting ? (UseAutoPartitioner() ? 3 : 2) : 1; }

    void SetWorkload ( int idx ) {
        gPartitionerType = my_partitionerType;
        if ( my_nestingType == DeepNesting ) {
            NestingRange = 1024;
            IterGrain = NestingGrain = 1;
            IterRange = 4;
            my_nestingDepth = 4;
        }
        else if ( my_nestingType == ShallowNesting ) {
            // Outer and inner loops each get the square root of a base range/grain
            int i = idx ? numWorkloads - 1 : 0;
            count_t baseRange = algRanges[i];
            count_t baseGrain = !UseAutoPartitioner() || idx > 1 ? algGrains[i] : 1;
            NestingRange = IterRange = (count_t)sqrt((double)baseRange);
            NestingGrain = IterGrain = (count_t)sqrt((double)baseGrain);
        }
        else {
            // HollowNesting: large outer range, two-element inner range
            NestingRange = N / 100;
            NestingGrain = COARSE_GRAIN / 10;
            IterRange = 2;
            IterGrain = 1;
        }
        Perf::SetWorkloadName( "%d / %d", NestingRange, NestingGrain );
    }

    void Run ( ThreadInfo& ) {
        if ( UseAutoPartitioner() )
            tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_autoPartitioner );
        else
            tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_simplePartitioner );
    }

    void RunSerial ( ThreadInfo& ) {
        // NOTE(review): this runs a single level of inner loops regardless of
        // my_nestingDepth, so for DeepNesting it appears to do less work than
        // Run() - confirm whether that is intended for the baseline.
        for ( int i = 0; i < NestingRange; ++i ) {
            SimpleForBody body;
            body( range_t(0, IterRange, IterGrain) );
        }
    }
public:
    Test_PFor_Nested ( NestingType nt, PartitionerType pt ) : base_type(pt), my_nestingType(nt), my_nestingDepth(1) {}
}; // class Test_PFor_Nested

//! Body for parallel_reduce with the same per-element work as SimpleForBody.
class SimpleReduceBody {
public:
    count_t my_sum;
    SimpleReduceBody () : my_sum(0) {}
    SimpleReduceBody ( SimpleReduceBody&, tbb::split ) : my_sum(0) {}
    void join( SimpleReduceBody& rhs ) { my_sum += rhs.my_sum;}
    void operator()( const range_t& r ) {
        count_t end = r.end();
        volatile count_t anchor = 0;
        for( count_t i = r.begin(); i < end; ++i )
            anchor += i;
        // NOTE(review): assignment (not +=) discards any sum from an earlier
        // subrange handled by the same body; the result is not read anywhere
        // in this file - confirm whether accumulation was intended.
        my_sum = anchor;
    }
}; // class SimpleReduceBody

//! parallel_reduce test over [0, IterRange).
class Test_PReduce : public Test_Algs {
protected:
    const char* Name () { return UseAutoPartitioner() ? "PReduce-AP" : "PReduce"; }

    void Run ( ThreadInfo& ) {
        SimpleReduceBody body;
        if ( UseAutoPartitioner() )
            tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_autoPartitioner );
        else
            tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_simplePartitioner );
    }

    void RunSerial ( ThreadInfo& ) {
        SimpleReduceBody body;
        body( range_t(0, IterRange, IterGrain) );
    }
public:
    Test_PReduce ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}
}; // class Test_PReduce

//! Registers every test (in simple- and auto-partitioner variants where applicable) and runs the session.
int main( int argc, char* argv[] ) {
    Perf::SessionSettings opts (Perf::UseTaskScheduler | Perf::UseSerialBaseline, "perf_sched.txt"); // Perf::UseBaseline, Perf::UseSmallestWorkloadOnly

    // Task-based tests
    Perf::RegisterTest<Test_SPMC>();
    Perf::RegisterTest<Test_ShallowTree>();
    Perf::RegisterTest<Test_ShallowTree_Skewed>();

    // Algorithm-based tests
    Test_PFor_Simple pf_sp(SimplePartitioner), pf_ap(AutoPartitioner);
    Perf::RegisterTest(pf_sp);
    Perf::RegisterTest(pf_ap);
    Test_PReduce pr_sp(SimplePartitioner), pr_ap(AutoPartitioner);
    Perf::RegisterTest(pr_sp);
    Perf::RegisterTest(pr_ap);
    Test_PFor_Skewed pf_s_sp(SimplePartitioner), pf_s_ap(AutoPartitioner);
    Perf::RegisterTest(pf_s_sp);
    Perf::RegisterTest(pf_s_ap);
    Test_PFor_Nested pf_hn_sp(HollowNesting, SimplePartitioner), pf_hn_ap(HollowNesting, AutoPartitioner),
                     pf_sn_sp(ShallowNesting, SimplePartitioner), pf_sn_ap(ShallowNesting, AutoPartitioner),
                     pf_dn_sp(DeepNesting, SimplePartitioner), pf_dn_ap(DeepNesting, AutoPartitioner);
    Perf::RegisterTest(pf_hn_sp);
    Perf::RegisterTest(pf_hn_ap);
    Perf::RegisterTest(pf_sn_sp);
    Perf::RegisterTest(pf_sn_ap);
    Perf::RegisterTest(pf_dn_sp);
    Perf::RegisterTest(pf_dn_ap);
    return Perf::TestMain(argc, argv, &opts);
}