You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
464 lines
15 KiB
464 lines
15 KiB
/*
|
|
Copyright 2005-2013 Intel Corporation. All Rights Reserved.
|
|
|
|
This file is part of Threading Building Blocks.
|
|
|
|
Threading Building Blocks is free software; you can redistribute it
|
|
and/or modify it under the terms of the GNU General Public License
|
|
version 2 as published by the Free Software Foundation.
|
|
|
|
Threading Building Blocks is distributed in the hope that it will be
|
|
useful, but WITHOUT ANY WARRANTY; without even the implied warranty
|
|
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Threading Building Blocks; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
As a special exception, you may use this file as part of a free software
|
|
library without restriction. Specifically, if other files instantiate
|
|
templates or use macros or inline functions from this file, or you compile
|
|
this file and link it with other files to produce an executable, this
|
|
file does not by itself cause the resulting executable to be covered by
|
|
the GNU General Public License. This exception does not however
|
|
invalidate any other reasons why the executable file might be covered by
|
|
the GNU General Public License.
|
|
*/
|
|
|
|
#include "perf.h"
|
|
|
|
#include <cmath>
|
|
|
|
#include "tbb/blocked_range.h"
|
|
#include "tbb/parallel_for.h"
|
|
#include "tbb/parallel_reduce.h"
|
|
|
|
// Task-tree dimensions used by the task-spawning tests.
#define NUM_CHILD_TASKS 2096
#define NUM_ROOT_TASKS 256

// Upper bound of the iteration space used by the parallel algorithm tests.
#define N 100000000

// Grain sizes, ordered from the finest to the coarsest.
#define FINEST_GRAIN 10
#define FINE_GRAIN 50
#define MED_GRAIN 200
#define COARSE_GRAIN 1000

//! Integral type used for every counter, range bound and grain size in this file.
typedef int count_t;

// Iteration-space sizes derived from N; each one is computed from the previous.
const count_t N_finest = static_cast<count_t>( N / log( static_cast<double>(N) ) / 10 );
const count_t N_fine = 20 * N_finest;
// The logarithm is truncated to an integer before the multiplication.
const count_t N_med = N_fine * static_cast<count_t>( log( static_cast<double>(N) ) ) / 5;
|
|
|
|
class StaticTaskHolder {
|
|
public:
|
|
tbb::task *my_leafTaskPtr;
|
|
StaticTaskHolder ();
|
|
};
|
|
|
|
static StaticTaskHolder s_tasks;
|
|
|
|
static count_t NumIterations;
|
|
static count_t NumLeafTasks;
|
|
static count_t NumRootTasks;
|
|
|
|
class LeafTaskBase : public tbb::task {
|
|
public:
|
|
count_t my_ID;
|
|
|
|
LeafTaskBase () {}
|
|
LeafTaskBase ( count_t id ) : my_ID(id) {}
|
|
};
|
|
|
|
class SimpleLeafTask : public LeafTaskBase {
|
|
task* execute () {
|
|
volatile count_t anchor = 0;
|
|
for ( count_t i=0; i < NumIterations; ++i )
|
|
anchor += i;
|
|
return NULL;
|
|
}
|
|
public:
|
|
SimpleLeafTask ( count_t ) {}
|
|
};
|
|
|
|
//! Points my_leafTaskPtr at a function-local static SimpleLeafTask.
StaticTaskHolder::StaticTaskHolder () {
    static SimpleLeafTask s_leaf(0);
    my_leafTaskPtr = &s_leaf;
}
|
|
|
|
class Test_SPMC : public Perf::Test {
|
|
protected:
|
|
static const int numWorkloads = 4;
|
|
static const count_t workloads[numWorkloads];
|
|
|
|
LeafTaskBase* my_leafTaskPtr;
|
|
|
|
const char* Name () { return "SPMC"; }
|
|
|
|
int NumWorkloads () { return numWorkloads; }
|
|
|
|
void SetWorkload ( int idx ) {
|
|
NumRootTasks = 1;
|
|
NumIterations = workloads[idx];
|
|
NumLeafTasks = NUM_CHILD_TASKS * NUM_ROOT_TASKS / (NumIterations > 1000 ? 32 : 8);
|
|
Perf::SetWorkloadName( "%d x %d", NumLeafTasks, NumIterations );
|
|
}
|
|
|
|
void Run ( ThreadInfo& ) {
|
|
tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;
|
|
r.set_ref_count( NumLeafTasks + 1 );
|
|
for ( count_t i = 0; i < NumLeafTasks; ++i )
|
|
r.spawn( *new(r.allocate_child()) SimpleLeafTask(0) );
|
|
r.wait_for_all();
|
|
tbb::task::destroy(r);
|
|
}
|
|
|
|
void RunSerial ( ThreadInfo& ) {
|
|
const count_t n = NumLeafTasks * NumRootTasks;
|
|
for ( count_t i=0; i < n; ++i ) {
|
|
my_leafTaskPtr->my_ID = i;
|
|
my_leafTaskPtr->execute();
|
|
}
|
|
}
|
|
|
|
public:
|
|
Test_SPMC ( LeafTaskBase* leafTaskPtr = NULL ) {
|
|
static SimpleLeafTask t(0);
|
|
my_leafTaskPtr = leafTaskPtr ? leafTaskPtr : &t;
|
|
}
|
|
}; // class Test_SPMC
|
|
|
|
const count_t Test_SPMC::workloads[Test_SPMC::numWorkloads] = { 1, 50, 500, 5000 };
|
|
|
|
template<class LeafTask>
|
|
class LeavesLauncherTask : public tbb::task {
|
|
count_t my_groupId;
|
|
|
|
task* execute () {
|
|
count_t base = my_groupId * NumLeafTasks;
|
|
set_ref_count(NumLeafTasks + 1);
|
|
for ( count_t i = 0; i < NumLeafTasks; ++i )
|
|
spawn( *new(allocate_child()) LeafTask(base + i) );
|
|
wait_for_all();
|
|
return NULL;
|
|
}
|
|
public:
|
|
LeavesLauncherTask ( count_t groupId ) : my_groupId(groupId) {}
|
|
};
|
|
|
|
template<class LeafTask>
|
|
void RunShallowTree () {
|
|
tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;
|
|
r.set_ref_count( NumRootTasks + 1 );
|
|
for ( count_t i = 0; i < NumRootTasks; ++i )
|
|
r.spawn( *new(r.allocate_child()) LeavesLauncherTask<LeafTask>(i) );
|
|
r.wait_for_all();
|
|
tbb::task::destroy(r);
|
|
}
|
|
|
|
class Test_ShallowTree : public Test_SPMC {
|
|
const char* Name () { return "ShallowTree"; }
|
|
|
|
void SetWorkload ( int idx ) {
|
|
NumRootTasks = NUM_ROOT_TASKS;
|
|
NumIterations = workloads[idx];
|
|
NumLeafTasks = NumIterations > 200 ? NUM_CHILD_TASKS / 10 :
|
|
(NumIterations > 50 ? NUM_CHILD_TASKS / 2 : NUM_CHILD_TASKS * 2);
|
|
Perf::SetWorkloadName( "%d x %d", NumRootTasks * NumLeafTasks, NumIterations );
|
|
}
|
|
|
|
void Run ( ThreadInfo& ) {
|
|
RunShallowTree<SimpleLeafTask>();
|
|
}
|
|
}; // class Test_ShallowTree
|
|
|
|
class LeafTaskSkewed : public LeafTaskBase {
|
|
task* execute () {
|
|
volatile count_t anchor = 0;
|
|
double K = (double)NumRootTasks * NumLeafTasks;
|
|
count_t n = count_t(sqrt(double(my_ID)) * double(my_ID) * my_ID / (4 * K * K));
|
|
for ( count_t i = 0; i < n; ++i )
|
|
anchor += i;
|
|
return NULL;
|
|
}
|
|
public:
|
|
LeafTaskSkewed ( count_t id ) : LeafTaskBase(id) {}
|
|
};
|
|
|
|
class Test_ShallowTree_Skewed : public Test_SPMC {
|
|
static LeafTaskSkewed SerialTaskBody;
|
|
|
|
const char* Name () { return "ShallowTree_Skewed"; }
|
|
|
|
int NumWorkloads () { return 1; }
|
|
|
|
void SetWorkload ( int ) {
|
|
NumRootTasks = NUM_ROOT_TASKS;
|
|
NumLeafTasks = NUM_CHILD_TASKS;
|
|
Perf::SetWorkloadName( "%d", NumRootTasks * NumLeafTasks );
|
|
}
|
|
|
|
void Run ( ThreadInfo& ) {
|
|
RunShallowTree<LeafTaskSkewed>();
|
|
}
|
|
|
|
public:
|
|
Test_ShallowTree_Skewed () : Test_SPMC(&SerialTaskBody) {}
|
|
}; // class Test_ShallowTree_Skewed
|
|
|
|
LeafTaskSkewed Test_ShallowTree_Skewed::SerialTaskBody(0);
|
|
|
|
typedef tbb::blocked_range<count_t> range_t;
|
|
|
|
static count_t IterRange = N,
|
|
IterGrain = 1;
|
|
|
|
enum PartitionerType {
|
|
SimplePartitioner = 0,
|
|
AutoPartitioner = 1
|
|
};
|
|
|
|
class Test_Algs : public Perf::Test {
|
|
protected:
|
|
static const int numWorkloads = 4;
|
|
static const count_t algRanges[numWorkloads];
|
|
static const count_t algGrains[numWorkloads];
|
|
|
|
tbb::simple_partitioner my_simplePartitioner;
|
|
tbb::auto_partitioner my_autoPartitioner;
|
|
PartitionerType my_partitionerType;
|
|
|
|
bool UseAutoPartitioner () const { return my_partitionerType == AutoPartitioner; }
|
|
|
|
int NumWorkloads () { return UseAutoPartitioner() ? 3 : numWorkloads; }
|
|
|
|
void SetWorkload ( int idx ) {
|
|
if ( UseAutoPartitioner() ) {
|
|
IterRange = algRanges[idx ? numWorkloads - 1 : 0];
|
|
IterGrain = idx > 1 ? algGrains[numWorkloads - 1] : 1;
|
|
}
|
|
else {
|
|
IterRange = algRanges[idx];
|
|
IterGrain = algGrains[idx];
|
|
}
|
|
Perf::SetWorkloadName( "%d / %d", IterRange, IterGrain );
|
|
}
|
|
public:
|
|
Test_Algs ( PartitionerType pt = SimplePartitioner ) : my_partitionerType(pt) {}
|
|
}; // class Test_Algs
|
|
|
|
const count_t Test_Algs::algRanges[] = {N_finest, N_fine, N_med, N};
|
|
const count_t Test_Algs::algGrains[] = {1, FINE_GRAIN, MED_GRAIN, COARSE_GRAIN};
|
|
|
|
template <typename Body>
|
|
class Test_PFor : public Test_Algs {
|
|
protected:
|
|
void Run ( ThreadInfo& ) {
|
|
if ( UseAutoPartitioner() )
|
|
tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_autoPartitioner );
|
|
else
|
|
tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_simplePartitioner );
|
|
}
|
|
|
|
void RunSerial ( ThreadInfo& ) {
|
|
Body body;
|
|
body( range_t(0, IterRange, IterGrain) );
|
|
}
|
|
public:
|
|
Test_PFor ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}
|
|
}; // class Test_PFor
|
|
|
|
class SimpleForBody {
|
|
public:
|
|
void operator()( const range_t& r ) const {
|
|
count_t end = r.end();
|
|
volatile count_t anchor = 0;
|
|
for( count_t i = r.begin(); i < end; ++i )
|
|
anchor += i;
|
|
}
|
|
}; // class SimpleForBody
|
|
|
|
//! parallel_for test with the uniform SimpleForBody.
class Test_PFor_Simple : public Test_PFor<SimpleForBody> {
    typedef Test_PFor<SimpleForBody> base_type;
protected:
    const char* Name () {
        if ( UseAutoPartitioner() )
            return "PFor-AP";
        return "PFor";
    }
public:
    Test_PFor_Simple ( PartitionerType pt = SimplePartitioner ) : base_type(pt) {}
}; // class Test_PFor_Simple
|
|
|
|
class SkewedForBody {
|
|
public:
|
|
void operator()( const range_t& r ) const {
|
|
count_t end = (r.end() + 1) * (r.end() + 1);
|
|
volatile count_t anchor = 0;
|
|
for( count_t i = r.begin() * r.begin(); i < end; ++i )
|
|
anchor += i;
|
|
}
|
|
}; // class SkewedForBody
|
|
|
|
//! parallel_for test with the position-dependent SkewedForBody.
class Test_PFor_Skewed : public Test_PFor<SkewedForBody> {
    typedef Test_PFor<SkewedForBody> base_type;
protected:
    const char* Name () {
        if ( UseAutoPartitioner() )
            return "PFor-Skewed-AP";
        return "PFor-Skewed";
    }

    //! Takes the base workload and then shrinks IterRange to
    //! sqrt(IterRange) * (N / IterRange)^(1/4); the workload name is replaced accordingly.
    void SetWorkload ( int idx ) {
        base_type::SetWorkload(idx);
        const double root = sqrt( (double)IterRange );
        const double scale = sqrt( sqrt( (double)N / IterRange ) );
        IterRange = (count_t)( root * scale );
        Perf::SetWorkloadName( "%d", IterRange );
    }

public:
    Test_PFor_Skewed ( PartitionerType pt = SimplePartitioner ) : base_type(pt) {}
}; // class Test_PFor_Skewed
|
|
|
|
PartitionerType gPartitionerType;
|
|
count_t NestingRange;
|
|
count_t NestingGrain;
|
|
|
|
class NestingForBody {
|
|
count_t my_depth;
|
|
tbb::simple_partitioner my_simplePartitioner;
|
|
tbb::auto_partitioner my_autoPartitioner;
|
|
|
|
template<class Partitioner>
|
|
void run ( const range_t& r, Partitioner& p ) const {
|
|
count_t end = r.end();
|
|
if ( my_depth > 1 )
|
|
for ( count_t i = r.begin(); i < end; ++i )
|
|
tbb::parallel_for( range_t(0, IterRange, IterGrain), NestingForBody(my_depth - 1), p );
|
|
else
|
|
for ( count_t i = r.begin(); i < end; ++i )
|
|
tbb::parallel_for( range_t(0, IterRange, IterGrain), SimpleForBody(), p );
|
|
}
|
|
public:
|
|
void operator()( const range_t& r ) const {
|
|
if ( gPartitionerType == AutoPartitioner )
|
|
run( r, my_autoPartitioner );
|
|
else
|
|
run( r, my_simplePartitioner );
|
|
}
|
|
NestingForBody ( count_t depth = 1 ) : my_depth(depth) {}
|
|
}; // class NestingForBody
|
|
|
|
//! Flavour of the nested parallel_for test; the numeric values are used to
//! index the table of test names.
enum NestingType {
    HollowNesting = 0,
    ShallowNesting = 1,
    DeepNesting = 2
};
|
|
|
|
class Test_PFor_Nested : public Test_Algs {
|
|
typedef Test_Algs base_type;
|
|
|
|
NestingType my_nestingType;
|
|
count_t my_nestingDepth;
|
|
|
|
protected:
|
|
const char* Name () {
|
|
static const char* names[] = { "PFor-HollowNested", "PFor-HollowNested-AP",
|
|
"PFor-ShallowNested", "PFor-ShallowNested-AP",
|
|
"PFor-DeeplyNested", "PFor-DeeplyNested-AP" };
|
|
return names[my_nestingType * 2 + my_partitionerType];
|
|
}
|
|
|
|
int NumWorkloads () { return my_nestingType == ShallowNesting ? (UseAutoPartitioner() ? 3 : 2) : 1; }
|
|
|
|
void SetWorkload ( int idx ) {
|
|
gPartitionerType = my_partitionerType;
|
|
if ( my_nestingType == DeepNesting ) {
|
|
NestingRange = 1024;
|
|
IterGrain = NestingGrain = 1;
|
|
IterRange = 4;
|
|
my_nestingDepth = 4;
|
|
}
|
|
else if ( my_nestingType == ShallowNesting ) {
|
|
int i = idx ? numWorkloads - 1 : 0;
|
|
count_t baseRange = algRanges[i];
|
|
count_t baseGrain = !UseAutoPartitioner() || idx > 1 ? algGrains[i] : 1;
|
|
NestingRange = IterRange = (count_t)sqrt((double)baseRange);
|
|
NestingGrain = IterGrain = (count_t)sqrt((double)baseGrain);
|
|
}
|
|
else {
|
|
NestingRange = N / 100;
|
|
NestingGrain = COARSE_GRAIN / 10;
|
|
IterRange = 2;
|
|
IterGrain = 1;
|
|
}
|
|
Perf::SetWorkloadName( "%d / %d", NestingRange, NestingGrain );
|
|
}
|
|
|
|
void Run ( ThreadInfo& ) {
|
|
if ( UseAutoPartitioner() )
|
|
tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_autoPartitioner );
|
|
else
|
|
tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_simplePartitioner );
|
|
}
|
|
|
|
void RunSerial ( ThreadInfo& ) {
|
|
for ( int i = 0; i < NestingRange; ++i ) {
|
|
SimpleForBody body;
|
|
body( range_t(0, IterRange, IterGrain) );
|
|
}
|
|
}
|
|
public:
|
|
Test_PFor_Nested ( NestingType nt, PartitionerType pt ) : base_type(pt), my_nestingType(nt), my_nestingDepth(1) {}
|
|
}; // class Test_PFor_Nested
|
|
|
|
class SimpleReduceBody {
|
|
public:
|
|
count_t my_sum;
|
|
SimpleReduceBody () : my_sum(0) {}
|
|
SimpleReduceBody ( SimpleReduceBody&, tbb::split ) : my_sum(0) {}
|
|
void join( SimpleReduceBody& rhs ) { my_sum += rhs.my_sum;}
|
|
void operator()( const range_t& r ) {
|
|
count_t end = r.end();
|
|
volatile count_t anchor = 0;
|
|
for( count_t i = r.begin(); i < end; ++i )
|
|
anchor += i;
|
|
my_sum = anchor;
|
|
}
|
|
}; // class SimpleReduceBody
|
|
|
|
class Test_PReduce : public Test_Algs {
|
|
protected:
|
|
const char* Name () { return UseAutoPartitioner() ? "PReduce-AP" : "PReduce"; }
|
|
|
|
void Run ( ThreadInfo& ) {
|
|
SimpleReduceBody body;
|
|
if ( UseAutoPartitioner() )
|
|
tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_autoPartitioner );
|
|
else
|
|
tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_simplePartitioner );
|
|
}
|
|
|
|
void RunSerial ( ThreadInfo& ) {
|
|
SimpleReduceBody body;
|
|
body( range_t(0, IterRange, IterGrain) );
|
|
}
|
|
public:
|
|
Test_PReduce ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}
|
|
}; // class Test_PReduce
|
|
|
|
int main( int argc, char* argv[] ) {
|
|
Perf::SessionSettings opts (Perf::UseTaskScheduler | Perf::UseSerialBaseline, "perf_sched.txt"); // Perf::UseBaseline, Perf::UseSmallestWorkloadOnly
|
|
Perf::RegisterTest<Test_SPMC>();
|
|
Perf::RegisterTest<Test_ShallowTree>();
|
|
Perf::RegisterTest<Test_ShallowTree_Skewed>();
|
|
Test_PFor_Simple pf_sp(SimplePartitioner), pf_ap(AutoPartitioner);
|
|
Perf::RegisterTest(pf_sp);
|
|
Perf::RegisterTest(pf_ap);
|
|
Test_PReduce pr_sp(SimplePartitioner), pr_ap(AutoPartitioner);
|
|
Perf::RegisterTest(pr_sp);
|
|
Perf::RegisterTest(pr_ap);
|
|
Test_PFor_Skewed pf_s_sp(SimplePartitioner), pf_s_ap(AutoPartitioner);
|
|
Perf::RegisterTest(pf_s_sp);
|
|
Perf::RegisterTest(pf_s_ap);
|
|
Test_PFor_Nested pf_hn_sp(HollowNesting, SimplePartitioner), pf_hn_ap(HollowNesting, AutoPartitioner),
|
|
pf_sn_sp(ShallowNesting, SimplePartitioner), pf_sn_ap(ShallowNesting, AutoPartitioner),
|
|
pf_dn_sp(DeepNesting, SimplePartitioner), pf_dn_ap(DeepNesting, AutoPartitioner);
|
|
Perf::RegisterTest(pf_hn_sp);
|
|
Perf::RegisterTest(pf_hn_ap);
|
|
Perf::RegisterTest(pf_sn_sp);
|
|
Perf::RegisterTest(pf_sn_ap);
|
|
Perf::RegisterTest(pf_dn_sp);
|
|
Perf::RegisterTest(pf_dn_ap);
|
|
return Perf::TestMain(argc, argv, &opts);
|
|
}
|