tempest/resources/3rdparty/tbb41_20130314_merged-win-l.../src/perf/perf_sched.cpp


								/*

								    Copyright 2005-2013 Intel Corporation.  All Rights Reserved.


								    This file is part of Threading Building Blocks.


								    Threading Building Blocks is free software; you can redistribute it

								    and/or modify it under the terms of the GNU General Public License

								    version 2 as published by the Free Software Foundation.


								    Threading Building Blocks is distributed in the hope that it will be

								    useful, but WITHOUT ANY WARRANTY; without even the implied warranty

								    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								    GNU General Public License for more details.


								    You should have received a copy of the GNU General Public License

								    along with Threading Building Blocks; if not, write to the Free Software

								    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


								    As a special exception, you may use this file as part of a free software

								    library without restriction.  Specifically, if other files instantiate

								    templates or use macros or inline functions from this file, or you compile

								    this file and link it with other files to produce an executable, this

								    file does not by itself cause the resulting executable to be covered by

								    the GNU General Public License.  This exception does not however

								    invalidate any other reasons why the executable file might be covered by

								    the GNU General Public License.

								*/


								#include "perf.h"


								#include <cmath>


								#include "tbb/blocked_range.h"

								#include "tbb/parallel_for.h"

								#include "tbb/parallel_reduce.h"


								// Task-tree shape used by the explicit task tests.
								#define NUM_CHILD_TASKS     2096
								#define NUM_ROOT_TASKS      256

								// Largest iteration space and the grain sizes used by the algorithm tests.
								#define N               100000000
								#define FINEST_GRAIN    10
								#define FINE_GRAIN      50
								#define MED_GRAIN       200
								#define COARSE_GRAIN    1000

								// Integer type used for every counter, range bound and task ID in this file.
								typedef int count_t;

								// Smaller iteration spaces derived from N and its natural logarithm:
								//   N_finest = N / ln(N) / 10   (evaluated in double, then truncated)
								//   N_fine   = 20 * N_finest
								//   N_med    = N_fine * trunc(ln(N)) / 5   (integer arithmetic, left to right)
								const count_t N_finest = static_cast<count_t>( N / std::log(static_cast<double>(N)) / 10 );
								const count_t N_fine = 20 * N_finest;
								const count_t N_med = N_fine * static_cast<count_t>( std::log(static_cast<double>(N)) ) / 5;


								// Holds a pointer to a leaf task object. The constructor is only declared
								// here; its definition appears later in the file, after SimpleLeafTask.
								class StaticTaskHolder {
								public:
								    StaticTaskHolder ();

								    // Set by the constructor.
								    tbb::task *my_leafTaskPtr;
								};


								// File-local holder instance, built during static initialization.
								static StaticTaskHolder s_tasks;

								// Current workload parameters, written by the SetWorkload() methods below
								// and read by the task bodies. The explicit zero initializers match the
								// zero-initialization these statics receive anyway.
								static count_t NumIterations = 0;   // loop trip count inside SimpleLeafTask
								static count_t NumLeafTasks = 0;    // leaf tasks spawned per parent task
								static count_t NumRootTasks = 0;    // parent (launcher) tasks per run


								class LeafTaskBase : public tbb::task {

								public:

								    count_t my_ID;


								    LeafTaskBase () {}

								    LeafTaskBase ( count_t id ) : my_ID(id) {}

								};


								// Leaf task whose body is a fixed-length busy loop of NumIterations additions.
								class SimpleLeafTask : public LeafTaskBase {
								    // Adds 0..NumIterations-1 into a volatile accumulator so the loop
								    // cannot be optimized away. Always returns NULL.
								    task* execute () {
								        volatile count_t sink = 0;
								        count_t step = 0;
								        while ( step < NumIterations ) {
								            sink += step;
								            ++step;
								        }
								        return NULL;
								    }
								public:
								    // The argument is accepted for interface parity with other leaf
								    // types but is not used.
								    SimpleLeafTask ( count_t ) {}
								};


								// Points my_leafTaskPtr at a function-local static SimpleLeafTask.
								StaticTaskHolder::StaticTaskHolder () : my_leafTaskPtr(NULL) {
								    static SimpleLeafTask s_leaf(0);
								    my_leafTaskPtr = &s_leaf;
								}


								class Test_SPMC : public Perf::Test {

								protected:

								    static const int numWorkloads = 4;

								    static const count_t workloads[numWorkloads];


								    LeafTaskBase* my_leafTaskPtr;


								    const char* Name () { return "SPMC"; }


								    int NumWorkloads () { return numWorkloads; }


								    void SetWorkload ( int idx ) {

								        NumRootTasks = 1;

								        NumIterations = workloads[idx];

								        NumLeafTasks = NUM_CHILD_TASKS * NUM_ROOT_TASKS / (NumIterations > 1000 ? 32 : 8);

								        Perf::SetWorkloadName( "%d x %d", NumLeafTasks, NumIterations );

								    }


								    void Run ( ThreadInfo& ) {

								        tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;

								        r.set_ref_count( NumLeafTasks + 1 );

								        for ( count_t i = 0; i < NumLeafTasks; ++i )

								            r.spawn( *new(r.allocate_child()) SimpleLeafTask(0) );

								        r.wait_for_all();

								        tbb::task::destroy(r);

								    }


								    void RunSerial ( ThreadInfo& ) {

								        const count_t n = NumLeafTasks * NumRootTasks;

								        for ( count_t i=0; i < n; ++i ) {

								            my_leafTaskPtr->my_ID = i;

								            my_leafTaskPtr->execute();

								        }

								    }


								public:

								    Test_SPMC ( LeafTaskBase* leafTaskPtr = NULL ) {

								        static SimpleLeafTask t(0);

								        my_leafTaskPtr = leafTaskPtr ? leafTaskPtr : &t;

								    }

								}; // class Test_SPMC


								const count_t Test_SPMC::workloads[Test_SPMC::numWorkloads] = { 1, 50, 500, 5000 };


								// Intermediate task that spawns NumLeafTasks leaves of type LeafTask and
								// waits for them. Leaf IDs are consecutive, starting at
								// my_groupId * NumLeafTasks.
								template<class LeafTask>
								class LeavesLauncherTask : public tbb::task {
								    // Index of this launcher among its siblings.
								    count_t my_groupId;

								    task* execute () {
								        const count_t firstId = my_groupId * NumLeafTasks;
								        // One reference per child plus one for the wait itself.
								        set_ref_count(NumLeafTasks + 1);
								        for ( count_t offset = 0; offset < NumLeafTasks; ++offset ) {
								            tbb::task& leaf = *new(allocate_child()) LeafTask(firstId + offset);
								            spawn( leaf );
								        }
								        wait_for_all();
								        return NULL;
								    }
								public:
								    LeavesLauncherTask ( count_t groupId ) : my_groupId(groupId) {}
								};


								template<class LeafTask>

								void RunShallowTree () {

								    tbb::empty_task &r = *new( tbb::task::allocate_root() ) tbb::empty_task;

								    r.set_ref_count( NumRootTasks + 1 );

								    for ( count_t i = 0; i < NumRootTasks; ++i )

								        r.spawn( *new(r.allocate_child()) LeavesLauncherTask<LeafTask>(i) );

								    r.wait_for_all();

								    tbb::task::destroy(r);

								}


								// Two-level task tree test with uniform SimpleLeafTask leaves.
								class Test_ShallowTree : public Test_SPMC {
								    const char* Name () { return "ShallowTree"; }

								    // Selects workload idx. The number of leaves per launcher shrinks as
								    // the per-leaf iteration count grows:
								    //   > 200 iterations -> NUM_CHILD_TASKS / 10
								    //   > 50 iterations  -> NUM_CHILD_TASKS / 2
								    //   otherwise        -> NUM_CHILD_TASKS * 2
								    void SetWorkload ( int idx ) {
								        NumRootTasks = NUM_ROOT_TASKS;
								        NumIterations = workloads[idx];
								        if ( NumIterations > 200 )
								            NumLeafTasks = NUM_CHILD_TASKS / 10;
								        else if ( NumIterations > 50 )
								            NumLeafTasks = NUM_CHILD_TASKS / 2;
								        else
								            NumLeafTasks = NUM_CHILD_TASKS * 2;
								        Perf::SetWorkloadName( "%d x %d", NumRootTasks * NumLeafTasks, NumIterations );
								    }

								    void Run ( ThreadInfo& ) {
								        RunShallowTree<SimpleLeafTask>();
								    }
								}; // class Test_ShallowTree


								// Leaf task whose amount of work grows with its ID, so the load across
								// leaves is uneven.
								class LeafTaskSkewed : public LeafTaskBase {
								    // Runs a busy loop of ID^2.5 / (4 * K^2) additions (truncated), where
								    // K = NumRootTasks * NumLeafTasks. Always returns NULL.
								    task* execute () {
								        volatile count_t sink = 0;
								        const double totalLeaves = (double)NumRootTasks * NumLeafTasks;
								        const double id = double(my_ID);
								        const count_t limit = count_t(sqrt(id) * id * id / (4 * totalLeaves * totalLeaves));
								        count_t step = 0;
								        while ( step < limit ) {
								            sink += step;
								            ++step;
								        }
								        return NULL;
								    }
								public:
								    LeafTaskSkewed ( count_t id ) : LeafTaskBase(id) {}
								};


								// Two-level task tree test with LeafTaskSkewed leaves. It exposes a
								// single workload of NUM_ROOT_TASKS x NUM_CHILD_TASKS leaves.
								class Test_ShallowTree_Skewed : public Test_SPMC {
								    // Task object handed to the base class for the serial baseline.
								    static LeafTaskSkewed SerialTaskBody;

								    const char* Name () { return "ShallowTree_Skewed"; }

								    int NumWorkloads () { return 1; }

								    // The workload index is ignored: the tree shape is fixed.
								    void SetWorkload ( int ) {
								        NumLeafTasks = NUM_CHILD_TASKS;
								        NumRootTasks = NUM_ROOT_TASKS;
								        Perf::SetWorkloadName( "%d", NumRootTasks * NumLeafTasks );
								    }

								    void Run ( ThreadInfo& ) {
								        RunShallowTree<LeafTaskSkewed>();
								    }

								public:
								    Test_ShallowTree_Skewed () : Test_SPMC(&SerialTaskBody) {}
								}; // class Test_ShallowTree_Skewed

								// Definition of the serial-baseline task body (initial ID 0).
								LeafTaskSkewed Test_ShallowTree_Skewed::SerialTaskBody(0);


								// Iteration range type used by all parallel_for / parallel_reduce tests.
								typedef tbb::blocked_range<count_t> range_t;

								// Size and grain of the iteration space currently under test; both are
								// rewritten by the SetWorkload() methods of the algorithm tests.
								static count_t IterRange = N;
								static count_t IterGrain = 1;


								// Selects which TBB partitioner a test uses. The numeric values matter:
								// Test_PFor_Nested::Name() adds the value to a table index.
								enum PartitionerType {
								    SimplePartitioner = 0,  // run with tbb::simple_partitioner
								    AutoPartitioner = 1     // run with tbb::auto_partitioner
								};


								// Shared base of the parallel algorithm tests. It owns one partitioner of
								// each kind and translates a workload index into IterRange / IterGrain.
								class Test_Algs : public Perf::Test {
								protected:
								    static const int numWorkloads = 4;
								    // Range sizes and matching grain sizes, indexed by workload.
								    static const count_t algRanges[numWorkloads];
								    static const count_t algGrains[numWorkloads];

								    tbb::simple_partitioner    my_simplePartitioner;
								    tbb::auto_partitioner    my_autoPartitioner;
								    PartitionerType my_partitionerType;

								    bool UseAutoPartitioner () const { return my_partitionerType == AutoPartitioner; }

								    // Three workloads with the auto partitioner, otherwise the full table.
								    int NumWorkloads () {
								        if ( UseAutoPartitioner() )
								            return 3;
								        return numWorkloads;
								    }

								    // Simple partitioner: range and grain come straight from the tables.
								    // Auto partitioner: index 0 uses the smallest range, any other index
								    // the largest; the grain is 1 unless idx > 1, in which case the
								    // largest table grain is used.
								    void SetWorkload ( int idx ) {
								        if ( !UseAutoPartitioner() ) {
								            IterRange = algRanges[idx];
								            IterGrain = algGrains[idx];
								        }
								        else {
								            const int last = numWorkloads - 1;
								            IterRange = algRanges[idx ? last : 0];
								            IterGrain = 1;
								            if ( idx > 1 )
								                IterGrain = algGrains[last];
								        }
								        Perf::SetWorkloadName( "%d / %d", IterRange, IterGrain );
								    }
								public:
								    Test_Algs ( PartitionerType pt = SimplePartitioner ) : my_partitionerType(pt) {}
								}; // class Test_Algs

								// Workload tables: range sizes from smallest to largest, and their grains.
								const count_t Test_Algs::algRanges[] = { N_finest, N_fine, N_med, N };
								const count_t Test_Algs::algGrains[] = { 1, FINE_GRAIN, MED_GRAIN, COARSE_GRAIN };


								template <typename Body>

								class Test_PFor : public Test_Algs {

								protected:

								    void Run ( ThreadInfo& ) {

								        if ( UseAutoPartitioner() )

								            tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_autoPartitioner );

								        else

								            tbb::parallel_for( range_t(0, IterRange, IterGrain), Body(), my_simplePartitioner );

								    }


								    void RunSerial ( ThreadInfo& ) {

								        Body body;

								        body( range_t(0, IterRange, IterGrain) );

								    }

								public:

								    Test_PFor ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}

								}; // class Test_PFor


								// Loop body for the parallel_for tests: adds every index of the
								// sub-range into a volatile accumulator so the loop is not optimized away.
								class SimpleForBody {
								public:
								    // r: the sub-range [r.begin(), r.end()) to process.
								    void operator()( const range_t& r ) const {
								        const count_t end = r.end();
								        // The accumulator is unsigned on purpose: with indices up to N the
								        // running sum exceeds INT_MAX, and signed overflow is undefined
								        // behavior whereas unsigned arithmetic simply wraps. The value is
								        // never read back, only the work matters.
								        volatile unsigned anchor = 0;
								        for( count_t i = r.begin(); i < end; ++i )
								            anchor += static_cast<unsigned>(i);
								    }
								}; // class SimpleForBody


								// parallel_for test with the uniform SimpleForBody loop body.
								class Test_PFor_Simple : public Test_PFor<SimpleForBody> {
								protected:
								    // Test name; the "-AP" suffix marks the auto partitioner variant.
								    const char* Name () {
								        if ( UseAutoPartitioner() )
								            return "PFor-AP";
								        return "PFor";
								    }
								public:
								    Test_PFor_Simple ( PartitionerType pt = SimplePartitioner ) : Test_PFor<SimpleForBody>(pt) {}
								}; // class Test_PFor_Simple


								// Loop body with uneven cost: for a sub-range [b, e) it iterates from
								// b*b up to (e+1)*(e+1), so sub-ranges with larger indices do more work.
								class SkewedForBody {
								public:
								    // r: the sub-range [r.begin(), r.end()) to process.
								    void operator()( const range_t& r ) const {
								        const count_t first = r.begin() * r.begin();
								        const count_t end = (r.end() + 1) * (r.end() + 1);
								        // The accumulator is unsigned on purpose: the running sum of the
								        // squared-index space exceeds INT_MAX, and signed overflow is
								        // undefined behavior whereas unsigned arithmetic simply wraps.
								        // The value is never read back, only the work matters.
								        volatile unsigned anchor = 0;
								        for( count_t i = first; i < end; ++i )
								            anchor += static_cast<unsigned>(i);
								    }
								}; // class SkewedForBody


								// parallel_for test with the uneven SkewedForBody loop body.
								class Test_PFor_Skewed : public Test_PFor<SkewedForBody> {
								    typedef Test_PFor<SkewedForBody> base_type;
								protected:
								    // Test name; the "-AP" suffix marks the auto partitioner variant.
								    const char* Name () {
								        if ( UseAutoPartitioner() )
								            return "PFor-Skewed-AP";
								        return "PFor-Skewed";
								    }

								    // Starts from the base-class workload, then shrinks IterRange to
								    // sqrt(IterRange) * (N / IterRange)^(1/4), truncated to an integer.
								    // The workload name is replaced to show only the resulting range.
								    void SetWorkload ( int idx ) {
								        base_type::SetWorkload(idx);
								        const double rootOfRange = sqrt((double)IterRange);
								        const double scale = sqrt(sqrt((double)N / IterRange));
								        IterRange = (count_t)(rootOfRange * scale);
								        Perf::SetWorkloadName( "%d", IterRange );
								    }

								public:
								    Test_PFor_Skewed ( PartitionerType pt = SimplePartitioner ) : base_type(pt) {}
								}; // class Test_PFor_Skewed


								// Settings shared with the nested parallel_for bodies; written by
								// Test_PFor_Nested::SetWorkload(). The explicit initializers equal the
								// zero-initialization these globals receive anyway.
								PartitionerType gPartitionerType = SimplePartitioner;   // partitioner used by nested loops
								count_t NestingRange = 0;                               // size of the outermost range
								count_t NestingGrain = 0;                               // grain of the outermost range


								class NestingForBody {

								    count_t my_depth;

								    tbb::simple_partitioner my_simplePartitioner;

								    tbb::auto_partitioner my_autoPartitioner;


								    template<class Partitioner>

								    void run ( const range_t& r, Partitioner& p ) const {

								        count_t end = r.end();

								        if ( my_depth > 1 )

								            for ( count_t i = r.begin(); i < end; ++i )

								                tbb::parallel_for( range_t(0, IterRange, IterGrain), NestingForBody(my_depth - 1), p );

								        else

								            for ( count_t i = r.begin(); i < end; ++i )

								                tbb::parallel_for( range_t(0, IterRange, IterGrain), SimpleForBody(), p );

								    }

								public:

								    void operator()( const range_t& r ) const {

								        if ( gPartitionerType == AutoPartitioner )

								            run( r, my_autoPartitioner );

								        else

								            run( r, my_simplePartitioner );

								    }

								    NestingForBody ( count_t depth = 1 ) : my_depth(depth) {}

								}; // class NestingForBody


								// Shape of the nested parallel_for test. The numeric values matter:
								// Test_PFor_Nested::Name() multiplies the value into a table index.
								enum NestingType {
								    HollowNesting = 0,
								    ShallowNesting = 1,
								    DeepNesting = 2
								};


								class Test_PFor_Nested : public Test_Algs {

								    typedef Test_Algs base_type;


								    NestingType my_nestingType;

								    count_t my_nestingDepth;


								protected:

								    const char* Name () {

								        static const char* names[] = { "PFor-HollowNested", "PFor-HollowNested-AP",

								                                       "PFor-ShallowNested", "PFor-ShallowNested-AP",

								                                       "PFor-DeeplyNested", "PFor-DeeplyNested-AP" };

								        return names[my_nestingType * 2 + my_partitionerType];

								    }


								    int NumWorkloads () { return my_nestingType == ShallowNesting ? (UseAutoPartitioner() ? 3 : 2) : 1; }


								    void SetWorkload ( int idx ) {

								        gPartitionerType = my_partitionerType;

								        if ( my_nestingType == DeepNesting ) {

								            NestingRange = 1024;

								            IterGrain = NestingGrain = 1;

								            IterRange = 4;

								            my_nestingDepth = 4;

								        }

								        else if ( my_nestingType == ShallowNesting ) {

								            int i = idx ? numWorkloads - 1 : 0;

								            count_t baseRange = algRanges[i];

								            count_t baseGrain = !UseAutoPartitioner() || idx > 1 ? algGrains[i] : 1;

								            NestingRange = IterRange = (count_t)sqrt((double)baseRange);

								            NestingGrain = IterGrain = (count_t)sqrt((double)baseGrain);

								        }

								        else {

								            NestingRange = N / 100;

								            NestingGrain = COARSE_GRAIN / 10;

								            IterRange = 2;

								            IterGrain = 1;

								        }

								        Perf::SetWorkloadName( "%d / %d", NestingRange, NestingGrain );

								    }


								    void Run ( ThreadInfo& ) {

								        if ( UseAutoPartitioner() )

								            tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_autoPartitioner );

								        else

								            tbb::parallel_for( range_t(0, NestingRange, NestingGrain), NestingForBody(my_nestingDepth), my_simplePartitioner );

								    }


								    void RunSerial ( ThreadInfo& ) {

								        for ( int i = 0; i < NestingRange; ++i ) {

								            SimpleForBody body;

								            body( range_t(0, IterRange, IterGrain) );

								        }

								    }

								public:

								    Test_PFor_Nested ( NestingType nt, PartitionerType pt ) : base_type(pt), my_nestingType(nt), my_nestingDepth(1) {}

								}; // class Test_PFor_Nested


								// Reduction body for the parallel_reduce test: sums the indices of a
								// sub-range through a volatile accumulator and stores the result.
								class SimpleReduceBody {
								public:
								    // Result of the most recent operator() call, combined by join().
								    count_t my_sum;

								    SimpleReduceBody () : my_sum(0) {}

								    // Splitting constructor: the new body starts from an empty sum.
								    SimpleReduceBody ( SimpleReduceBody&, tbb::split ) : my_sum(0) {}

								    // Adds the partial result of rhs into this body. The addition is
								    // carried out on unsigned values so that wrap-around is well defined
								    // instead of being signed-overflow undefined behavior.
								    void join( SimpleReduceBody& rhs ) {
								        const unsigned combined = static_cast<unsigned>(my_sum) + static_cast<unsigned>(rhs.my_sum);
								        my_sum = static_cast<count_t>(combined);
								    }

								    // Sums the indices of [r.begin(), r.end()) and stores the (possibly
								    // wrapped) total in my_sum, replacing any previous value.
								    void operator()( const range_t& r ) {
								        const count_t end = r.end();
								        // Unsigned on purpose: with indices up to N the running sum
								        // exceeds INT_MAX, which would be undefined for a signed type.
								        volatile unsigned anchor = 0;
								        for( count_t i = r.begin(); i < end; ++i )
								            anchor += static_cast<unsigned>(i);
								        my_sum = static_cast<count_t>(anchor);
								    }
								}; // class SimpleReduceBody


								class Test_PReduce : public Test_Algs {

								protected:

								    const char* Name () { return UseAutoPartitioner() ? "PReduce-AP" : "PReduce"; }


								    void Run ( ThreadInfo& ) {

								        SimpleReduceBody body;

								        if ( UseAutoPartitioner() )

								            tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_autoPartitioner );

								        else

								            tbb::parallel_reduce( range_t(0, IterRange, IterGrain), body, my_simplePartitioner );

								    }


								    void RunSerial ( ThreadInfo& ) {

								        SimpleReduceBody body;

								        body( range_t(0, IterRange, IterGrain) );

								    }

								public:

								    Test_PReduce ( PartitionerType pt = SimplePartitioner ) : Test_Algs(pt) {}

								}; // class Test_PReduce


								int main( int argc, char* argv[] ) {

								    Perf::SessionSettings opts (Perf::UseTaskScheduler | Perf::UseSerialBaseline, "perf_sched.txt");   // Perf::UseBaseline, Perf::UseSmallestWorkloadOnly

								    Perf::RegisterTest<Test_SPMC>();

								    Perf::RegisterTest<Test_ShallowTree>();

								    Perf::RegisterTest<Test_ShallowTree_Skewed>();

								    Test_PFor_Simple pf_sp(SimplePartitioner), pf_ap(AutoPartitioner);

								    Perf::RegisterTest(pf_sp);

								    Perf::RegisterTest(pf_ap);

								    Test_PReduce pr_sp(SimplePartitioner), pr_ap(AutoPartitioner);

								    Perf::RegisterTest(pr_sp);

								    Perf::RegisterTest(pr_ap);

								    Test_PFor_Skewed pf_s_sp(SimplePartitioner), pf_s_ap(AutoPartitioner);

								    Perf::RegisterTest(pf_s_sp);

								    Perf::RegisterTest(pf_s_ap);

								    Test_PFor_Nested pf_hn_sp(HollowNesting, SimplePartitioner), pf_hn_ap(HollowNesting, AutoPartitioner),

								                     pf_sn_sp(ShallowNesting, SimplePartitioner), pf_sn_ap(ShallowNesting, AutoPartitioner),

								                     pf_dn_sp(DeepNesting, SimplePartitioner), pf_dn_ap(DeepNesting, AutoPartitioner);

								    Perf::RegisterTest(pf_hn_sp);

								    Perf::RegisterTest(pf_hn_ap);

								    Perf::RegisterTest(pf_sn_sp);

								    Perf::RegisterTest(pf_sn_ap);

								    Perf::RegisterTest(pf_dn_sp);

								    Perf::RegisterTest(pf_dn_ap);

								    return Perf::TestMain(argc, argv, &opts);

								}