You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							135 lines
						
					
					
						
							5.1 KiB
						
					
					
				
			
		
		
		
			
			
			
				
					
				
				
					
				
			
		
		
	
	
							135 lines
						
					
					
						
							5.1 KiB
						
					
					
				
								
							 | 
						|
								#include <iostream>
							 | 
						|
								#include <Eigen/Core>
							 | 
						|
								#include <bench/BenchTimer.h>
							 | 
						|
								using namespace Eigen;
							 | 
						|
								
							 | 
						|
								// Base problem size. Only defined here when the build has not already
								// defined SIZE, so it can be overridden from the compiler command line.
								// main() derives the per-dimension length as SIZE * 8.
								#ifndef SIZE
							 | 
						|
								#define SIZE 50
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								// Number of times each benchVec() overload repeats the addition per call.
								// Overridable in the same way as SIZE.
								#ifndef REPEAT
							 | 
						|
								#define REPEAT 10000
							 | 
						|
								#endif
							 | 
						|
								
							 | 
						|
								// Element type of the raw buffers used by the pointer-based benchmark.
								typedef float Scalar;
							 | 
						|
								
							 | 
						|
								// Forward declarations of the three benchmark kernels (raw pointers,
								// MatrixXf, VectorXf). They are marked noinline so the compiler keeps each
								// one as a real call; presumably this is to stop the timed work from being
								// merged into main() -- NOTE(review): confirm intent.
								__attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
							 | 
						|
								__attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
							 | 
						|
								__attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
							 | 
						|
								
							 | 
						|
								int main(int argc, char* argv[])
							 | 
						|
								{
							 | 
						|
								    int size = SIZE * 8;
							 | 
						|
								    int size2 = size * size;
							 | 
						|
								    Scalar* a = internal::aligned_new<Scalar>(size2);
							 | 
						|
								    Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
							 | 
						|
								    Scalar* c = internal::aligned_new<Scalar>(size2); 
							 | 
						|
								    
							 | 
						|
								    for (int i=0; i<size; ++i)
							 | 
						|
								    {
							 | 
						|
								        a[i] = b[i] = c[i] = 0;
							 | 
						|
								    }
							 | 
						|
								    
							 | 
						|
								    BenchTimer timer;
							 | 
						|
								    
							 | 
						|
								    timer.reset();
							 | 
						|
								    for (int k=0; k<10; ++k)
							 | 
						|
								    {
							 | 
						|
								        timer.start();
							 | 
						|
								        benchVec(a, b, c, size2);
							 | 
						|
								        timer.stop();
							 | 
						|
								    }
							 | 
						|
								    std::cout << timer.value() << "s  " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
							 | 
						|
								    return 0;
							 | 
						|
								    for (int innersize = size; innersize>2 ; --innersize)
							 | 
						|
								    {
							 | 
						|
								        if (size2%innersize==0)
							 | 
						|
								        {
							 | 
						|
								            int outersize = size2/innersize;
							 | 
						|
								            MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
							 | 
						|
								            MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
							 | 
						|
								            MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
							 | 
						|
								            timer.reset();
							 | 
						|
								            for (int k=0; k<3; ++k)
							 | 
						|
								            {
							 | 
						|
								                timer.start();
							 | 
						|
								                benchVec(ma, mb, mc);
							 | 
						|
								                timer.stop();
							 | 
						|
								            }
							 | 
						|
								            std::cout << innersize << " x " << outersize << "  " << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
							 | 
						|
								        }
							 | 
						|
								    }
							 | 
						|
								    
							 | 
						|
								    VectorXf va = Map<VectorXf>(a, size2);
							 | 
						|
								    VectorXf vb = Map<VectorXf>(b, size2);
							 | 
						|
								    VectorXf vc = Map<VectorXf>(c, size2);
							 | 
						|
								    timer.reset();
							 | 
						|
								    for (int k=0; k<3; ++k)
							 | 
						|
								    {
							 | 
						|
								        timer.start();
							 | 
						|
								        benchVec(va, vb, vc);
							 | 
						|
								        timer.stop();
							 | 
						|
								    }
							 | 
						|
								    std::cout << timer.value() << "s   " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
							 | 
						|
								
							 | 
						|
								    return 0;
							 | 
						|
								}
							 | 
						|
								
							 | 
						|
								// Matrix kernel: evaluates a = a + b REPEAT times.
								// The third argument is part of the common benchVec signature but is not
								// read or written here.
								void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
								{
								    int remaining = REPEAT;
								    while (remaining-- > 0)
								    {
								        a = a + b;
								    }
								}
							 | 
						|
								
							 | 
						|
								// Vector kernel: evaluates a = a + b REPEAT times.
								// The third argument is part of the common benchVec signature but is not
								// read or written here.
								void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
								{
								    int remaining = REPEAT;
								    while (remaining-- > 0)
								    {
								        a = a + b;
								    }
								}
							 | 
						|
								
							 | 
						|
								// Raw-pointer kernel: performs a[i] += b[i] for i in [0, size), REPEAT times,
								// using Eigen's packet (SIMD) primitives.
								//
								//   a    - destination/accumulator; written with aligned stores, so it must
								//          satisfy the packet alignment requirement.
								//   b    - addend; read with unaligned loads, so any alignment is accepted.
								//   c    - unused; kept so all benchVec overloads share the same shape.
								//   size - number of elements in a and b.
								//
								// The main loop is hand-unrolled over 8 packets per iteration. All eight
								// packets of a chunk (offsets 0..7) are processed, so the operation count
								// matches the size*REPEAT figure the caller uses to report throughput.
								// Elements beyond the last full chunk are handled by a scalar loop instead
								// of running past the end of the buffers.
								void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
								{
								    typedef internal::packet_traits<Scalar>::type PacketScalar;
								    const int PacketSize = internal::packet_traits<Scalar>::size;
								    const int ChunkSize = PacketSize*8;
								    // Largest multiple of ChunkSize that does not exceed size.
								    const int chunkedEnd = size - size%ChunkSize;

								    for (int k=0; k<REPEAT; ++k)
								    {
								        for (int i=0; i<chunkedEnd; i+=ChunkSize)
								        {
								            // ploadu needs the packet type spelled out: it cannot be
								            // deduced from the pointer argument.
								            internal::pstore(&a[i             ], internal::padd(internal::ploadu<PacketScalar>(&a[i             ]), internal::ploadu<PacketScalar>(&b[i             ])));
								            internal::pstore(&a[i+1*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+1*PacketSize]), internal::ploadu<PacketScalar>(&b[i+1*PacketSize])));
								            internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+2*PacketSize]), internal::ploadu<PacketScalar>(&b[i+2*PacketSize])));
								            internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+3*PacketSize]), internal::ploadu<PacketScalar>(&b[i+3*PacketSize])));
								            internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+4*PacketSize]), internal::ploadu<PacketScalar>(&b[i+4*PacketSize])));
								            internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+5*PacketSize]), internal::ploadu<PacketScalar>(&b[i+5*PacketSize])));
								            internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+6*PacketSize]), internal::ploadu<PacketScalar>(&b[i+6*PacketSize])));
								            internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu<PacketScalar>(&a[i+7*PacketSize]), internal::ploadu<PacketScalar>(&b[i+7*PacketSize])));
								        }

								        // Scalar tail for sizes that are not a multiple of ChunkSize.
								        for (int i=chunkedEnd; i<size; ++i)
								        {
								            a[i] += b[i];
								        }
								    }
								}
							 |