#include #include #include #include #include #include //sse2 #define T 64 //macro to print a __m128 vector #define PVEC(a) \ { \ float tp[4]; \ _mm_storeu_ps ( tp ,a); \ printf("VEC %f %f %f %f \n", \ tp[0], \ tp[1], \ tp[2], \ tp[3]); \ } \ void randomInit(float* data, int size) { int i; for (i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; } void Init(float* data, int size) { int i; for (i = 0; i < size; i++) data[i] = 0; } main(int argc, char** argv) { clock_t tbegin; clock_t tend; if (argc!=3){ printf(" Use :\n"); printf("%s \n",argv[0]); return EXIT_FAILURE; } int N = atoi(argv[1]); if (N%T !=0) { printf("N must be a multiple of T(%i) \n",T); return EXIT_FAILURE; } int check = atoi(argv[2]); int i,j,k; //allocate memory unsigned int mem_size = sizeof(float) * N * N; float* A = (float*) malloc(mem_size); float* B = (float*) malloc(mem_size); float* C = (float*) malloc(mem_size); // initialize memory randomInit(A, N*N); randomInit(B, N*N); Init(C, N*N); tbegin=clock(); //compute matrix multiplication, tiled int ti,tj,tk; for (ti = 0; ti < N; ti+=T) for (tj = 0; tj < N; tj+=T) for (tk = 0; tk < N; tk+=T) for (i = ti; i < ti+T; i++) for (j = tj; j < tj+T; j++) { float sum = C[i * N + j]; for (k = tk; k < tk+T; ++k) { sum += A[i * N + k] * B[k * N + j]; } C[i * N + j] = sum; } tend=clock(); //check result with a simple untiled computation if(check) { float* Cref = (float*) malloc(mem_size); for (i = 0; i < N; ++i) for (j = 0; j < N; ++j) { float sum = 0; for (k = 0; k < N; ++k) { sum += A[i * N + k] * B[k * N + j]; } Cref[i * N + j] = sum; } int ok=1; for (i = 0; i < N; ++i) for (j = 0; j < N; ++j) { if( Cref[i * N + j]!=C[i * N + j])ok=0; } if(ok) printf("Test passed\n"); else printf("Test failed\n"); free(Cref); } float time = (tend-tbegin)/(double)CLOCKS_PER_SEC; printf("Time: %.4lf s ---\n", time); printf("Gflops: %f\n", 2*N*((N*N / (time)) / 1e9) ); free(A); free(B); free(C); }