Example1: demo.c
#include "mpi.h" #include <stdio.h> int main(int argc,char *argv[]) { int numtasks, rank, rc; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&numtasks); MPI_Comm_rank(MPI_COMM_WORLD,&rank); printf ("Number of tasks= %d My rank= %d\n", numtasks,rank); MPI_Finalize(); }Commands
lamboot
mpicc -o demo demo.c
mpirun -np <number of processes> demo
Result The next example is the in class assignment we got for the course module 'High performance computing', to design matrix multiplication using MPI.
Matrix size N is divisible by the number of workers (eg. if matrix size is 4 and the number of workers is 4, each worker will get 1 row from matrix A). Master sends each worker equal number of rows from matrix A, full matrix B and an offset to track down the position of rows. Each worker receives the information master sends and completes the matrix multiplication for the relevant rows and creates the relevant rows of result matrix C and sends it along with the offset to track down the position of the rows. Master receives all the rows of the result matrix C from every worker and the result matrix is completed.
Example2
/********************************************************************** * MPI-based matrix multiplication AxB=C *********************************************************************/ #include <stdio.h> #include "mpi.h" #define N 4 /* number of rows and columns in matrix */ MPI_Status status; double a[N][N],b[N][N],c[N][N]; main(int argc, char **argv) { int numtasks,taskid,numworkers,source,dest,rows,offset,i,j,k; struct timeval start, stop; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &taskid); MPI_Comm_size(MPI_COMM_WORLD, &numtasks); numworkers = numtasks-1; /*---------------------------- master ----------------------------*/ if (taskid == 0) { for (i=0; i<N; i++) { for (j=0; j<N; j++) { a[i][j]= 1.0; b[i][j]= 2.0; } } gettimeofday(&start, 0); /* send matrix data to the worker tasks */ rows = N/numworkers; offset = 0; for (dest=1; dest<=numworkers; dest++) { MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); MPI_Send(&a[offset][0], rows*N, MPI_DOUBLE,dest,1, MPI_COMM_WORLD); MPI_Send(&b, N*N, MPI_DOUBLE, dest, 1, MPI_COMM_WORLD); offset = offset + rows; } /* wait for results from all worker tasks */ for (i=1; i<=numworkers; i++) { source = i; MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); MPI_Recv(&c[offset][0], rows*N, MPI_DOUBLE, source, 2, MPI_COMM_WORLD, &status); } gettimeofday(&stop, 0); printf("Here is the result matrix:\n"); for (i=0; i<N; i++) { for (j=0; j<N; j++) printf("%6.2f ", c[i][j]); printf ("\n"); } fprintf(stdout,"Time = %.6f\n\n", (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6)); } /*---------------------------- worker----------------------------*/ if (taskid > 0) { source = 0; MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&a, rows*N, MPI_DOUBLE, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&b, N*N, MPI_DOUBLE, source, 1, MPI_COMM_WORLD, &status); /* Matrix multiplication */ for (k=0; k<N; k++) for (i=0; i<rows; i++) { c[i][k] = 0.0; for (j=0; j<N; j++) c[i][k] = c[i][k] + a[i][j] * b[j][k]; } MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); MPI_Send(&c, rows*N, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); } MPI_Finalize(); }Result As the part 2 of the assignment we were suppose to optimize the code so it will fulfill following requirements,
- The matrix size should be defined using #define N
- The type of elements in the matrix should be ``float''
- The matrices can be square (e.g., NXN) but N may not be exactly divisible by the number of processors
- Should check the correctness of computations by printing the result matrix. May use #define PRINT to enclose the code segments for printing
- Should find the speedup of the code (e.g., for N=100, 200, etc.) compared with a sequential matrix multiplication code.
/********************************************************************** * MPI-based matrix multiplication AxB=C *********************************************************************/ #include <stdio.h> #include <sys/time.h> #include "mpi.h" #define N 500 /* number of rows and columns in matrix */ MPI_Status status; float a[N][N],b[N][N],c[N][N]; main(int argc, char **argv) { int numtasks,taskid,numworkers,source,dest,rows,offset,remain,i,j,k; struct timeval start, stop; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &taskid); MPI_Comm_size(MPI_COMM_WORLD, &numtasks); numworkers = numtasks-1; /*---------------------------- master ----------------------------*/ if (taskid == 0) { for (i=0; i<N; i++) { for (j=0; j<N; j++) { a[i][j]= 1.0; b[i][j]= 2.0; } } #ifdef PRINT /* print matrices */ printf("Matrix A:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",a[i][j]); printf("\n"); } printf("Matrix B:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",b[i][j]); printf("\n"); } #endif gettimeofday(&start, 0); /* send matrix data to the worker tasks */ if (N <= numworkers) { rows = 1; } else { if (N%numworkers!=0) // Not divisible by numworkers { rows = N/numworkers+1; remain = N%numworkers; } else { rows = N/numworkers; } } offset = 0; for (dest=1; dest<=numworkers; dest++, remain--) { MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); MPI_Send(&a[offset][0], rows*N, MPI_FLOAT,dest,1, MPI_COMM_WORLD); MPI_Send(&b, N*N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD); offset = offset + rows; if(remain==1) rows-=1; } /* wait for results from all worker tasks */ for (i=1; i<=numworkers; i++) { source = i; MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); MPI_Recv(&c[offset][0], rows*N, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status); } gettimeofday(&stop, 0); #ifdef PRINT printf("Here is the result matrix:\n"); for (i=0; i<N; i++) { for (j=0; j<N; j++) printf("%6.2f ", c[i][j]); printf ("\n"); } #endif fprintf(stdout,"Time = %.6f\n\n", (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6)); } /*---------------------------- worker----------------------------*/ if (taskid > 0) { source = 0; MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&a, rows*N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&b, N*N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); /* Matrix multiplication */ for (k=0; k<N; k++) for (i=0; i<rows; i++) { c[i][k] = 0.0; for (j=0; j<N; j++) c[i][k] = c[i][k] + a[i][j] * b[j][k]; } MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); MPI_Send(&c, rows*N, MPI_FLOAT, 0, 2, MPI_COMM_WORLD); } MPI_Finalize(); }Example 4: Sequential code
/* Matrix Multiplication */ #include <stdio.h> #include <stdlib.h> #include <math.h> #include <sys/time.h> #include <assert.h> #define RANDLIMIT 5 /* Magnitude limit of generated randno.*/ #define N 500 /* Matrix Size */ #define NUMLIMIT 70.0 float a[N][N]; float b[N][N]; float c[N][N]; int main(int argc, char *argv[]) { struct timeval start, stop; int i,j,k; /* generate mxs */ for (i=0; i<N; i++) for (j=0; j<N; j++) { a[i][j] = 1+(int) (NUMLIMIT*rand()/(RAND_MAX+1.0)); /*a[i][j] = 1.0; b[i][j] = 2.0;*/ b[i][j] = (double) (rand() % RANDLIMIT); /*c[i][j] = 0.0;*/ } #ifdef PRINT /* print matrices */ printf("Matrix A:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",a[i][j]); printf("\n"); } printf("Matrix B:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",b[i][j]); printf("\n"); } printf("Matrix C:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",c[i][j]); printf("\n"); } #endif gettimeofday(&start, 0); for (i=0; i<N; i++) { for (j=0; j<N; j++) { c[i][j] = 0.0; for (k=0; k<N; k++) c[i][j] = c[i][j] + a[i][k]*b[k][j]; /* Working;standard way */ /*c[j][i] = c[j][i] + a[j][k]*b[k][i];*/ /* Working; Makes C column by col */ } /* end j loop */ } gettimeofday(&stop, 0); #ifdef PRINT /* print results*/ printf("Answer c:\n"); for (i=0; i<N; i++){ for (j=0; j<N; j++) printf("%.3f\t",c[i][j]); printf("\n"); } #endif fprintf(stdout,"Time = %.6f\n\n", (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6)); return(0); }Result Examining the results we can clearly see that sequential program takes more time for matrix multiplication than the parallel program when the size of matrix is large.
[1] http://en.wikipedia.org/wiki/Message_Passing_Interface