Wednesday, February 1, 2012

MPI - Message Passing Interface

MPI is a message-passing library specification. Interface specifications have been defined for C/C++ and Fortran programs[1]. Provided examples use language C and LAM/MPI. LAM/MPI is a high quality implementation of the Message Passing Interface (MPI) Standard.

Example1: demo.c
#include "mpi.h"
#include <stdio.h>

   int main(int argc,char *argv[])
   {
   int  numtasks, rank, rc; 

   MPI_Init(&argc,&argv);
   
   MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
   MPI_Comm_rank(MPI_COMM_WORLD,&rank);
   printf ("Number of tasks= %d My rank= %d\n", numtasks,rank);  

   MPI_Finalize();
   }
Commands

lamboot
mpicc -o demo demo.c
mpirun -np <number of processes> demo


Result
The next example is the in class assignment we got for the course module 'High performance computing', to design matrix multiplication using MPI.

Matrix size N is divisible by the number of workers (eg. if matrix size is 4 and the number of workers is 4, each worker will get 1 row from matrix A). Master sends each worker equal number of rows from matrix A, full matrix B and an offset to track down the position of rows. Each worker receives the information master sends and completes the matrix multiplication for the relevant rows and creates the relevant rows of result matrix C and sends it along with the offset to track down the position of the rows. Master receives all the rows of the result matrix C from every worker and the result matrix is completed.

Example2
/**********************************************************************
 * MPI-based matrix multiplication AxB=C  
 *********************************************************************/


#include <stdio.h>
#include "mpi.h"
#define N    4        /* number of rows and columns in matrix */

MPI_Status status;

double a[N][N],b[N][N],c[N][N];  
       
main(int argc, char **argv) 
{
  int numtasks,taskid,numworkers,source,dest,rows,offset,i,j,k;

  struct timeval start, stop;
  
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

  numworkers = numtasks-1;

  /*---------------------------- master ----------------------------*/
  if (taskid == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {   
 a[i][j]= 1.0;
 b[i][j]= 2.0;
      }
    }

    gettimeofday(&start, 0);

    /* send matrix data to the worker tasks */
    rows = N/numworkers;
    offset = 0;
    
    for (dest=1; dest<=numworkers; dest++) 
    {       
      MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&a[offset][0], rows*N, MPI_DOUBLE,dest,1, MPI_COMM_WORLD);
      MPI_Send(&b, N*N, MPI_DOUBLE, dest, 1, MPI_COMM_WORLD);
      offset = offset + rows;
    }

    /* wait for results from all worker tasks */
    for (i=1; i<=numworkers; i++) 
    {   
      source = i;
      MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
      MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
      MPI_Recv(&c[offset][0], rows*N, MPI_DOUBLE, source, 2, MPI_COMM_WORLD, &status);
    }

    gettimeofday(&stop, 0);

    printf("Here is the result matrix:\n");
    for (i=0; i<N; i++) { 
      for (j=0; j<N; j++) 
 printf("%6.2f   ", c[i][j]);
      printf ("\n");
    }
  
    fprintf(stdout,"Time = %.6f\n\n",
         (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6));

  } 

  /*---------------------------- worker----------------------------*/
  if (taskid > 0) {
    source = 0;
    MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&a, rows*N, MPI_DOUBLE, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&b, N*N, MPI_DOUBLE, source, 1, MPI_COMM_WORLD, &status);
  
    /* Matrix multiplication */
    for (k=0; k<N; k++)
      for (i=0; i<rows; i++) {
        c[i][k] = 0.0;
        for (j=0; j<N; j++)
   c[i][k] = c[i][k] + a[i][j] * b[j][k];
      }


    MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
    MPI_Send(&c, rows*N, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD);
  }  
    
  MPI_Finalize();
} 
Result
As the part 2 of the assignment we were suppose to optimize the code so it will fulfill following requirements,
  • The matrix size should be defined using #define N
  • The type of elements in the matrix should be ``float''
  • The matrices can be square (e.g., NXN) but N may not be exactly divisible by the number of processors
  • Should check the correctness of computations by printing the result matrix. May use #define PRINT to enclose the code segments for printing
  • Should find the speedup of the code (e.g., for N=100, 200, etc.) compared with a sequential matrix multiplication code.
Example 3: Solution
/**********************************************************************
 * MPI-based matrix multiplication AxB=C  
 *********************************************************************/


#include <stdio.h>
#include <sys/time.h>
#include "mpi.h"
#define N    500        /* number of rows and columns in matrix */

MPI_Status status;

float a[N][N],b[N][N],c[N][N];  
       
main(int argc, char **argv) 
{
  int numtasks,taskid,numworkers,source,dest,rows,offset,remain,i,j,k;
  struct timeval start, stop;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

  numworkers = numtasks-1;

  /*---------------------------- master ----------------------------*/
  if (taskid == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {   
        a[i][j]= 1.0;
        b[i][j]= 2.0;
      }
    }

#ifdef PRINT
        /* print matrices */
        printf("Matrix A:\n");           
        for (i=0; i<N; i++){      
           for (j=0; j<N; j++)  
              printf("%.3f\t",a[i][j]); 
           printf("\n");  
        }        

        printf("Matrix B:\n");       
        for (i=0; i<N; i++){  
           for (j=0; j<N; j++)
              printf("%.3f\t",b[i][j]);
           printf("\n");
        }   

#endif

    gettimeofday(&start, 0);

    /* send matrix data to the worker tasks */
    if (N <= numworkers)
    {
 rows = 1;  
    }
    else
    {
      if (N%numworkers!=0) // Not divisible by numworkers
      {        
 rows = N/numworkers+1;
        remain = N%numworkers;
      }
      else
      {
        rows = N/numworkers;  
      }
    }
    offset = 0;
    
    for (dest=1; dest<=numworkers; dest++, remain--) 
    {       
      MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&a[offset][0], rows*N, MPI_FLOAT,dest,1, MPI_COMM_WORLD);
      MPI_Send(&b, N*N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
      offset = offset + rows;
      if(remain==1) rows-=1;
    }

    /* wait for results from all worker tasks */
    for (i=1; i<=numworkers; i++) 
    {   
      source = i;
      MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
      MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
      MPI_Recv(&c[offset][0], rows*N, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status);
    }

    gettimeofday(&stop, 0);
#ifdef PRINT
    printf("Here is the result matrix:\n");
    for (i=0; i<N; i++) { 
      for (j=0; j<N; j++) 
 printf("%6.2f   ", c[i][j]);
      printf ("\n");
    }
#endif  
    fprintf(stdout,"Time = %.6f\n\n",
         (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6));

  } 

  /*---------------------------- worker----------------------------*/
  if (taskid > 0) {
    source = 0;
    MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&a, rows*N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&b, N*N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
  
    /* Matrix multiplication */
    for (k=0; k<N; k++)
      for (i=0; i<rows; i++) {
        c[i][k] = 0.0;
        for (j=0; j<N; j++)
   c[i][k] = c[i][k] + a[i][j] * b[j][k];
      }


    MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
    MPI_Send(&c, rows*N, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
  }  
    
  MPI_Finalize();
}
 

Example 4: Sequential code
/* Matrix Multiplication */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <assert.h>

#define RANDLIMIT 5 /* Magnitude limit of generated randno.*/
#define N  500   /* Matrix Size */
#define NUMLIMIT 70.0

float a[N][N];
float b[N][N];
float c[N][N];

int main(int argc, char *argv[])
{
    struct timeval start, stop; 
    int i,j,k;

 
    /* generate mxs  */
    for (i=0; i<N; i++)
        for (j=0; j<N; j++) {
            a[i][j] = 1+(int) (NUMLIMIT*rand()/(RAND_MAX+1.0)); 
            /*a[i][j] = 1.0; 
            b[i][j] = 2.0;*/
            b[i][j] = (double) (rand() % RANDLIMIT);
            /*c[i][j] = 0.0;*/
        }

#ifdef PRINT
        /* print matrices */
        printf("Matrix A:\n");           
        for (i=0; i<N; i++){      
           for (j=0; j<N; j++)  
              printf("%.3f\t",a[i][j]); 
           printf("\n");  
        }        

        printf("Matrix B:\n");       
        for (i=0; i<N; i++){  
           for (j=0; j<N; j++)
              printf("%.3f\t",b[i][j]);
           printf("\n");
        }   

        printf("Matrix C:\n");       
        for (i=0; i<N; i++){  
           for (j=0; j<N; j++)
              printf("%.3f\t",c[i][j]);
           printf("\n");
        }   
#endif

    gettimeofday(&start, 0);

    for (i=0; i<N; i++) {
 for (j=0; j<N; j++) {
           c[i][j] = 0.0;
    for (k=0; k<N; k++)
              c[i][j] = c[i][j] + a[i][k]*b[k][j]; /* Working;standard way */
            /*c[j][i] = c[j][i] + a[j][k]*b[k][i];*/ /* Working; Makes C column by col */
        } /* end j loop */
    }

    gettimeofday(&stop, 0);

#ifdef PRINT
    /* print results*/
    printf("Answer c:\n");
    for (i=0; i<N; i++){
        for (j=0; j<N; j++) 
           printf("%.3f\t",c[i][j]);
        printf("\n");
    }
#endif

    fprintf(stdout,"Time = %.6f\n\n",
         (stop.tv_sec+stop.tv_usec*1e-6)-(start.tv_sec+start.tv_usec*1e-6));
    return(0);
}

Result
Examining the results we can clearly see that sequential program takes more time for matrix multiplication than the parallel program when the size of matrix is large.

[1] http://en.wikipedia.org/wiki/Message_Passing_Interface