/* linux: mpicc -O2 [-D CFG_CPUSET=] -o mpi_stress ../utils/mpi_stress.c
     # cpuset is a bit pattern (hex) of the available cpu cores per node,
     # set it if you want to bind tasks to cores using sched_setaffinity()
   mpirun -np 8 ./mpi_stress --bsize 26 | tee mpi_stress.log
   see mpi_stress.gpl and the bottom of this file for some measured data!

   Changelog:
   23.08.2008 - change MPI_Sendrecv loop to avoid bad measurements on NUMA
                systems (Altix4700: 8-9GB/s measured, but 1.3GB/s real)
   20.08.2008 - fix race condition which led to speed-down jumps,
                MPI_ERR_TRUNCATE and dead-locks at the end
                (thanks to David Gingold)
              - better option handling
*/
#define _GNU_SOURCE 1  // this defines __USE_GNU in features.h for sched.h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// #include
#include <malloc.h>    /* memalign() */
#include <mpi.h>
#ifdef CFG_CPUSET      // switch on if sched_setaffinity will benefit
# include <unistd.h>   // declare sleep()
# include <sched.h>    // declare sched_setaffinity()
# ifndef _FEATURES_H
#  warning "_FEATURES_H is not defined, try to add #include <features.h>"
# endif
# ifndef __USE_GNU
#  warning "expected __USE_GNU to be defined, but it is not, try -D_GNU_SOURCE=1"
# endif
#endif

int mpi_n, mpi_i,
    node_dst, node_src,    /* destination and source node */
    *next_dst, *next_src,  /* next destination and source node */
    mintime=1;             /* minimum seconds run time (max. is 2*mintime) */
char *buf1, *buf2;         /* transfer buffers */
static int header=0;

/* ring transfer: every task sends len bytes to node_dst and receives len
   bytes from node_src; the partners are shifted via next_dst/next_src */
int test_sendrecv(unsigned long len){
  unsigned long i1, i2, i3;
  double t1, t2, t3=0;
  MPI_Status status;
  if (!header) {
    header=1;
    if (!mpi_i) printf("# tasks msgsize[B] loops[2^x] time[s]"
      " latency[us] SumBW[MB/s] BW[MB/s] Bcast\n");  /* BW=bandwidth */
  }
  i3=1;
  t1=MPI_Wtime();
  for (i2=0,i1=1l<<i3;;i3++,i1=1l<<i3) {  /* double loops until time > mintime */
    for (;i2<i1;i2++) {
      MPI_Sendrecv(buf1, len, MPI_CHAR, node_dst, 0,
                   buf2, len, MPI_CHAR, node_src, 0,
                   MPI_COMM_WORLD, &status);
      node_dst=next_dst[node_dst];  /* shift to the next partner pair */
      node_src=next_src[node_src];
    }
    t2=MPI_Wtime()-t1;
    /* task 0 decides for all tasks whether to continue; this avoids the
       race condition mentioned in the changelog (MPI_ERR_TRUNCATE,
       dead-locks at the end); t3 accumulates the MPI_Bcast overhead */
    t3-=MPI_Wtime();
    MPI_Bcast(&t2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    t3+=MPI_Wtime();
    if (i3>10 && t2<=mintime && !mpi_i)  // some alive messages
      printf("# %4d %9lu %6lu %8.2f %9.2f %8.2f %8.2f %5.2f%%\n",
        mpi_n, len, i3, t2, t2*1e6/i1, i1*1e-6*len*mpi_n/t2,
        i1*1e-6*len/t2, t3*100/t2);
    if (t2>mintime) break;
  }
  if (!mpi_i) printf(" %4d %9lu %6lu %8.2f %9.2f %8.2f %8.2f %5.2f%%\n",
    mpi_n, len, i3, t2, t2*1e6/i1, i1*1e-6*len*mpi_n/t2,
    i1*1e-6*len/t2, t3*100/t2);
  return 0;
}

/* collective transfer: every task sends len bytes to every task */
int test_alltoall(unsigned long len){
  unsigned long i1, i2, i3;
  double t1, t2, t3=0;
  if (!header) {
    header=1;
    if (!mpi_i) printf("# tasks msgsize[B] loops[2^x] time[s]"
      " latency[us] SumBW[MB/s] BW[MB/s] Bcast\n");  /* BW=bandwidth */
  }
  i3=0;
  t1=MPI_Wtime();
  for (i2=0,i1=1l<<i3;;i3++,i1=1l<<i3) {
    for (;i2<i1;i2++)
      MPI_Alltoall(buf1, len, MPI_CHAR, buf2, len, MPI_CHAR, MPI_COMM_WORLD);
    t2=MPI_Wtime()-t1;
    t3-=MPI_Wtime();
    MPI_Bcast(&t2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);  /* task 0 decides */
    t3+=MPI_Wtime();
    if (i3>10 && t2<=mintime && !mpi_i)
      printf("# %4d %9lu %6lu %8.2f %9.2f %8.2f %8.2f\n",
        mpi_n, len, i3, t2, t2*1e6/i1, i1*1e-6*len*mpi_n*mpi_n/t2,
        i1*1e-6*len*mpi_n/t2);
    if (t2>mintime) break;
  }
  /* we show msgsize as the size of a single transfer block (len),
     not as the full len*mpi_n bytes handed to MPI_Alltoall per call */
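  /* Worked example for the columns printed below (the numbers are
     illustrative only, not measured): with i1 = 2^i3 MPI_Alltoall calls
     finished after t2 seconds,
       latency[us] = t2*1e6/i1                time per call
       SumBW[MB/s] = i1*len*mpi_n*mpi_n/t2 * 1e-6
                     (one call moves len bytes from each of the mpi_n
                      tasks to each of the mpi_n tasks)
       BW[MB/s]    = i1*len*mpi_n/t2 * 1e-6   traffic of a single task
       Bcast       = t3*100/t2                share of time spent in the
                                              control MPI_Bcast
     e.g. i1=2^20, len=1024, mpi_n=8, t2=2s give ~1.9us latency and a SumBW
     of ~34000 MB/s; test_sendrecv() prints the same columns with len*mpi_n
     (all tasks) and len (one task) as the bytes moved per loop. */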
  if (!mpi_i) printf(" %4d %9lu %6lu %8.2f %9.2f %8.2f %8.2f %5.2f%%\n",
    mpi_n, len, i3, t2, t2*1e6/i1, i1*1e-6*len*mpi_n*mpi_n/t2,
    i1*1e-6*len*mpi_n/t2, t3*100/t2);
  return 0;
}

int main(int argc, char *argv[])
{
  unsigned long i1, bsize;  /* buffer size or message size, 2^bsize2 */
  int version, subversion, i, i2,
      bsize2=20,  /* log2 of bsize */
      pairs=0,    /* rings (src!=dst, default) versus pairs (src==dst) */
      nshift=1,   /* node shift, 0=no (fixed src/dst), 1=yes (shifting src/dst) */
      ata=0,      /* MPI_Sendrecv (default) versus MPI_Alltoall */
      align=1;    /* align memory to 64 (default) or not */

  setvbuf(stdout,NULL,_IONBF,0);
  if (MPI_Init(&argc, &argv) != MPI_SUCCESS) exit(1);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_n);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_i);
  MPI_Get_version(&version, &subversion);
  if (!mpi_i) printf("# mpi_stress v2008-09-23\n");

  /* option handling */
  for (i=1;i<argc;i++) {
    if      (!strcmp(argv[i],"--bsize")   && i+1<argc) bsize2 =atoi(argv[++i]);
    else if (!strcmp(argv[i],"--mintime") && i+1<argc) mintime=atoi(argv[++i]);
    else if (!strcmp(argv[i],"--alltoall")) ata=1;
    else if (!strcmp(argv[i],"--pairwise")) pairs=1;
    else if (!strcmp(argv[i],"--noshift"))  nshift=0;
    else if (!strcmp(argv[i],"--noalign"))  align=0;
    else if (!mpi_i)
      printf("usage: mpi_stress [--bsize <log2_bytes>] [--mintime <s>]\n"
             " [--alltoall|{[--pairwise] [--noshift]}] [--noalign]\n");
  }
  bsize = 1l << bsize2;

#if defined(CFG_CPUSET) && defined(__USE_GNU)
  {
    // this processor binding is useful for the case that the jobsystem
    //   does not do this automatically
    // CFG_CPUSET is the bitmap of available SMP-CPUs per node
    // CFG_CPUSET=0x0005: use core 0 and 2 (Nodes with 2 Hyper-Threading CPUs)
    // ToDo: define on runtime per taskset + sched_getaffinity ? (jobsystem?)
    // use --byslot placement if numbits(CFG_CPUSET) is smaller than the
    //   number of tasks per node
    cpu_set_t cpuset;
    int ii, in=0, ncores=0;
    /* count the usable cores, then pick the (mpi_i%ncores)-th set bit */
    for (ii=0; CFG_CPUSET>>ii; ii++) if (1&(CFG_CPUSET>>ii)) ncores++;
    for (ii=0; CFG_CPUSET>>ii; ii++)
      if (1&(CFG_CPUSET>>ii)) {
        if (mpi_i%ncores==in) break;  // mycore = 0,1,2,...,0,1,2,... --byslot
        in++;
      }
    CPU_ZERO(&cpuset);
    CPU_SET(ii,&cpuset);
    sleep(1);
    if (mpi_i<ncores) printf("# task %d bound to core %d\n", mpi_i, ii);
    if (sched_setaffinity(0, sizeof(cpuset), &cpuset))
      printf("# task %d: sched_setaffinity(core %d) failed\n", mpi_i, ii);
  }
#endif

  /* allocate the transfer buffers and the partner shift tables */
  if (align) {
    buf1=(char *)memalign(64,bsize);
    buf2=(char *)memalign(64,bsize);
  } else {
    buf1=(char *)malloc(bsize);
    buf2=(char *)malloc(bsize);
  }
  next_dst=(int *)malloc(mpi_n*sizeof(int));
  next_src=(int *)malloc(mpi_n*sizeof(int));
  if (!buf1 || !buf2 || !next_dst || !next_src) {
    printf("task %d: memory allocation failed\n", mpi_i); exit(1);
  }

  /* simple rings i-1 -> i -> i+1, src!=dst */
  node_dst=(mpi_i+1      )%mpi_n; next_dst[node_dst]=node_dst;
  node_src=(mpi_i-1+mpi_n)%mpi_n; next_src[node_src]=node_src;

  /* changing rings i-j -> i -> i+j, src!=dst */
  if (pairs==0)
  for (node_dst=node_src=mpi_i,i2=0; i2<mpi_n; i2++) {
    node_dst=(mpi_i+i2+1      )%mpi_n;  /* destination at shift step i2 */
    node_src=(mpi_i+mpi_n-i2-1)%mpi_n;  /* source at shift step i2 */
    next_dst[node_dst]=(node_dst+1      )%mpi_n;  /* dst shifts forward */
    next_src[node_src]=(node_src+mpi_n-1)%mpi_n;  /* src shifts backward */
    if (!mpi_i) {  /* show the partner sequence of the first tasks */
      printf("# dst");
      for (i1=0; i1<(unsigned long)mpi_n && i1<8; i1++)
        printf(" %2d", (int)((i1+i2+1)%mpi_n));
      if (mpi_n>8) printf(" ...");
      printf(" src");
      for (i1=0; i1<(unsigned long)mpi_n && i1<8; i1++)
        printf(" %2d", (int)((i1+mpi_n-i2-1)%mpi_n));
      if (mpi_n>8) printf(" ...");
      printf("\n");
    }
    if (!nshift) break;  // no node_shift?
  }

  /* changing pairs i -> n-i+j (fixed pairs: j=1), src==dst */
  if (pairs)
  for (node_dst=node_src=mpi_i,i2=(nshift)?0:1; i2<mpi_n; i2++) {
    node_dst=node_src=(mpi_n-mpi_i+i2)%mpi_n;  /* partner at shift step j=i2 */
    next_dst[node_dst]=next_src[node_dst]=(node_dst+1)%mpi_n;  /* j -> j+1 */
    if (!mpi_i) {
      printf("# pair");
      for (i1=0; i1<(unsigned long)mpi_n && i1<8; i1++)
        printf(" %2d-%d", (int)i1, (int)((mpi_n-i1+i2)%mpi_n));
      if (mpi_n>8) printf(" ...");
      printf("\n");
    }
    if (!nshift) break;
  }

  if (!nshift) {  /* stop changing, fix! */
    next_dst[node_dst]=node_dst;
    next_src[node_src]=node_src;
  }
  /* the last state of node_dst + node_src is the initial state for the run */

  for (i1=0;i1<bsize;i1++) buf1[i1]=buf2[i1]=(char)i1;  /* touch the buffers */

  if (!ata) {
    if (!mpi_i) printf("# Measure speed of MPI_Sendrecv transactions:\n");
    for (i1=bsize;i1>0;i1>>=1) test_sendrecv(i1);
  }
  if ( ata) {
    if (!mpi_i) printf("# Measure speed of MPI_alltoall transactions:\n");
    for (i1=bsize/mpi_n;i1>0;i1>>=1) test_alltoall(i1);
  }
  free(buf2);
  free(buf1);
  MPI_Finalize();
  return 0;
}

/* P4-1.3GHz-linux     15.3us     390MB/s(16M)..840MB/s(64K)  (2)
   ES45-Tru64-5.1B      2.6us 5%  820MB/s          (2)
                        2.79us   1130MB/s 2%       (4)
        overload-2x    26.0us 4%  7.2MB/s 0.2% (8)
                       SR=Sendrecv               AA=Alltoall
   GS320-16:  16  12us 860MB/s(128k) (SR)  156us 880MB/s(128k) (AA)  runon 0,1
               8  12us 409MB/s(128K) (SR)   82us 373MB/s(128k)       # psr=0,1,2,3..3
               4   6us 373MB/s(128K) (SR)   23us 440MB/s(64K)  (AA)
               2   5us 260MB/s(128K) (SR)   11us 409MB/s(256K) (AA)
   GS1280-32:  2  3.1us 4%  715MB/s 4%  (slow 32CPUs/RAD)
               8  3.9us 3% 1960MB/s(512kB) 1560MB/s(1MB) 3%
              32  6.4us 4% 3930MB/s(32kB) 4040MB/s(64kB,128kB) 4160MB/s(256kB)
                           3500MB/s(512kB) 2500MB/s(1MB) 3%
   Altix330 numa_alloc_local MPI_via_loopback
     ToDo: interconnect-topology=star  # monitor interconnect-traffic: linkstat, dplace?
     MPT-MPI-1.2 (insmod xpmem)  MPI_USE_XPMEM=1 MPI_XPMEM_VERBOSE=1 ?
               2  2.09us 1.8GB/s(1M) 1.1GB/s(256K)  1node numa_alloc_local
               2  1.97us 8G(1M) 17G 15G 13G 12G 8G 5G 3G 2G 1G 300M ...
                  no div psr1,11
               4         1.6GB/s(1M) 1.6GB/s(256K)  4nodes
               8  3.58us 2.5GB/s(1M) 2.7GB/s(256K)  4nodes
              12  4.05us 3.8GB/s(1M) 3.4GB/s(256K)  6nodes numa_alloc_local
              12  4.05us 4.1GB/s(1M)  34GB/s(128K)  6nodes malloc lo
              12  4.05us 4.3GB/s(1M)  32GB/s(128K)  6nodes malloc xpmem
                  # MPI_Sendrecv                   MPI_Alltoall
               4  3.2us 4.2GB/s(1M) 16GB/s(128K)   12.4us  9GB/s(1M) 20GB/s(256K)  # running spinpack8
               4  3.1us 4.6GB/s(1M) 20GB/s(128K)   12.4us 11GB/s(1M) 23GB/s(512K)  # stopped spinpack8
                  (parallel-fluent-speed*3.5 14.7.08)
              12  3.9us 3.4GB/s(1M) 37GB/s(128K)   42.0us  6GB/s(1M) 26GB/s(512K)  # stopped spinpack8
   2DualAMD265:    2   23us 4%  660MB/s 4%  (last!) baldur
   4Xeon3GHz_tina  4  3.3us 4%  307MB/s 4%  Dec07 node004
   2*1G via .1G    2  160us 5%   15MB/s 3%  Dec07 node4+node012
                   4  191us 4%   28MB/s 3%  Dec07 node4+node012+...
                   8  206us 4%   63MB/s 3%  Dec07 node4+node012+...
   ToDo: show scaling speed(threads)
*/
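/* Usage sketch (the option names are taken from the option handling above;
   the concrete values are only examples, not recommendations):

     # bind tasks to cores 0-3 of each node (CFG_CPUSET=0x0f marks 4 cores,
     # rank i then runs on core i%4, cf. the sched_setaffinity block above)
     mpicc -O2 -D CFG_CPUSET=0x0f -o mpi_stress ../utils/mpi_stress.c

     # ring test with 2^26 byte buffers, at least 2 s per message size
     mpirun -np 8 ./mpi_stress --bsize 26 --mintime 2 | tee mpi_stress.log

     # same nodes, but measure MPI_Alltoall instead of MPI_Sendrecv rings
     mpirun -np 8 ./mpi_stress --alltoall --bsize 26 | tee -a mpi_stress.log
*/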