CUDA: Memory copy to GPU 1 is slower in a multi-GPU setup
- by zenna
My company has a setup of two GTX 295, so a total of 4 GPUs in a server, and we have several servers.
We noticed that GPU 1 specifically was slow in comparison to GPUs 0, 2 and 3, so I wrote a little speed test to help find the cause of the problem.
//#include <stdio.h>
//#include <stdlib.h>
//#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <cutil.h>
/*
 * Synthetic workload used only to produce a measurable kernel run time.
 *
 * Each thread picks one element of d_data from its flat 1-D global index and
 * rewrites that element 10000 times: first with float(iter * 2.2), then
 * incremented by 3.3. The literals 2.2 and 3.3 are double literals and are
 * kept as such on purpose so the measured work is unchanged.
 *
 * There is no bounds check: the caller must make sure d_data holds at least
 * gridDim.x * blockDim.x floats.
 */
__global__ void test_kernel(float *d_data) {
    // Flat global thread index for a 1-D grid of 1-D blocks.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    int iter = 0;
    while (iter < 10000) {
        d_data[idx] = float(iter * 2.2);
        d_data[idx] += 3.3;
        ++iter;
    }
}
// Abort with file/line and the CUDA error string when a runtime call fails,
// so a failed call can never silently turn into a meaningless timing.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,      \
                         __LINE__, cudaGetErrorString(err_));                \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

/*
 * Per-GPU speed test: measures, with CUDA events, how long each of the
 * following takes on the selected device and prints the three timings in
 * milliseconds:
 *   1. cudaMalloc of a 200,000,000-float buffer,
 *   2. a host-to-device cudaMemcpy of that buffer,
 *   3. one launch of test_kernel with 1000 blocks of 256 threads.
 *
 * Usage: <program> [device_index]
 *   device_index  optional CUDA device to test; defaults to 0.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on an invalid device index
 * or on any CUDA runtime error.
 */
int main(int argc, char* argv[])
{
    int deviceCount = 0;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));

    // SELECT GPU HERE: default is device 0, or pass the index as argv[1].
    int device = 0;
    if (argc > 1) {
        char* end = NULL;
        const long requested = std::strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' ||
            requested < 0 || requested >= deviceCount) {
            std::fprintf(stderr, "Invalid device index '%s' (%d CUDA device(s) found)\n",
                         argv[1], deviceCount);
            return EXIT_FAILURE;
        }
        device = static_cast<int>(requested);
    } else if (deviceCount < 1) {
        std::fprintf(stderr, "No CUDA devices found\n");
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaSetDevice(device));

    const unsigned int num_vals = 200000000;
    const size_t num_bytes = sizeof(float) * num_vals;

    // Pageable host buffer filled with a simple ramp.
    float *h_data = new float[num_vals];
    for (unsigned int i = 0; i < num_vals; ++i) {
        h_data[i] = float(i);
    }

    // One pair of events is reused for all three measurements.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // --- 1. Device allocation -------------------------------------------
    // Only cudaMalloc is timed here. The previous version also issued a
    // host-to-device copy into d_data while it was still NULL, before the
    // allocation; that copy could not succeed and has been removed.
    float *d_data = NULL;
    float malloc_timer = 0.0f;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMalloc((void**)&d_data, num_bytes));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&malloc_timer, start, stop));

    // --- 2. Host-to-device copy -----------------------------------------
    float mem_timer = 0.0f;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, num_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&mem_timer, start, stop));

    // --- 3. Kernel ------------------------------------------------------
    // 1000 * 256 = 256,000 threads, each touching one element; this is well
    // inside the num_vals-element buffer, which the kernel relies on because
    // it performs no bounds check.
    float kernel_timer = 0.0f;
    CUDA_CHECK(cudaEventRecord(start, 0));
    test_kernel<<<1000,256>>>(d_data);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors show up here
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&kernel_timer, start, stop));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    printf("cudaMalloc took %f ms\n",malloc_timer);
    printf("Copy to the GPU took %f ms\n",mem_timer);
    printf("Test Kernel took %f ms\n",kernel_timer);

    // Copy the results back (untimed), then release both buffers.
    CUDA_CHECK(cudaMemcpy(h_data, d_data, num_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_data));
    delete[] h_data;
    return 0;
}
The results are
GPU0
cudaMalloc took 0.908640 ms
Copy to the GPU took 296.058777 ms
Test Kernel took 326.721283 ms
GPU1
cudaMalloc took 0.913568 ms
Copy to the GPU took 663.182251 ms
Test Kernel took 326.710785 ms
GPU2
cudaMalloc took 0.925600 ms
Copy to the GPU took 296.915039 ms
Test Kernel took 327.127930 ms
GPU3
cudaMalloc took 0.920416 ms
Copy to the GPU took 296.968384 ms
Test Kernel took 327.038696 ms
As you can see, the cudaMemcpy to the GPU takes well over double the time on GPU 1. This is consistent across all our servers: it is always GPU 1 that is slow.
Any ideas why this may be?
All servers are running Windows XP.