Search Results

Search found 13 results on 1 pages for 'cudamalloc'.

Page 1/1 | 1 

  • How CudaMalloc work?

    - by kitw
    I am trying to modify the imageDenosing class in CUDA SDK, I need to repeat the filter many time incase to capture the time. But my code doesn't work properly. //start __global__ void F1D(TColor *image,int imageW,int imageH, TColor *buffer) { const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int iy = blockDim.y * blockIdx.y + threadIdx.y; if(iy != 0 && iy < imageH-1 && ix < imageW) { float4 fresult = get_color(image[imageW * iy + ix]); float4 fresult4 = get_color(image[imageW * (iy+1) + ix]); float4 fresult5 = get_color(image[imageW * (iy-1) + ix]); float4 fresult7; fresult7.x = fresult.x*0.5+fresult4.x*.25+fresult5.x*.25; fresult7.y = fresult.y*0.5+fresult4.y*.25+fresult5.y*.25; fresult7.z = fresult.z*0.5+fresult4.z*.25+fresult5.z*.25; buffer[imageW * iy + ix] = make_color(fresult7.x,fresult7.y,fresult7.z,0); } image[imageW * iy + ix] = buffer[imageW * iy + ix]; //should be use cudaMemcpy, But it fails } //extern extern "C" void cuda_F1D(TColor *dst, int imageW, int imageH) { dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); dim3 grid(iDivUp(imageW, BLOCKDIM_X), iDivUp(imageH, BLOCKDIM_Y)); Copy<<<grid, threads>>>(dst, imageW, imageH); size_t size = imageW*imageH*sizeof(TColor); TColor *host =(TColor*) malloc(size); TColor *dst2; //TColor *dst3; //TColor *d = new TColor(imageW*imageH*sizeof(TColor)); dim3 threads2(imageW,1); dim3 grid2(iDivUp(imageW, imageW), iDivUp(imageH, 1)); *for(int i = 0;i<1;i++) { cudaMalloc( (void **)&dst2, size); cudaMemcpy(dst2, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice); //cudaMalloc( (void **)&dst3, imageW*imageH*sizeof(TColor)); //cudaMemcpy(dst3, dst, imageW*imageH*sizeof(TColor),cudaMemcpyHostToDevice); F1D<<<grid2, threads2>>>(dst, imageW, imageH,dst2); //cudaMemcpy(dst, dst3, imageW*imageH*sizeof(TColor),cudaMemcpyDeviceToHost); cudaFree(dst2); }* } This code works, but cant synchronise the array of image. and lead to many synchronise problem

    Read the article

  • Beginner error in CUDA

    - by Dimitri
    Hi folks, first all i want to wish you a merry christmas. I am writing a small program in CUDA and i have the following errors : contraste.cu(167): error: calling a host function from a __device__/__global__ function is not allowed I don't understand why. Can you please help me and show me my errors. It seems that my program is correct. Here is a the bunch of code causing the problems : __global__ void kernel_contraste(float power, unsigned char tab_in[], unsigned char tab_out[], int nbl, int nbc) { int x = threadIdx.x; printf("I am the thread %d\n", x); } Part of my main program : unsigned char *dimg, *dimg_res; ..... cudaMalloc((void **)dimg, h * w * sizeof(char)); cudaMemcpy(dimg, r.data, h*w*sizeof(char), cudaMemcpyHostToDevice); cudaMalloc((void **)dimg_res, h*w*sizeof(char)); dim3 nbThreadparBloc(256); dim3 numblocs(1); kernel_contraste<<<numblocs, nbThreadparBloc >>>(puissance, dimg, dimg_res, h, w); cudaThreadSynchronize(); ..... cudaFree(dimg); cudaFree(dimg_res); The line 167 is the line where i call the printf in function kernel_contraste. For information, this program takes an image as an input( a sun Rasterfile ) and a power then it calculates the contraste of that image. Thanks !!

    Read the article

  • CUDA: Memory copy to GPU 1 is slower in multi-GPU

    - by zenna
    My company has a setup of two GTX 295, so a total of 4 GPUs in a server, and we have several servers. We GPU 1 specifically was slow, in comparison to GPU 0, 2 and 3 so I wrote a little speed test to help find the cause of the problem. //#include <stdio.h> //#include <stdlib.h> //#include <cuda_runtime.h> #include <iostream> #include <fstream> #include <sstream> #include <string> #include <cutil.h> __global__ void test_kernel(float *d_data) { int tid = blockDim.x*blockIdx.x + threadIdx.x; for (int i=0;i<10000;++i) { d_data[tid] = float(i*2.2); d_data[tid] += 3.3; } } int main(int argc, char* argv[]) { int deviceCount; cudaGetDeviceCount(&deviceCount); int device = 0; //SELECT GPU HERE cudaSetDevice(device); cudaEvent_t start, stop; unsigned int num_vals = 200000000; float *h_data = new float[num_vals]; for (int i=0;i<num_vals;++i) { h_data[i] = float(i); } float *d_data = NULL; float malloc_timer; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord( start, 0 ); cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice); cudaMalloc((void**)&d_data, sizeof(float)*num_vals); cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &malloc_timer, start, stop ); cudaEventDestroy( start ); cudaEventDestroy( stop ); float mem_timer; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord( start, 0 ); cudaMemcpy(d_data, h_data, sizeof(float)*num_vals,cudaMemcpyHostToDevice); cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &mem_timer, start, stop ); cudaEventDestroy( start ); cudaEventDestroy( stop ); float kernel_timer; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord( start, 0 ); test_kernel<<<1000,256>>>(d_data); cudaEventRecord( stop, 0 ); cudaEventSynchronize( stop ); cudaEventElapsedTime( &kernel_timer, start, stop ); cudaEventDestroy( start ); cudaEventDestroy( stop ); printf("cudaMalloc took %f ms\n",malloc_timer); printf("Copy to the GPU took %f ms\n",mem_timer); printf("Test Kernel took %f ms\n",kernel_timer); cudaMemcpy(h_data,d_data, sizeof(float)*num_vals,cudaMemcpyDeviceToHost); delete[] h_data; return 0; } The results are GPU0 cudaMalloc took 0.908640 ms Copy to the GPU took 296.058777 ms Test Kernel took 326.721283 ms GPU1 cudaMalloc took 0.913568 ms Copy to the GPU took[b] 663.182251 ms[/b] Test Kernel took 326.710785 ms GPU2 cudaMalloc took 0.925600 ms Copy to the GPU took 296.915039 ms Test Kernel took 327.127930 ms GPU3 cudaMalloc took 0.920416 ms Copy to the GPU took 296.968384 ms Test Kernel took 327.038696 ms As you can see, the cudaMemcpy to the GPU is well double the amount of time for GPU1. This is consistent between all our servers, it is always GPU1 that is slow. Any ideas why this may be? All servers are running windows XP.

    Read the article

  • CUDA: cudaMemcpy only works in emulation mode.

    - by Jason
    I am just starting to learn how to use CUDA. I am trying to run some simple example code: float *ah, *bh, *ad, *bd; ah = (float *)malloc(sizeof(float)*4); bh = (float *)malloc(sizeof(float)*4); cudaMalloc((void **) &ad, sizeof(float)*4); cudaMalloc((void **) &bd, sizeof(float)*4); ... initialize ah ... /* copy array on device */ cudaMemcpy(ad,ah,sizeof(float)*N,cudaMemcpyHostToDevice); cudaMemcpy(bd,ad,sizeof(float)*N,cudaMemcpyDeviceToDevice); cudaMemcpy(bh,bd,sizeof(float)*N,cudaMemcpyDeviceToHost); When I run in emulation mode (nvcc -deviceemu) it runs fine (and actually copies the array). But when I run it in regular mode, it runs w/o error, but never copies the data. It's as if the cudaMemcpy lines are just ignored. What am I doing wrong? Thank you very much, Jason

    Read the article

  • CUDA threads output different value

    - by kar
    HAi, I wrote a cuda program , i have given the kernel function below. The device memory is allocated through CUDAMalloc(); the value of *md is 10; __global__ void add(int *md) { int x,oper=2; x=threadIdx.x; * md = *md*oper; if(x==1) { *md = *md*0; } if(x==2) { *md = *md*10; } if(x==3) { *md = *md+1; } if(x==4) { *md = *md-1; } } executed the above code add<<<1,5>>(*md) , add<<<1,4>>>(*md) for <<<1,5>>> the output is 19 for <<<1,4>>> the output is 21 1) I have doubt that cudaMalloc() will allocate in device main memory? 2) Why the last thread alone is executed always in the above program? Thank you

    Read the article

  • CUDA memory transfer issue

    - by Vaibhav Sundriyal
    I am trying to execute a code which first transfers data from CPU to GPU memory and vice-versa. In spite of increasing the volume of data, the data transfer time remains the same as if no data transfer is actually taking place. I am posting the code. #include <stdio.h> /* Core input/output operations */ #include <stdlib.h> /* Conversions, random numbers, memory allocation, etc. */ #include <math.h> /* Common mathematical functions */ #include <time.h> /* Converting between various date/time formats */ #include <cuda.h> /* CUDA related stuff */ #include <sys/time.h> __global__ void device_volume(float *x_d,float *y_d) { int index = blockIdx.x * blockDim.x + threadIdx.x; } int main(void) { float *x_h,*y_h,*x_d,*y_d,*z_h,*z_d; long long size=9999999; long long nbytes=size*sizeof(float); timeval t1,t2; double et; x_h=(float*)malloc(nbytes); y_h=(float*)malloc(nbytes); z_h=(float*)malloc(nbytes); cudaMalloc((void **)&x_d,size*sizeof(float)); cudaMalloc((void **)&y_d,size*sizeof(float)); cudaMalloc((void **)&z_d,size*sizeof(float)); gettimeofday(&t1,NULL); cudaMemcpy(x_d, x_h, nbytes, cudaMemcpyHostToDevice); cudaMemcpy(y_d, y_h, nbytes, cudaMemcpyHostToDevice); cudaMemcpy(z_d, z_h, nbytes, cudaMemcpyHostToDevice); gettimeofday(&t2,NULL); et = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms et += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms printf("\n %ld\t\t%f\t\t",nbytes,et); et=0.0; //printf("%f %d\n",seconds,CLOCKS_PER_SEC); // launch a kernel with a single thread to greet from the device //device_volume<<<1,1>>>(x_d,y_d); gettimeofday(&t1,NULL); cudaMemcpy(x_h, x_d, nbytes, cudaMemcpyDeviceToHost); cudaMemcpy(y_h, y_d, nbytes, cudaMemcpyDeviceToHost); cudaMemcpy(z_h, z_d, nbytes, cudaMemcpyDeviceToHost); gettimeofday(&t2,NULL); et = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms et += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms printf("%f\n",et); cudaFree(x_d); cudaFree(y_d); cudaFree(z_d); return 0; } Can anybody help me with this issue? Thanks

    Read the article

  • CUDA linking error - Visual Express 2008 - nvcc fatal due to (null) configuration file

    - by Josh
    Hi, I've been searching extensively for a possible solution to my error for the past 2 weeks. I have successfully installed the Cuda 64-bit compiler (tools) and SDK as well as the 64-bit version of Visual Studio Express 2008 and Windows 7 SDK with Framework 3.5. I'm using windows XP 64-bit. I have confirmed that VSE is able to compile in 64-bit as I have all of the 64-bit options available to me using the steps on the following website: (since Visual Express does not inherently include the 64-bit packages) http://jenshuebel.wordpress.com/2009/02/12/visual-c-2008-express-edition-and-64-bit-targets/ I have confirmed the 64-bit compile ability since the "x64" is available from the pull-down menu under "Tools-Options-VC++ Directories" and compiling in 64-bit does not result in the entire project being "skipped". I have included all the needed directories for 64-bit cuda tools, 64 SDK and Visual Express (\VC\bin\amd64). Here's the error message I receive when trying to compile in 64-bit: 1>------ Build started: Project: New, Configuration: Release x64 ------ 1>Compiling with CUDA Build Rule... 1>"C:\CUDA\bin64\nvcc.exe" -arch sm_10 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -maxrregcount=32 --compile -o "x64\Release\template.cu.obj" "c:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\src\CUDA_Walkthrough_DeviceKernels\template.cu" 1>nvcc fatal : Visual Studio configuration file '(null)' could not be found for installation at 'C:/Program Files (x86)/Microsoft Visual Studio 9.0/VC/bin/../..' 1>Linking... 1>LINK : fatal error LNK1181: cannot open input file '.\x64\Release\template.cu.obj' 1>Build log was saved at "file://c:\Documents and Settings\Administrator\My Documents\Visual Studio 2008\Projects\New\New\x64\Release\BuildLog.htm" 1>New - 1 error(s), 0 warning(s) ========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ========== Here's the simple code I'm trying to compile/run in 64-bit: #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <cuda.h> void mypause () { printf ( "Press [Enter] to continue . . ." ); fflush ( stdout ); getchar(); } __global__ void VecAdd1_Kernel(float* A, float* B, float* C, int N) { int i = blockDim.x*blockIdx.x+threadIdx.x; if (i<N) C[i] = A[i] + B[i]; //result should be a 16x1 array of 250s } __global__ void VecAdd2_Kernel(float* B, float* C, int N) { int i = blockDim.x*blockIdx.x+threadIdx.x; if (i<N) C[i] = C[i] + B[i]; //result should be a 16x1 array of 400s } int main() { int N = 16; float A[16];float B[16]; size_t size = N*sizeof(float); for(int i=0; i<N; i++) { A[i] = 100.0; B[i] = 150.0; } // Allocate input vectors h_A and h_B in host memory float* h_A = (float*)malloc(size); float* h_B = (float*)malloc(size); float* h_C = (float*)malloc(size); //Initialize Input Vectors memset(h_A,0,size);memset(h_B,0,size); h_A = A;h_B = B; printf("SUM = %f\n",A[1]+B[1]); //simple check for initialization //Allocate vectors in device memory float* d_A; cudaMalloc((void**)&d_A,size); float* d_B; cudaMalloc((void**)&d_B,size); float* d_C; cudaMalloc((void**)&d_C,size); //Copy vectors from host memory to device memory cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice); cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice); //Invoke kernel int threadsPerBlock = 256; int blocksPerGrid = (N+threadsPerBlock-1)/threadsPerBlock; VecAdd1(blocksPerGrid, threadsPerBlock,d_A,d_B,d_C,N); VecAdd2(blocksPerGrid, threadsPerBlock,d_B,d_C,N); //Copy results from device memory to host memory //h_C contains the result in host memory cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost); for(int i=0; i<N; i++) //output result from the kernel "VecAdd" { printf("%f ", h_C[i] ); printf("\n"); } printf("\n"); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); free(h_A); free(h_B); free(h_C); mypause(); return 0; }

    Read the article

  • CUDA 4.1 Update

    - by N0xus
    I'm currently working on porting a particle system to update on the GPU via the use of CUDA. With CUDA, I've already passed over the required data I need to the GPU and allocated and copied the date via the host. When I build the project, it all runs fine, but when I run it, the project says I need to allocate my h_position pointer. This pointer is my host pointer and is meant to hold the data. I know I need to pass in the current particle position to the required cudaMemcpy call and they are currently stored in a list with a for loop being created and interated for each particle calling the following line of code: m_particleList[i].positionY = m_particleList[i].positionY - (m_particleList[i].velocity * frameTime * 0.001f); My current host side cuda code looks like this: float* h_position; // Your host pointer. This holds the data (I assume it's already filled with the data.) float* d_position; // Your device pointer, we will allocate and fill this float* d_velocity; float* d_time; int threads_per_block = 128; // You should play with this value int blocks = m_maxParticles/threads_per_block + ( (m_maxParticles%threads_per_block)?1:0 ); const int N = 10; size_t size = N * sizeof(float); cudaMalloc( (void**)&d_position, m_maxParticles * sizeof(float) ); cudaMemcpy( d_position, h_position, m_maxParticles * sizeof(float), cudaMemcpyHostToDevice); Both of which were / can be found inside my UpdateParticle() method. I had originally thought it would be a simple case of changing the h_position variable in the cudaMemcpy to m_particleList[i] but then I get the following error: no suitable conversion function from "ParticleSystemClass::ParticleType" to "const void *" exists I've probably messed up somewhere, but could someone please help fix the issues I'm facing. Everything else seems to running fine, it's just when I try to run the program that certain things hit the fan.

    Read the article

  • Transferring data from 2d Dynamic array in C to CUDA and back

    - by Soumya
    I have a dynamically declared 2D array in my C program, the contents of which I want to transfer to a CUDA kernel for further processing. Once processed, I want to populate the dynamically declared 2D array in my C code with the CUDA processed data. I am able to do this with static 2D C arrays but not with dynamically declared C arrays. Any inputs would be welcome! I mean the dynamic array of dynamic arrays. The test code that I have written is as below. #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <conio.h> #include <math.h> #include <stdlib.h> const int nItt = 10; const int nP = 5; __device__ int d_nItt = 10; __device__ int d_nP = 5; __global__ void arr_chk(float *d_x_k, float *d_w_k, int row_num) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; int index1 = (row_num * d_nP) + index; if ( (index1 >= row_num * d_nP) && (index1 < ((row_num +1)*d_nP))) //Modifying only one row data pertaining to one particular iteration { d_x_k[index1] = row_num * d_nP; d_w_k[index1] = index; } } float **mat_create2(int r, int c) { float **dynamicArray; dynamicArray = (float **) malloc (sizeof (float)*r); for(int i=0; i<r; i++) { dynamicArray[i] = (float *) malloc (sizeof (float)*c); for(int j= 0; j<c;j++) { dynamicArray[i][j] = 0; } } return dynamicArray; } /* Freeing memory - here only number of rows are passed*/ void cleanup2d(float **mat_arr, int x) { int i; for(i=0; i<x; i++) { free(mat_arr[i]); } free(mat_arr); } int main() { //float w_k[nItt][nP]; //Static array declaration - works! //float x_k[nItt][nP]; // if I uncomment this dynamic declaration and comment the static one, it does not work..... float **w_k = mat_create2(nItt,nP); float **x_k = mat_create2(nItt,nP); float *d_w_k, *d_x_k; // Device variables for w_k and x_k int nblocks, blocksize, nthreads; for(int i=0;i<nItt;i++) { for(int j=0;j<nP;j++) { x_k[i][j] = (nP*i); w_k[i][j] = j; } } for(int i=0;i<nItt;i++) { for(int j=0;j<nP;j++) { printf("x_k[%d][%d] = %f\t",i,j,x_k[i][j]); printf("w_k[%d][%d] = %f\n",i,j,w_k[i][j]); } } int size1 = nItt * nP * sizeof(float); printf("\nThe array size in memory bytes is: %d\n",size1); cudaMalloc( (void**)&d_x_k, size1 ); cudaMalloc( (void**)&d_w_k, size1 ); if((nP*nItt)<32) { blocksize = nP*nItt; nblocks = 1; } else { blocksize = 32; // Defines the number of threads running per block. Taken equal to warp size nthreads = blocksize; nblocks = ceil(float(nP*nItt) / nthreads); // Calculated total number of blocks thus required } for(int i = 0; i< nItt; i++) { cudaMemcpy( d_x_k, x_k, size1,cudaMemcpyHostToDevice ); //copy of x_k to device cudaMemcpy( d_w_k, w_k, size1,cudaMemcpyHostToDevice ); //copy of w_k to device arr_chk<<<nblocks, blocksize>>>(d_x_k,d_w_k,i); cudaMemcpy( x_k, d_x_k, size1, cudaMemcpyDeviceToHost ); cudaMemcpy( w_k, d_w_k, size1, cudaMemcpyDeviceToHost ); } printf("\nVerification after return from gpu\n"); for(int i = 0; i<nItt; i++) { for(int j=0;j<nP;j++) { printf("x_k[%d][%d] = %f\t",i,j,x_k[i][j]); printf("w_k[%d][%d] = %f\n",i,j,w_k[i][j]); } } cudaFree( d_x_k ); cudaFree( d_w_k ); cleanup2d(x_k,nItt); cleanup2d(w_k,nItt); getch(); return 0;

    Read the article

  • CUDA Global Memory, Where is it?

    - by gamerx
    I understand that in CUDA's memory hierachy, we have things like shared memory, texture memory, constant memory, registers and of course the global memory which we allocate using cudaMalloc(). I've been searching through whatever documentations I can find but I have yet to come across any that explicitly explains what is the global memory. I believe that the global memory allocated is on the GDDR of graphics card itself and not the RAM that is shared with the CPU since one of the documentations did state that the pointer cannot be dereferenced by the host side. Am I right?

    Read the article

  • CUDA error message : unspecified launch failure

    - by user1297065
    I received the error message "unspecified launch failure" in following part. off_t *matches_position; ...... cudaMalloc ( (void **) &mat_position, sizeof(off_t)*10); ...... cudaMemcpy (mat_position, matches_position, sizeof(off_t)*10, cudaMemcpyHostToDevice ); ...... err=cudaMemcpy (matches_position, mat_position, sizeof(off_t)*10, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess) { printf("\n3 %s\n", cudaGetErrorString(err)); } Do you know why this error message is reported??

    Read the article

  • CUDA, more threads for same work = Longer run time despite better occupancy, Why?

    - by zenna
    I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance. I created the following program to illustrate the problem: #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> __global__ void less_threads(float * d_out) { int num_inliers; for (int j=0;j<800;++j) { //Do 12 computations num_inliers += threadIdx.x*1; num_inliers += threadIdx.x*2; num_inliers += threadIdx.x*3; num_inliers += threadIdx.x*4; num_inliers += threadIdx.x*5; num_inliers += threadIdx.x*6; num_inliers += threadIdx.x*7; num_inliers += threadIdx.x*8; num_inliers += threadIdx.x*9; num_inliers += threadIdx.x*10; num_inliers += threadIdx.x*11; num_inliers += threadIdx.x*12; } if (threadIdx.x == -1) d_out[blockIdx.x*blockDim.x+threadIdx.x] = num_inliers; } __global__ void more_threads(float *d_out) { int num_inliers; for (int j=0;j<800;++j) { // Do 4 computations num_inliers += threadIdx.x*1; num_inliers += threadIdx.x*2; num_inliers += threadIdx.x*3; num_inliers += threadIdx.x*4; } if (threadIdx.x == -1) d_out[blockIdx.x*blockDim.x+threadIdx.x] = num_inliers; } int main(int argc, char* argv[]) { float *d_out = NULL; cudaMalloc((void**)&d_out,sizeof(float)*25000); more_threads<<<780,128>>>(d_out); less_threads<<<780,32>>>(d_out); return 0; } Note both kernels should do the same amount of work in total, the (if threadIdx.x == -1 is a trick to stop the compiler optimising everything out and leaving an empty kernel). The work should be the same as more_threads is using 4 times as many threads but with each thread doing 4 times less work. Significant results form the profiler results are as followsL: more_threads: GPU runtime = 1474 us,reg per thread = 6,occupancy=1,branch=83746,divergent_branch = 26,instructions = 584065,gst request=1084552 less_threads: GPU runtime = 921 us,reg per thread = 14,occupancy=0.25,branch=20956,divergent_branch = 26,instructions = 312663,gst request=677381 As I said previously, the run time of the kernel using more threads is longer, this could be due to the increased number of instructions. Why are there more instructions? Why is there any branching, let alone divergent branching, considering there is no conditional code? Why are there any gst requests when there is no global memory access? What is going on here! Thanks

    Read the article

  • How to copy the memeory allocated in device function back to main memory

    - by xhe8
    I have a CUDA program containing a host function and a device function Execute(). In the host function, I allocate a global memory output which will then be passed to the device function and used to store the address of the global memory allocated within the device function. I want to access the in-kernel allocated memory in the host function. The following is the code: #include <stdio.h> typedef struct { int * p; int num; } Structure_A; \__global__ void Execute(Structure_A *output); int main(){ Structure_A *output; cudaMalloc((void***)&output,sizeof(Structure_A)*1); dim3 dimBlockExecute(1,1); dim3 dimGridExecute(1,1); Execute<<<dimGridExecute,dimBlockExecute>>>(output); Structure_A * output_cpu; int * p_cpu; cudaError_t err; output_cpu= (Structure_A*)malloc(1); err=cudaMemcpy(output_cpu,output,sizeof(Structure_A),cudaMemcpyDeviceToHost); if( err != cudaSuccess) { printf("CUDA error a: %s\n", cudaGetErrorString(err)); exit(-1); } p_cpu=(int *)malloc(1); err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost); if( err != cudaSuccess) { printf("CUDA error b: %s\n", cudaGetErrorString(err)); exit(-1); } printf("output=(%d,%d)\n",output_cpu[0].num,p_cpu[0]); return 0; } \__global__ void Execute(Structure_A *output){ int thid=threadIdx.x; output[thid].p= (int*)malloc(thid+1); output[thid].num=(thid+1); output[thid].p[0]=5; } I can compile the program. But when I run it, I got a error showing that there is a invalid argument in the following memory copy function. "err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);" CUDA version is 4.2. CUDA card: Tesla C2075 OS: x86_64 GNU/Linux

    Read the article

1