spmv_csr_scalar_kernel(const int num_rows ,   /* number of rows to process; work items at or beyond this index do nothing */
const int * ptr ,       /* row offsets into indices/data; entries [row] and [row+1] are read, so index num_rows must be valid */
const int * indices ,   /* for each stored entry, the position in x that it is multiplied with */
const float * data ,    /* stored entry values, parallel to indices */
const float * x,        /* input vector, read through indices */
float * y)              /* output vector; each row's product is ADDED to the value already in y[row] */
{
/*
 * Sparse matrix-vector product with one work item per matrix row:
 * the work item whose global id is `row` walks the stored entries
 * ptr[row] .. ptr[row+1]-1, accumulates data[jj] * x[indices[jj]],
 * and adds the total to y[row].
 *
 * NOTE(review): the return type and any __kernel / address-space
 * qualifiers are not visible in this excerpt -- confirm against the
 * full source before compiling.
 */
int row = get_global_id(0);   /* this work item's index in dimension 0 selects the row */
if(row < num_rows)            /* guard: the launch may contain more work items than rows */
{
float dot = 0;                /* private running sum for this row */
int row_start = ptr[row];     /* offset of the row's first stored entry */
int row_end = ptr[row+1];     /* one past the row's last stored entry */
for (int jj = row_start; jj < row_end; jj++)
{ dot += data[jj] * x[indices[jj]];   /* gather the x element addressed by this entry */
}
y[row] += dot;                /* accumulate, not overwrite: y keeps whatever it held before */
}
}
Above is the OpenCL code for multiplying a sparse matrix in CSR format by a column vector. It uses one global work item per for loop. Can anybody help me use two work items for each for loop? I am new to OpenCL and run into a lot of issues if I modify even the smallest thing. Please help me; this is part of my project. I have made it this parallel, but I want to make it more parallel. Please help me if you can.
A single work item executes the for loop from row_start to row_end. I want this row (that is, this for loop) to be further divided into two parts, each executed by a single work item. How do I go about accomplishing that?
This is what I could come up with, but it is returning the wrong output. Please help.
/*
 * CSR sparse-matrix x vector product (integer data) using two work items
 * per matrix row, for a matrix of NUM_ROWS (4) rows.
 *
 * Work items 2*r and 2*r+1 cooperate on row r.  Lane 0 accumulates the
 * row's stored entries at offsets rowptr[r], rowptr[r]+2, ...; lane 1
 * accumulates those at rowptr[r]+1, rowptr[r]+3, ...  Each lane keeps its
 * partial sum in a private variable.  Lane 1 then publishes its partial
 * sum through local memory, and after a work-group barrier lane 0 adds
 * the two halves and stores the row total.
 *
 * Parameters:
 *   colvector  dense input vector, read through index[]
 *   val        stored matrix entry values
 *   result     output vector; result[r] is overwritten with row r's
 *              product for r in [0, NUM_ROWS)
 *   index      column index of each stored entry, parallel to val
 *   rowptr     row offsets into val/index; rowptr[0..NUM_ROWS] are read
 *   sync       unused; kept so existing host code that sets six kernel
 *              arguments keeps working
 *
 * Launch requirements: one-dimensional NDRange with no global offset,
 * at least NUM_ROWS * 2 work items, and an even work-group size so both
 * lanes of a row land in the same work-group (barrier() only
 * synchronizes within a work-group).
 */
__kernel void mykernel(__global int* colvector,__global int* val,__global int* result,__global int* index,__global int* rowptr,__global int* sync)
{
    enum { NUM_ROWS = 4, LANES_PER_ROW = 2 };

    /* Scratch for lane 1's partial sums.  Must live in __local memory:
     * it is shared by the work-group and can be ordered with barrier(). */
    __local int partial[NUM_ROWS * LANES_PER_ROW];

    const int thread_id = (int)get_global_id(0);
    const int local_id  = (int)get_local_id(0);
    const int row       = thread_id / LANES_PER_ROW;
    const int lane      = thread_id % LANES_PER_ROW;
    const int active    = row < NUM_ROWS;

    (void)sync;

    /* Private accumulator: no other work item can touch it, so the
     * strided loop needs no synchronization. */
    int sum = 0;

    if (active) {
        const int row_start = rowptr[row];
        const int row_end   = rowptr[row + 1];

        for (int i = row_start + lane; i < row_end; i += LANES_PER_ROW) {
            sum += val[i] * colvector[index[i]];
        }

        /* Active work items have global id < 8 and local id <= global id,
         * so local_id is always a valid index into partial[]. */
        if (lane != 0) {
            partial[local_id] = sum;
        }
    }

    /* Every work item of the group must reach the barrier, so it sits
     * outside the `active` branch.  After it, lane 1's store is visible
     * to lane 0. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (active && lane == 0) {
        /* Plain store instead of zero-then-accumulate: result needs no
         * prior clearing, and no other work item writes this element. */
        result[row] = sum + partial[local_id + 1];
    }
}