How to synchronize cuda threads when they are in the same loop and we need to synchronize them to ex

Posted by Vickey on Stack Overflow See other posts from Stack Overflow or by Vickey
Published on 2010-05-12T07:03:39Z Indexed on 2010/05/12 7:44 UTC
Read the original article Hit count: 264

Filed under:
|
|
|

Hi all, I have written some code and now I want to run it on a CUDA GPU, but I'm new to thread synchronization, so please help me with this; it's a little urgent for me. Below is the code. I want LOOP 1 to be executed by all threads (that is the portion I want to take advantage of CUDA), and the remaining portion (everything other than LOOP 1) to be executed by only a single thread.

 /*
  * Work-stack driven construction of the tree stored in aloc_mem.
  *
  * master_Q is used as a stack of pending work items (fields q, parent, max,
  * level, valid) whose depth is *num_mas. Each pass pops the top item, picks
  * a point from its list that may be inserted at the current max_level,
  * records it as a child of `parent`, and pushes the points that are too far
  * away back onto the stack. The outer loop ends when *num_mas reaches 0.
  *
  * All list splicing and stack bookkeeping below depends on exact statement
  * order. The only data-parallel part is the distance pass marked LOOP 1:
  * each of its iterations writes to its own list node and to the shared
  * running maximum (maximum_dist) only.
  *
  * NOTE(review): il2 * log(x) is presumably a base-2 logarithm (il2 == 1/ln 2)
  * - confirm against the definition of il2.
  */
 do{
  /* Pop the top work item: its point list becomes the current point_set. */
  point_set = master_Q[(*num_mas) - 1].q;
  List* temp = point_set;
  List* pa = point_set;
  /* A non-zero stored max resets max_level to ceil(il2 * log(max)). */
  if(master_Q[num_mas[0] - 1].max)
   max_level =  (int) (ceilf(il2 * log(master_Q[num_mas[0] - 1].max)));
  *num_mas = (*num_mas) - 1;
  while(point_set){
   List* insert_ele = temp;
   /* Scan point_set for the first point that may be inserted at max_level.
      pa trails temp as its predecessor so that temp can be unlinked. */
   while(temp){
    insert_ele = temp;
    /* Insertable when its most recent distance is within 2^(max_level-1),
       or when we are at top_level. */
    if((insert_ele->dist[insert_ele->dist_index-1] <= pow(2, max_level-1)) || (top_level == max_level)){
     /* Unlink the chosen point from point_set. */
     if(point_set == temp){
      point_set = temp->next;
      pa  = temp->next;
     }
     else{
      pa->next = temp->next;
     }
     /* Ends the scan once this branch has finished. */
     temp = NULL;
     List* new_point_set = point_set;
     float maximum_dist = 0;
     /* Distances are only recomputed when the inserted point differs from
        the parent's point. */
     if(parent->p_index != insert_ele->point_index){
      List* tmp = new_point_set;
      float *b = &(data[(insert_ele->point_index)*point_len]);
      /* LOOP 1: for every remaining point, append its Euclidean distance to
         the inserted point to tmp->dist and track the largest distance. */
      while(tmp){
       float *c = &(data[(tmp->point_index)*point_len]);
       float sum = 0.;
       int j = 0;
       /* Two coordinates per step; stops before a pair would read past
          the end of the point. */
       for(; j + 1 < point_len; j+=2){
        float d1 = b[j] - c[j];
        float d2 = b[j+1] - c[j+1];
        d1 *= d1;
        d2 *= d2;
        sum = sum + d1 + d2;
       }
       /* Odd point_len: handle the last coordinate on its own instead of
          reading b[point_len] / c[point_len]. */
       if(j < point_len){
        float d1 = b[j] - c[j];
        sum = sum + d1 * d1;
       }
       tmp->dist[tmp->dist_index] = sqrt(sum);
       if(maximum_dist < tmp->dist[tmp->dist_index])
        maximum_dist = tmp->dist[tmp->dist_index];
       tmp->dist_index = tmp->dist_index+1;
       tmp = tmp->next;
      }
      max_distance = maximum_dist;
     }
     /* Repeatedly split the remaining points into "near" (kept in
        new_point_set) and "far" (pushed onto master_Q), descending one or
        more levels each round, until no near point has non-zero distance. */
     while(new_point_set || insert_ele){
      List* far, *par, *tmp, *tmp_new;
      far = NULL;
      tmp = new_point_set;
      tmp_new = NULL;
      float level_dist = pow(2, max_level-1);
      /* maxdist: largest distance among far points.
         maxp: largest distance among near points. */
      float maxdist = 0, maxp = 0;
      while(tmp){
       if(tmp->dist[(tmp->dist_index)-1] > level_dist){
        if(maxdist < tmp->dist[tmp->dist_index-1])
         maxdist = tmp->dist[tmp->dist_index-1];
        /* Unlink tmp from new_point_set; par is the last kept node. */
        if(tmp == new_point_set){
         new_point_set = tmp->next;
         par = tmp->next;
        }
        else{
         par->next = tmp->next;
        }
        /* Append tmp to the far list (tmp_new is its tail). */
        if(far == NULL){
         far = tmp;
         tmp_new = far;
        }
        else{
         tmp_new->next = tmp;
         tmp_new = tmp;
        }
        /* Drop the newest distance entry of a far point when the inserted
           point is not the parent's own point. */
        if(parent->p_index != insert_ele->point_index)
         tmp->dist_index = tmp->dist_index - 1;
        /* Advance before terminating the far list at its new tail. */
        tmp = tmp->next;
        tmp_new->next = NULL;
       }
       else{
        par = tmp;
        if(maxp < tmp->dist[(tmp->dist_index)-1])
         maxp = tmp->dist[(tmp->dist_index)-1];
        tmp = tmp->next;
       }

      }
      if(0 == maxp){
       /* No near point has a non-zero distance: emit the inserted point as
          a node under parent, then every remaining near point as a child
          of that new node. */
       tmp = new_point_set;
       aloc_mem[*tree_index].p_index = insert_ele->point_index;
       aloc_mem[*tree_index].no_child = 0;
       aloc_mem[*tree_index].level = max_level--;
       parent->children_index[parent->no_child++] = *tree_index;
       parent = &(aloc_mem[*tree_index]);
       tree_index[0] = tree_index[0]+1;
       while(tmp){
        aloc_mem[*tree_index].p_index = tmp->point_index;
        aloc_mem[(*tree_index)].no_child = 0;
        aloc_mem[(*tree_index)].level = master_Q[(*cur_count_Q)-1].level;
        parent->children_index[parent->no_child] = *tree_index;
        parent->no_child = parent->no_child + 1;
        (*tree_index)++;
        tmp = tmp->next;
       }
       cur_count_Q[0] = cur_count_Q[0]-1;
       new_point_set = NULL;
      }
      /* Push the far list (possibly NULL) as a new work item. */
      master_Q[*num_mas].q = far;
      master_Q[*num_mas].parent = parent;
      master_Q[*num_mas].valid = true;
      master_Q[*num_mas].max = maxdist;
      master_Q[*num_mas].level = max_level;

      num_mas[0] = num_mas[0]+1;
      if(0 != maxp){
       /* Near points remain: emit the inserted point as a node, make it the
          new parent and descend. */
       aloc_mem[*tree_index].p_index = insert_ele->point_index;
       aloc_mem[*tree_index].no_child = 0;
       aloc_mem[*tree_index].level = max_level;
       parent->children_index[parent->no_child++] = *tree_index;
       parent = &(aloc_mem[*tree_index]);
       tree_index[0] = tree_index[0]+1;
       /* Jump straight to the level implied by maxp when that skips levels,
          otherwise go down exactly one level. maxp is known to be non-zero
          here, so no further check is needed. */
       int new_level = ((int) (ceilf(il2 * log(maxp)))) +1;
       if (new_level < (max_level-1))
        max_level = new_level;
       else
        max_level--;
      }
      else{
       /* new_point_set was cleared above; clearing insert_ele too ends
          this loop. */
       insert_ele = NULL;
      }
     }

    }
    else{
     /* Reached the end of the list without an insertable point: push the
        whole point_set back as a work item.
        NOTE(review): unlike the push above, .max is not written here, so it
        keeps whatever value the slot held before - confirm this is intended. */
     if(NULL == temp->next){
      master_Q[*num_mas].q = point_set;
      master_Q[*num_mas].parent = parent;
      master_Q[*num_mas].valid = true;
      master_Q[*num_mas].level = max_level;
      num_mas[0] = num_mas[0]+1;
     }
     pa = temp;
     temp = temp->next;
    }
   }
   /* Merge the top work item into the one below it by prepending its nodes
      one at a time, then drop the top item. */
   if((*num_mas) > 1){
    List *temp2 = master_Q[(*num_mas)-1].q;
    while(temp2){
     List* temp3 = master_Q[(*num_mas)-2].q;
     master_Q[(*num_mas)-2].q = temp2;
     if((master_Q[(*num_mas)-1].parent)->p_index != (master_Q[(*num_mas)-2].parent)->p_index){
      temp2->dist_index = temp2->dist_index - 1;
     }
     /* Advance first: the next statement overwrites this node's next. */
     temp2 = temp2->next;
     master_Q[(*num_mas)-2].q->next = temp3;
    }
    num_mas[0] = num_mas[0]-1;
   }
   /* Pop the next work item and restore its parent and level. */
   point_set = master_Q[(*num_mas)-1].q;
   temp = point_set;
   pa = point_set;
   parent = master_Q[(*num_mas)-1].parent;
   max_level = master_Q[(*num_mas)-1].level;
   /* Lower max_level to ceil(il2 * log(max)) + 1 when that is smaller. */
   if(master_Q[(*num_mas)-1].max)
    if( max_level > ((int) (ceilf(il2 * log(master_Q[(*num_mas)-1].max)))) +1)
     max_level = ((int) (ceilf(il2 * log(master_Q[(*num_mas)-1].max)))) +1;
   num_mas[0] = num_mas[0]-1;
  }

 }while(*num_mas > 0);

© Stack Overflow or respective owner

Related posts about cuda

Related posts about gpu