Sorting (small) arrays by key in CUDA -


i'm trying write function takes block of unsorted key/value pairs such as

<7, 4> <2, 8> <3, 1> <2, 2> <1, 5> <7, 1> <3, 8> <7, 2> 

and sorts them key while reducing values of pairs same key:

<1, 5> <2, 10> <3, 9> <7, 7> 

currently, i'm using __device__ function 1 below bitonic sort combine values of same key , set old data infinitely large value (just using 99 now) subsequent bitonic sort sift them bottom , array cut value of int * removed.

__device__ void interbitonicsortreduce(int2 *sdata, int tid, int recordnum, int *removed) {   int n = min(default_dimblock, recordnum);   (int k = 2; k <= n; k *= 2) {     (int j = k / 2; j > 0; j /= 2) {       int ixj = tid ^ j;       if (ixj > tid) {         if (sdata[tid].x == sdata[ixj].x && sdata[tid].x < 99) {           atomicadd(&sdata[tid].y, sdata[ixj].y);           sdata[ixj].x = 99;            sdata[ixj].y = 99;            atomicadd(removed, 1);          }            if ((tid & k) == 0 && sdata[tid].x > sdata[ixj].x)           swapdata2(sdata[tid], sdata[ixj]);         if ((tid & k) != 0 && sdata[tid].x < sdata[ixj].x)           swapdata2(sdata[tid], sdata[ixj]);         __syncthreads();       }        }      } } 

this works fine small sets of data larger sets (though still within size of single block) single call won't it.

is wise try combine sorting , reduction in same function? function need called more once possible determine how many times needs called exhaust data based on size?

or should preform reduction separately this:

__device__ int interreduce(int2 *sdata, int tid) {   int index = tid;   while (sdata[index].x == sdata[tid].x) {     index--;     if (index < 0)       break;   }   if (index+1 != tid) {     atomicadd(&sdata[index+1].y, sdata[tid].y);     sdata[tid].x = 99;     sdata[tid].y = 99;     return 1;   }   return 0; } 

i'm trying come efficient solution, experience cuda , parallel algorithms limited.

you can use thrust this.

use thrust::sort_by_key followed thrust::reduce_by_key

here's example:

#include <iostream> #include <thrust/device_vector.h> #include <thrust/copy.h> #include <thrust/sort.h> #include <thrust/reduce.h> #include <thrust/sequence.h>  #define n 12 typedef thrust::device_vector<int>::iterator dintiter; int main(){    thrust::device_vector<int> keys(n);   thrust::device_vector<int> values(n);   thrust::device_vector<int> new_keys(n);   thrust::device_vector<int> new_values(n);   thrust::sequence(keys.begin(), keys.end());   thrust::sequence(values.begin(), values.end());    keys[3] = 1;   keys[9] = 1;   keys[8] = 2;   keys[7] = 4;    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());   thrust::pair<dintiter, dintiter> new_end;   new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin());    std::cout << "results  values:" << std::endl;   thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>( std::cout, " "));   std::cout << std::endl << "results keys:" << std::endl;   thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>( std::cout, " "));   std::cout << std::endl;    return 0; } 

Comments