i'm trying write function takes block of unsorted key/value pairs such as
<7, 4> <2, 8> <3, 1> <2, 2> <1, 5> <7, 1> <3, 8> <7, 2>
and sorts them key while reducing values of pairs same key:
<1, 5> <2, 10> <3, 9> <7, 7>
currently, i'm using __device__
function 1 below bitonic sort combine values of same key , set old data infinitely large value (just using 99
now) subsequent bitonic sort sift them bottom , array cut value of int *
removed.
__device__ void interbitonicsortreduce(int2 *sdata, int tid, int recordnum, int *removed) { int n = min(default_dimblock, recordnum); (int k = 2; k <= n; k *= 2) { (int j = k / 2; j > 0; j /= 2) { int ixj = tid ^ j; if (ixj > tid) { if (sdata[tid].x == sdata[ixj].x && sdata[tid].x < 99) { atomicadd(&sdata[tid].y, sdata[ixj].y); sdata[ixj].x = 99; sdata[ixj].y = 99; atomicadd(removed, 1); } if ((tid & k) == 0 && sdata[tid].x > sdata[ixj].x) swapdata2(sdata[tid], sdata[ixj]); if ((tid & k) != 0 && sdata[tid].x < sdata[ixj].x) swapdata2(sdata[tid], sdata[ixj]); __syncthreads(); } } } }
this works fine small sets of data larger sets (though still within size of single block) single call won't it.
is wise try combine sorting , reduction in same function? function need called more once possible determine how many times needs called exhaust data based on size?
or should preform reduction separately this:
__device__ int interreduce(int2 *sdata, int tid) { int index = tid; while (sdata[index].x == sdata[tid].x) { index--; if (index < 0) break; } if (index+1 != tid) { atomicadd(&sdata[index+1].y, sdata[tid].y); sdata[tid].x = 99; sdata[tid].y = 99; return 1; } return 0; }
i'm trying come efficient solution, experience cuda , parallel algorithms limited.
you can use thrust this.
use thrust::sort_by_key followed thrust::reduce_by_key
here's example:
#include <iostream> #include <thrust/device_vector.h> #include <thrust/copy.h> #include <thrust/sort.h> #include <thrust/reduce.h> #include <thrust/sequence.h> #define n 12 typedef thrust::device_vector<int>::iterator dintiter; int main(){ thrust::device_vector<int> keys(n); thrust::device_vector<int> values(n); thrust::device_vector<int> new_keys(n); thrust::device_vector<int> new_values(n); thrust::sequence(keys.begin(), keys.end()); thrust::sequence(values.begin(), values.end()); keys[3] = 1; keys[9] = 1; keys[8] = 2; keys[7] = 4; thrust::sort_by_key(keys.begin(), keys.end(), values.begin()); thrust::pair<dintiter, dintiter> new_end; new_end = thrust::reduce_by_key(keys.begin(), keys.end(), values.begin(), new_keys.begin(), new_values.begin()); std::cout << "results values:" << std::endl; thrust::copy(new_values.begin(), new_end.second, std::ostream_iterator<int>( std::cout, " ")); std::cout << std::endl << "results keys:" << std::endl; thrust::copy(new_keys.begin(), new_end.first, std::ostream_iterator<int>( std::cout, " ")); std::cout << std::endl; return 0; }
Comments
Post a Comment