Created
November 6, 2021 08:55
-
-
Save supplient/cf657df60332a198992602f6452fd532 to your computer and use it in GitHub Desktop.
并行算法:反向scan,将片段偏移量转化为片段索引
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Windows.h> | |
// cuda_helpers.h is my own repo | |
// refer to https://github.com/supplient/CudaHelper | |
#define CUDA_HELPER_NAMESPACE cuh | |
#include <cuda_helpers.h> | |
#include <thrust/scan.h> | |
#include <thrust/execution_policy.h> | |
#include <thrust/fill.h> | |
#include <string> | |
#include <unordered_map> | |
#include <random> | |
#include <iostream> | |
#include <functional> | |
#include <vector> | |
#include <numeric> | |
using namespace std; | |
namespace kernel { | |
__global__ void UsingSeqFill(uint32_t* B, const uint32_t* A, size_t n) { | |
auto ti = LINEAR_THREAD_ID; | |
if (ti >= n) | |
return; | |
uint32_t end = A[ti]; | |
uint32_t begin = ti == 0 ? 0 : A[ti - 1]; | |
thrust::fill_n(thrust::seq, B + begin, end - begin, ti); | |
} | |
__global__ void UsingParFill(uint32_t* B, const uint32_t* A, size_t n) { | |
auto ti = LINEAR_THREAD_ID; | |
if (ti >= n) | |
return; | |
uint32_t end = A[ti]; | |
uint32_t begin = ti == 0 ? 0 : A[ti - 1]; | |
thrust::fill_n(thrust::device, B + begin, end - begin, ti); | |
} | |
} | |
size_t UsingSeqFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) { | |
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp); | |
kernel::UsingSeqFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n); | |
return cuh::GetEntry(d_tmp, n - 1); | |
} | |
size_t UsingParFill(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) { | |
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp); | |
kernel::UsingParFill KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n); | |
return cuh::GetEntry(d_tmp, n - 1); | |
} | |
namespace kernel { | |
__global__ void SetMark(uint32_t* B, const uint32_t* A, size_t n) { | |
auto ti = LINEAR_THREAD_ID; | |
if (ti >= n-1) | |
return; | |
B[A[ti]] = 1; | |
} | |
} | |
size_t LinearAlgo(uint32_t* d_B, const uint32_t* d_A, size_t n, uint32_t* d_tmp) { | |
thrust::inclusive_scan(thrust::device, d_A, d_A + n, d_tmp); | |
auto m = cuh::GetEntry(d_tmp, n - 1); | |
CheckCuda(cudaMemsetAsync(d_B, 0, sizeof(uint32_t) * m)); | |
kernel::SetMark KERNEL_ARGS_SIMPLE(n, 64) (d_B, d_tmp, n); | |
thrust::inclusive_scan(thrust::device, d_B, d_B + m, d_B); | |
return m; | |
} | |
typedef size_t(*WorkFunc)(uint32_t*, const uint32_t*, size_t, uint32_t*); | |
/// <summary> | |
/// Test function, including correctness testing. | |
/// </summary> | |
/// <param name="f">: Function to test.</param> | |
/// <param name="seed">: A seed for random.</param> | |
/// <returns>Time in ms.</returns> | |
double Benchmark(WorkFunc f, uint32_t seed) { | |
constexpr uint32_t minSegSize = 1; | |
constexpr uint32_t maxSegSize = 500; | |
constexpr size_t n = 1000; | |
constexpr size_t ITER_N = 10000; | |
default_random_engine generator(seed); | |
uniform_int_distribution<uint32_t> dist(minSegSize, maxSegSize); | |
auto rint = bind(dist, generator); | |
vector<uint32_t> A(n); { | |
for (size_t i = 0; i < n; i++) | |
A[i] = rint(); | |
} | |
uint32_t m = accumulate(A.begin(), A.end(), 0); | |
vector<uint32_t> B(m); { | |
size_t bi = 0; | |
for (uint32_t ai = 0; ai < n; ai++) { | |
uint32_t count = A[ai]; | |
do { | |
B[bi] = ai; | |
bi++; | |
count--; | |
} while (count > 0); | |
} | |
assert(bi == m); | |
} | |
vector<uint32_t> resB(m); | |
size_t resSize; | |
cuh::TempStorage<uint32_t> d_A; | |
cuh::TempStorage<uint32_t> d_B; | |
cuh::TempStorage<uint32_t> d_tmp; | |
d_A.CheckAndAlloc(n); | |
d_B.CheckAndAlloc(m); | |
d_tmp.CheckAndAlloc(n); | |
cuh::Copy<uint32_t>(d_A, A.data(), A.size()); | |
LARGE_INTEGER start, end; | |
QueryPerformanceCounter(&start); | |
for(size_t i=0; i<ITER_N; i++) | |
resSize = f(d_B, d_A, n, d_tmp); | |
QueryPerformanceCounter(&end); | |
cuh::Copy<uint32_t>(resB.data(), d_B, m); | |
assert(resSize == m); | |
for (size_t i = 0; i < resSize; i++) | |
assert(resB[i] == B[i]); | |
LARGE_INTEGER freq; | |
QueryPerformanceFrequency(&freq); | |
double delta = double(end.QuadPart - start.QuadPart) / double(freq.QuadPart * ITER_N) * 1000.0; | |
return delta; | |
} | |
int main() { | |
uint32_t seed = 0x46af87ecL; | |
// Warmup | |
cout << "Warm up...\n"; | |
Benchmark(UsingSeqFill, seed); | |
Benchmark(UsingParFill, seed); | |
Benchmark(LinearAlgo, seed); | |
cout << "Calculating...\n"; | |
unordered_map<string, double> algo2time; | |
algo2time["SeqFill" ] = Benchmark(UsingSeqFill, seed); | |
algo2time["ParFill" ] = Benchmark(UsingParFill, seed); | |
algo2time["Linear" ] = Benchmark(LinearAlgo, seed); | |
for (auto& pair : algo2time) { | |
printf("%s: %fms\n", pair.first.c_str(), pair.second); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output on RTX 2060, i7 10700