Created
September 23, 2021 18:08
-
-
Save deckar01/a0df85056f2635677187b011000d84ed to your computer and use it in GitHub Desktop.
[AZSPCS] Monochromatic Triangles: CUDA Simulated Annealing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <iomanip> | |
#include <math.h> | |
#include <curand.h> | |
#include <curand_kernel.h> | |
#define M (N * (N - 1) / 2) | |
#define index (blockDim.x * blockIdx.x + threadIdx.x) | |
#define offset (M * index) | |
#define roll(n) (curand(&rng[index]) % (n)) | |
#define TRI(x, y) ((y * (y - 1) / 2) + x) | |
#define TRI_O(x, y) (x < y ? TRI(x, y) : TRI(y, x)) | |
typedef uint8_t color; | |
__global__ | |
void init(int N, int *counts, color *edges, curandState_t *rng, time_t seed) | |
{ | |
// Randomly assign edge colors. | |
curand_init(seed, index, 0, &rng[index]); | |
for (int i = 0; i < M; i++) | |
edges[offset + i] = roll(3); | |
// Count the monochromatic triangles. | |
counts[index] = 0; | |
for (int x = 0; x < N; x++) { | |
for (int y = x + 1; y < N; y++) { | |
color a = edges[offset + TRI(x, y)]; | |
for (int z = y + 1; z < N; z++) { | |
color b = edges[offset + TRI(y, z)]; | |
color c = edges[offset + TRI(x, z)]; | |
if (a == b && b == c) counts[index]++; | |
} | |
} | |
} | |
} | |
__global__ | |
void search(int N, int LIMIT, int *counts, color *edges, curandState_t *rng) | |
{ | |
// Simulate annealing (gradient descent + random walk). | |
for (int t = 0; t < LIMIT; t++) { | |
// Select an edge randomly. | |
int x = roll(N); | |
int y = roll(N - 1); | |
if (y >= x) y++; | |
color a = edges[offset + TRI_O(x, y)]; | |
// Select a neighboring color randomly. | |
color neighbor = (a + (roll(2) + 1)) % 3; | |
// Compute the score with the new color. | |
int neighbor_score = counts[index]; | |
for (int z = 0; z < N; z++) { | |
if (z == x || z == y) continue; | |
// Get the adjacent edges. | |
color b = edges[offset + TRI_O(z, y)]; | |
color c = edges[offset + TRI_O(z, x)]; | |
// Check for an unaffected non-monochromatic triangle. | |
if (b != c) continue; | |
// Check for breaking a monochromatic triangle. | |
if (a == b) { | |
if (neighbor != b) neighbor_score--; | |
} | |
// Check for creating a monochromatic triangle. | |
else { | |
if (neighbor == b) neighbor_score++; | |
} | |
} | |
// Always keep improvements | |
if (neighbor_score < counts[index]) { | |
counts[index] = neighbor_score; | |
edges[offset + TRI_O(x, y)] = neighbor; | |
// OPTIMIZATION: Don't cache local minima. | |
// O(M) copies are expensive and mostly go unused. | |
// The final state will converge to the global minima. | |
} | |
// Keep non-improvements randomly. | |
else { | |
// Cooling schedule heuristic. | |
double T = 10 * (1.0f - (double)t / LIMIT); | |
double delta = counts[index] - neighbor_score; | |
double bar = curand_uniform_double(&rng[index]); | |
// Weighted by an exponential cooling schedule. | |
if (exp(delta / T) >= bar) { | |
counts[index] = neighbor_score; | |
edges[offset + TRI_O(x, y)] = neighbor; | |
} | |
} | |
} | |
} | |
int main(int argc, char *argv[]) | |
{ | |
// Graph size. | |
int N = atoi(argv[1]); | |
// Simulated annealing iteration limit heuristic. | |
int LIMIT = 220 * M; | |
// RTX 3090's CUDA core count. | |
int nodes = 10496; | |
// Optimal block size for this kernel for small N. | |
int block_size = 128; | |
int blocks = nodes / block_size; | |
// Print a summary of the current search. | |
cudaDeviceProp prop; | |
cudaGetDeviceProperties(&prop, 0); | |
std::cout << "N: " << N << std::endl; | |
std::cout << "Device: " << prop.name << std::endl; | |
std::cout << "Blocks: " << blocks << std::endl; | |
std::cout << "Threads per Block: " << block_size << std::endl; | |
std::cout << "Total Threads: " << nodes << std::endl << std::endl; | |
// Allocate memory. | |
int *counts; | |
color *edges; | |
curandState_t *rng; | |
// Memory that needs to be read by the host. | |
cudaMallocManaged(&counts, nodes * sizeof(int)); | |
cudaMallocManaged(&edges, nodes * M * sizeof(color)); | |
// Memory that only the device uses. | |
cudaMalloc(&rng, nodes * sizeof(curandState_t)); | |
// Setup GPU timers. | |
cudaEvent_t start, stop; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
// Randomize the starting graphs. | |
cudaEventRecord(start); | |
init <<<blocks, block_size >>> (N, counts, edges, rng, time(NULL)); | |
cudaEventRecord(stop); | |
cudaDeviceSynchronize(); | |
float t; | |
cudaEventElapsedTime(&t, start, stop); | |
float triangle_speed = (float)nodes * (M * (N - 2) / 2) / (t * 1e-3); | |
std::cout << std::scientific << std::setprecision(1); | |
std::cout << "Setup :: "; | |
std::cout << "Ts:" << triangle_speed << std::endl; | |
std::cout << "----" << std::endl << std::endl; | |
// Search until all monochromatic triangles are eliminated. | |
int minima = INT_MAX; | |
while (minima > 0) { | |
// Saturate the GPU with simulated annealing search threads. | |
cudaEventRecord(start); | |
search <<<blocks, block_size>>> (N, LIMIT, counts, edges, rng); | |
cudaEventRecord(stop); | |
cudaDeviceSynchronize(); | |
// Calculate the execution time and speed. | |
cudaEventElapsedTime(&t, start, stop); | |
triangle_speed = (float)nodes * LIMIT * N / (t * 1e-3); | |
// Find the best improvement in the batch. | |
int improvement = -1; | |
for (int b = 0; b < nodes; b++) { | |
if (counts[b] < minima) { | |
improvement = b; | |
minima = counts[b]; | |
} | |
} | |
// Overwrite the last monitor. | |
std::cout << " \r"; | |
// Report improvements after each batch. | |
if (improvement >= 0) { | |
for (int i = 1; i < N; i++) { | |
for (int j = 0; j < i; j++) | |
std::cout << (int)edges[improvement * M + TRI(j, i)]; | |
if (i < N - 1) std::cout << ","; | |
} | |
std::cout << std::endl << std::endl; | |
std::cout << minima << " :: "; | |
} | |
// Monitor the limit and processing speed. | |
std::cout << "L:" << (float)LIMIT << ", "; | |
std::cout << "T/s:" << triangle_speed << " "; | |
// Start a new monitor after a report. | |
if (improvement >= 0) { | |
std::cout << std::endl << "----" << std::endl << std::endl; | |
} | |
std::cout << std::flush; | |
// Slowly search deeper into the solution space. | |
LIMIT += (LIMIT * 0.05 + 1); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment