Codice
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include "energy_storms.h"
#define NUM_THREADS_PER_BLOCK 256
struct max_pos {
float max;
int pos;
};
__global__ void layerUpdateKernel(float* layer, int layer_size, int* d_posval, int num_impacts, float threshold) {
int myID = blockIdx.x * blockDim.x + threadIdx.x;
if (myID < layer_size) {
float total_energy_to_add = layer[myID]; // Start with the current energy at this position (non fotto con i float)
for (int j = 0; j < num_impacts; j++) {
int position = d_posval[j*2];
float energy = (float)d_posval[j*2+1] * 1000.0f / layer_size;
int distance = abs(position - myID) + 1;
float atenuacion = sqrtf((float)distance);
float energy_k = energy / atenuacion;
if (energy_k >= threshold || energy_k <= -threshold) {
total_energy_to_add += energy_k;
}
}
// Write to global memory ONLY ONCE at the end
layer[myID] = total_energy_to_add;
}
}
__global__ void layer_stancil_kernel(float* layer, float* layer_copy, int layer_size) {
int myID = blockIdx.x * blockDim.x + threadIdx.x;
if (myID > 0 && myID < layer_size - 1) {
layer[myID] = (layer_copy[myID - 1] + layer_copy[myID] + layer_copy[myID + 1]) / 3.0f;
}
}
__global__ void local_max_kernel(float* layer, struct max_pos* max_per_block, int layer_size) {
int myID = blockIdx.x * blockDim.x + threadIdx.x;
int local_thread_id = threadIdx.x;
int block_size = blockDim.x;
__shared__ struct max_pos local_max[NUM_THREADS_PER_BLOCK]; // Assuming blockDim.x <= 256
local_max[local_thread_id].max = -INFINITY; // Initialize to negative infinity
local_max[local_thread_id].pos = -1; // Initialize to invalid position
if (myID < layer_size) {
if (layer[myID] > local_max[local_thread_id].max) {
local_max[local_thread_id].max = layer[myID];
local_max[local_thread_id].pos = myID;
}
}
__syncthreads();
// Reduction to find the maximum in the block
for (int offset = block_size / 2; offset > 0; offset /= 2) {
if (local_thread_id < offset) {
if (local_max[local_thread_id + offset].max > local_max[local_thread_id].max) {
local_max[local_thread_id] = local_max[local_thread_id + offset];
}
}
__syncthreads();
}
// The first thread in the block writes the block's maximum to global memory
if (local_thread_id == 0) {
max_per_block[blockIdx.x] = local_max[0];
}
}
__global__ void max_reduction_kernel(struct max_pos* data, int size) {
int myID = blockIdx.x * blockDim.x + threadIdx.x;
int local_thread_id = threadIdx.x;
int block_size = blockDim.x;
__shared__ struct max_pos local_max[NUM_THREADS_PER_BLOCK];
local_max[local_thread_id].max = -INFINITY;
local_max[local_thread_id].pos = -1;
if (myID < size) {
local_max[local_thread_id] = data[myID];
}
__syncthreads();
for (int offset = block_size / 2; offset > 0; offset /= 2) {
if (local_thread_id < offset && (myID + offset) < size) {
if (local_max[local_thread_id + offset].max > local_max[local_thread_id].max) {
local_max[local_thread_id] = local_max[local_thread_id + offset];
}
}
__syncthreads();
}
if (local_thread_id == 0) {
data[blockIdx.x] = local_max[0];
}
}
void core(int layer_size, int num_storms, Storm *storms, float *maximum, int *positions) {
float threshold = THRESHOLD / layer_size;
// - DEFINE DIMENSIONS
int threadsPerBlock = NUM_THREADS_PER_BLOCK;
int blocksPerGrid = (layer_size + threadsPerBlock - 1) / threadsPerBlock; // Calculate blocks needed (ceil division)
// - ALLOCATE HOST LAYER
float *layer;
cudaError_t err0 = cudaMallocHost((void**)&layer, layer_size * sizeof(float));
if (err0 != cudaSuccess) {
printf("CUDA Host Memory allocation error\n");
return;
}
memset(layer, 0, layer_size * sizeof(float));
// - ALLOCATE DEVICE LAYER
float *d_layer, *d_layer_copy;
cudaError_t err1 = cudaMalloc((void**)&d_layer, sizeof(float) * layer_size);
cudaError_t err2 = cudaMalloc((void**)&d_layer_copy, sizeof(float) * layer_size);
if (err1 != cudaSuccess) {
printf("CUDA Error allocating device layer: %s\n", cudaGetErrorString(err1));
return;
}
if (err2 != cudaSuccess) {
printf("CUDA Error allocating device layer copy: %s\n", cudaGetErrorString(err2));
return;
}
// - POSVAL ARRAY ALLOCATION ON DEVICE
int max_storm_size = 0;
for(int i=0; i<num_storms; i++) {
if(storms[i].size > max_storm_size) max_storm_size = storms[i].size;
}
int *d_posval;
cudaError_t err3 = cudaMalloc((void**)&d_posval, max_storm_size * 2 * sizeof(int)); // * 2 sice each particle has position and energy
if (err3 != cudaSuccess) {
printf("CUDA Error allocating device particles array: %s\n", cudaGetErrorString(err3));
return;
}
// - Allocate array for block local maxima
struct max_pos *d_block_local_max;
cudaError_t err4 = cudaMalloc((void**)&d_block_local_max, sizeof(struct max_pos) * blocksPerGrid);
if (err4 != cudaSuccess) {
printf("CUDA Error allocating device block local max array: %s\n", cudaGetErrorString(err4));
return;
}
// ------------------------------------------------------------------------------------------
for( int i=0; i<num_storms; i++) {
// - Move the particles of the current storm to device
cudaMemcpy(d_posval, storms[i].posval, sizeof(int) * storms[i].size * 2, cudaMemcpyHostToDevice);
// - Update Layer
layerUpdateKernel<<<blocksPerGrid, threadsPerBlock>>>(d_layer, layer_size, d_posval, storms[i].size, threshold);
// Copy layer to layer_copy for the next step
cudaMemcpy(d_layer_copy, d_layer, sizeof(float) * layer_size, cudaMemcpyDeviceToDevice);
// - Apply stancil (smoothing - 3 points average)
layer_stancil_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_layer, d_layer_copy, layer_size);
// - FIND MAXIMUM VALUE AND POSITION
int blocks = blocksPerGrid;
int blocks_prev = blocks;
local_max_kernel<<<blocks, threadsPerBlock>>>(d_layer, d_block_local_max, layer_size);
while (blocks > 1) {
blocks_prev = blocks;
blocks = (blocks_prev + threadsPerBlock - 1) / threadsPerBlock;
max_reduction_kernel<<<blocks, threadsPerBlock>>>(d_block_local_max, blocks_prev);
}
// Copy the final result (single element) back to host
struct max_pos global_max;
cudaMemcpy(&global_max, d_block_local_max, sizeof(struct max_pos), cudaMemcpyDeviceToHost);
maximum[i] = global_max.max;
positions[i] = global_max.pos;
}
// Cleanup
cudaFree(d_layer);
cudaFree(d_layer_copy);
cudaFree(d_posval);
cudaFree(d_block_local_max);
cudaFreeHost(layer);
}Modifiche Fatte
Fatto tutta la ricerca del massimo dentro la GPU, cosi da ridurre il peso della memoria da mandare dalla gpu alla cpu, prima (nella versione 4) era un 256esimo del layer (, dato che abbiamo un float e un int) ora è un solo elemento (un float e un int).
Questo però non ha cambiato i tempi anche sui test con layer molto grandi.
Tempi
TEST1 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_01_a35_p8_w1 test_files/test_01_a35_p7_w2 test_files/test_01_a35_p5_w3 test_files/test_01_a35_p8_w4
Time: 0.004673
Result: 14 19098.357422 14 24859.306641 10 30043.429688 3 54815.644531srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_01_a35_p8_w1 test_files/test_01_a35_p7_w2 test_files/test_01_a35_p5_w3 test_files/test_01_a35_p8_w4
Time: 0.004602
Result: 14 19098.357422 14 24859.306641 10 30043.429688 3 54815.644531srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_01_a35_p8_w1 test_files/test_01_a35_p7_w2 test_files/test_01_a35_p5_w3 test_files/test_01_a35_p8_w4
Time: 0.004437
Result: 14 19098.357422 14 24859.306641 10 30043.429688 3 54815.644531TEST2 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20000 test_files/test_02_a30k_p20k_w1 test_files/test_02_a30k_p20k_w2 test_files/test_02_a30k_p20k_w3 test_files/test_02_a30k_p20k_w4 test_files/test_02_a30k_p20k_w5 test_files/test_02_a30k_p20k_w6
Time: 0.036671
Result: 17197 1654.755371 17223 3274.642578 17229 5727.363281 17222 8155.511230 15849 11403.597656 15924 14647.188477srun -N 1 -n 1 ./energy_storms_cuda 20000 test_files/test_02_a30k_p20k_w1 test_files/test_02_a30k_p20k_w2 test_files/test_02_a30k_p20k_w3 test_files/test_02_a30k_p20k_w4 test_files/test_02_a30k_p20k_w5 test_files/test_02_a30k_p20k_w6
Time: 0.036766
Result: 17197 1654.755371 17223 3274.642578 17229 5727.363281 17222 8155.511230 15849 11403.597656 15924 14647.188477srun -N 1 -n 1 ./energy_storms_cuda 20000 test_files/test_02_a30k_p20k_w1 test_files/test_02_a30k_p20k_w2 test_files/test_02_a30k_p20k_w3 test_files/test_02_a30k_p20k_w4 test_files/test_02_a30k_p20k_w5 test_files/test_02_a30k_p20k_w6
Time: 0.037841
Result: 17197 1654.755371 17223 3274.642578 17229 5727.363281 17222 8155.511230 15849 11403.597656 15924 14647.188477TEST3 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_03_a20_p4_w1
Time: 0.004469
Result: 10 12966.631836srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_03_a20_p4_w1
Time: 0.004449
Result: 10 12966.631836srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_03_a20_p4_w1
Time: 0.004406
Result: 10 12966.631836TEST4 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_04_a20_p4_w1
Time: 0.004469
Result: 16 12966.632812srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_04_a20_p4_w1
Time: 0.004458
Result: 16 12966.632812srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_04_a20_p4_w1
Time: 0.004375
Result: 16 12966.632812TEST5 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_05_a20_p4_w1
Time: 0.004414
Result: 9 12829.288086srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_05_a20_p4_w1
Time: 0.004328
Result: 9 12829.288086srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_05_a20_p4_w1
Time: 0.004309
Result: 9 12829.288086TEST6 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_06_a20_p4_w1
Time: 0.004440
Result: 16 12829.288086srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_06_a20_p4_w1
Time: 0.004427
Result: 16 12829.288086srun -N 1 -n 1 ./energy_storms_cuda 20 test_files/test_06_a20_p4_w1
Time: 0.004438
Result: 16 12829.288086TEST7 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 1000000 test_files/test_07_a1M_p5k_w1 test_files/test_07_a1M_p5k_w2 test_files/test_07_a1M_p5k_w3 test_files/test_07_a1M_p5k_w4
Time: 0.107805
Result: 507805 12177.071289 400548 23157.615234 511786 34230.769531 511786 44824.339844srun -N 1 -n 1 ./energy_storms_cuda 1000000 test_files/test_07_a1M_p5k_w1 test_files/test_07_a1M_p5k_w2 test_files/test_07_a1M_p5k_w3 test_files/test_07_a1M_p5k_w4
Time: 0.087635
Result: 507805 12177.071289 400548 23157.615234 511786 34230.769531 511786 44824.339844srun -N 1 -n 1 ./energy_storms_cuda 1000000 test_files/test_07_a1M_p5k_w1 test_files/test_07_a1M_p5k_w2 test_files/test_07_a1M_p5k_w3 test_files/test_07_a1M_p5k_w4
Time: 0.088140
Result: 507805 12177.071289 400548 23157.615234 511786 34230.769531 511786 44824.339844TEST8 Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 100000000 test_files/test_08_a100M_p1_w1 test_files/test_08_a100M_p1_w2 test_files/test_08_a100M_p1_w3
Time: 0.700265
Result: 10 0.000080 658423 0.000081 658423 0.000078srun -N 1 -n 1 ./energy_storms_cuda 100000000 test_files/test_08_a100M_p1_w1 test_files/test_08_a100M_p1_w2 test_files/test_08_a100M_p1_w3
Time: 0.697439
Result: 10 0.000080 658423 0.000081 658423 0.000078srun -N 1 -n 1 ./energy_storms_cuda 100000000 test_files/test_08_a100M_p1_w1 test_files/test_08_a100M_p1_w2 test_files/test_08_a100M_p1_w3
Time: 0.696588
Result: 10 0.000080 658423 0.000081 658423 0.000078TEST9 (16) Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 16 test_files/test_09_a16-17_p3_w1
Time: 0.004445
Result: 15 241.892746TEST9 (17) Simulation:
srun -N 1 -n 1 ./energy_storms_cuda 17 test_files/test_09_a16-17_p3_w1
Time: 0.004492
Result: 15 193.299606