// CUDA kernels used for my ECE459 assignment.
#include <math.h>
#include <stdio.h>
// Strided 2D convolution; each thread produces one output element.
//
// Thread mapping: the x index selects the output column, the y index the
// output row, and blockIdx.z the filter. Threads whose row or column falls
// outside conv_out_dim do nothing.
//
// input        : read with a row stride of input_dim elements.
// conv_filters : filters stored back to back, each filter_dim * filter_dim
//                values in row-major order.
// outputs      : written as [filter][row][col], conv_out_dim * conv_out_dim
//                values per filter.
//
// The patch for output (row, col) starts at input element
// (row * filter_dim, col * filter_dim), so neighbouring patches do not
// overlap.
//
// NOTE(review): blockIdx.z is not range-checked here; presumably the launch
// uses gridDim.z equal to the number of filters -- confirm against the caller.
extern "C" __global__ void conv(const double* input, const double* conv_filters,
                                double* outputs, int input_dim, int filter_dim,
                                int conv_out_dim) {
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
    const int f = blockIdx.z;

    if (out_y < conv_out_dim && out_x < conv_out_dim) {
        // Top-left corner of the input patch covered by this output element.
        const int patch_row0 = out_y * filter_dim;
        const int patch_col0 = out_x * filter_dim;

        double acc = 0.0;
        for (int dy = 0; dy < filter_dim; ++dy) {
            // Row dy of the selected filter and the matching input row segment.
            const double* w_row = conv_filters + (f * filter_dim + dy) * filter_dim;
            const double* in_row = input + (patch_row0 + dy) * input_dim + patch_col0;
            for (int dx = 0; dx < filter_dim; ++dx) {
                acc += w_row[dx] * in_row[dx];
            }
        }

        outputs[(f * conv_out_dim + out_y) * conv_out_dim + out_x] = acc;
    }
}
// In-place ReLU: replaces each element with fmax(0.0, element).
//
// Thread mapping: the x index selects the column, the y index the row and
// blockIdx.z the layer. The buffer is addressed as [layer][row][col] with
// conv_out_dim * conv_out_dim elements per layer. Threads whose column, row
// or layer is out of range leave the buffer untouched.
extern "C" __global__ void relu(double* input, int conv_out_dim,
                                int conv_layer_size) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int z = blockIdx.z;

    const bool in_range =
        x < conv_out_dim && y < conv_out_dim && z < conv_layer_size;
    if (in_range) {
        double* cell = input + (z * conv_out_dim + y) * conv_out_dim + x;
        *cell = fmax(0.0, *cell);
    }
}
// Output layer: each block computes one dot product,
//   output[blockIdx.x] = sum over i of input[i] * weights[blockIdx.x * out_neuron_dim + i]
//
// Launch layout: one block per output neuron (blockIdx.x), 1D blocks
// (only threadIdx.x is used). Any block size is accepted:
//   * at most 256 threads take part, because the shared scratch buffer holds
//     256 partial sums; extra threads only participate in the barriers;
//   * the block size does not have to be a power of two.
//
// input          : out_neuron_dim values shared by all neurons.
// weights        : one row of out_neuron_dim values per neuron.
// output         : one value written per block, by thread 0.
// out_neuron_dim : length of the dot product; a value <= 0 yields 0.0.
//
// NOTE(review): blockIdx.x is not range-checked; presumably gridDim.x equals
// the number of output neurons -- confirm against the caller.
extern "C" __global__ void output(const double* input, const double* weights,
                                  double* output, int out_neuron_dim) {
    const int kMaxPartials = 256;
    __shared__ double partial[kMaxPartials];

    const int neuron = blockIdx.x;
    const int tid = threadIdx.x;

    // Number of threads that accumulate and own a slot in partial[]. Capping
    // at the buffer size keeps larger blocks from writing past the array.
    const int block_threads = (int)blockDim.x;
    const int active = block_threads < kMaxPartials ? block_threads : kMaxPartials;

    if (tid < active) {
        // 64-bit row offset so neuron * out_neuron_dim cannot overflow int.
        const double* w_row = weights + (long long)neuron * out_neuron_dim;
        double sum = 0.0;
        for (int i = tid; i < out_neuron_dim; i += active) {
            sum += input[i] * w_row[i];
        }
        partial[tid] = sum;
    }
    __syncthreads();

    // Pairwise reduction that also works when the number of live slots is
    // odd: slot t absorbs slot t + ceil(count / 2), and the middle slot (if
    // any) is carried into the next round unchanged. For power-of-two counts
    // this pairs exactly the same slots as a plain halving reduction.
    // The loop bounds depend only on `active`, so every thread in the block
    // reaches each barrier.
    for (int count = active; count > 1;) {
        const int half = (count + 1) / 2;
        if (tid < count / 2) {
            partial[tid] += partial[tid + half];
        }
        __syncthreads();
        count = half;
    }

    if (tid == 0) {
        output[neuron] = partial[0];
    }
}