// Used for my ECE459 assignment.

#include <math.h>
#include <stdio.h>
 
 
// Convolution kernel: each thread produces one element of one filter's output
// map.
//
// Launch layout: x/y thread coordinates select the output column/row and
// blockIdx.z selects the filter. Threads whose row or column falls outside
// conv_out_dim exit without writing.
//
//   input        - row-major square image, input_dim elements per row
//   conv_filters - filters stored back to back, filter_dim * filter_dim each
//   outputs      - indexed as [filter][row][col] with conv_out_dim per side
//
// The window for output (row, col) starts at (row * filter_dim,
// col * filter_dim), i.e. consecutive windows do not overlap. Neither the
// filter index nor the input reads are range-checked here, so the caller's
// grid and dimensions must be consistent with the buffer sizes.
extern "C" __global__ void conv(const double* input, const double* conv_filters,
                                double* outputs, int input_dim, int filter_dim,
                                int conv_out_dim) {
  const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
  const int out_y = blockIdx.y * blockDim.y + threadIdx.y;
  const int f = blockIdx.z;

  const bool inside = out_y < conv_out_dim && out_x < conv_out_dim;
  if (inside) {
    // Top-left corner of this thread's input window.
    const int top = out_y * filter_dim;
    const int left = out_x * filter_dim;

    double acc = 0.0;
    for (int dy = 0; dy < filter_dim; ++dy) {
      // Offsets of the current filter row and the matching input row.
      const int w_row = (f * filter_dim + dy) * filter_dim;
      const int in_row_off = (top + dy) * input_dim;
      for (int dx = 0; dx < filter_dim; ++dx) {
        acc += conv_filters[w_row + dx] * input[in_row_off + left + dx];
      }
    }

    outputs[(f * conv_out_dim + out_y) * conv_out_dim + out_x] = acc;
  }
}
 
// In-place ReLU: replaces every element of `input` with fmax(0.0, element).
//
// Launch layout: x/y thread coordinates select the column/row inside a layer
// and blockIdx.z selects the layer. `input` is indexed as [layer][row][col]
// with conv_out_dim elements per side; threads outside
// conv_out_dim x conv_out_dim x conv_layer_size do nothing.
extern "C" __global__ void relu(double* input, int conv_out_dim,
                                int conv_layer_size) {
  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;
  const int z = blockIdx.z;

  const bool in_range =
      x < conv_out_dim && y < conv_out_dim && z < conv_layer_size;
  if (!in_range) return;

  double* cell = input + ((z * conv_out_dim + y) * conv_out_dim + x);
  *cell = fmax(0.0, *cell);
}
 
// Output layer: computes one dot product per block,
//   output[neuron] = sum_i input[i] * weights[neuron * out_neuron_dim + i].
//
// Launch layout: one block per output neuron (blockIdx.x), 1-D block of any
// size. Uses 256 doubles of static shared memory. At most 256 threads per
// block contribute; any additional threads only participate in the barriers,
// so larger blocks are safe but gain nothing. The block size does not need to
// be a power of two.
//
//   input          - out_neuron_dim values shared by every neuron
//   weights        - row-major, one row of out_neuron_dim weights per neuron
//   output         - one result per block, written by thread 0
//   out_neuron_dim - length of the dot product
extern "C" __global__ void output(const double* input, const double* weights,
                                  double* output, int out_neuron_dim) {
  const int kMaxThreads = 256;  // capacity of the shared scratch array
  __shared__ double partial[kMaxThreads];

  const int neuron = blockIdx.x;
  const int tid = threadIdx.x;

  // Number of threads that own a slot in `partial`. Clamping keeps every
  // shared-memory access in bounds when the block is larger than the array.
  const int workers = blockDim.x < static_cast<unsigned int>(kMaxThreads)
                          ? static_cast<int>(blockDim.x)
                          : kMaxThreads;

  if (tid < workers) {
    // Each worker accumulates a strided slice of the dot product.
    double sum = 0.0;
    for (int i = tid; i < out_neuron_dim; i += workers) {
      // Widen before multiplying so large weight matrices do not overflow int.
      sum += input[i] *
             weights[static_cast<size_t>(neuron) * out_neuron_dim + i];
    }
    partial[tid] = sum;
  }
  __syncthreads();

  // Tree reduction over `workers` slots. Rounding the half up keeps the middle
  // element alive when the live count is odd, so no partial sum is dropped
  // for non-power-of-two block sizes. `workers` is uniform across the block,
  // so every thread executes the same number of barriers.
  for (int live = workers; live > 1;) {
    const int half = (live + 1) / 2;
    if (tid < live - half) {
      partial[tid] += partial[tid + half];
    }
    __syncthreads();
    live = half;
  }

  if (tid == 0) {
    output[neuron] = partial[0];
  }
}