PyTorch CUDA

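Use PyTorch's C++/CUDA extension loader (`torch.utils.cpp_extension.load`) to compile a `.cu` source file at runtime and call it from Python.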

import torch
from torch.utils.cpp_extension import load
 
# JIT-build the CUDA source when the script starts; verbose=True echoes the
# build log so compilation problems are visible.
cuda_module = load(
    "torch_cuda_ext",
    ["extension.cu"],
    verbose=True,
)

# Allocate both operands directly on the GPU so no host-side staging tensor
# is ever created.
NUM_ELEMENTS = 1000
GPU = "cuda"
x = torch.full((NUM_ELEMENTS,), 10.0, device=GPU)
y = torch.full((NUM_ELEMENTS,), 20.0, device=GPU)

# Both inputs already live on the device, so the call itself is handed GPU
# tensors and needs no host <-> device copy of its arguments.
# NOTE(review): vector_add is defined in extension.cu (not shown here);
# presumably element-wise addition -- confirm against the kernel source.
z = cuda_module.vector_add(x, y)
print(z)  # expected: a tensor of 30s on the CUDA device, given the above