PyTorch CUDA
Use PyTorch's C++/CUDA extension utilities (torch.utils.cpp_extension) to compile and load a custom CUDA kernel at runtime.
import torch
from torch.utils.cpp_extension import load
# Fail fast with a clear message: every tensor below is allocated on the
# "cuda" device, so without a usable GPU the script cannot succeed and there
# is no point paying for the extension build first.
if not torch.cuda.is_available():
    raise RuntimeError(
        "A CUDA-capable device is required to build and run 'torch_cuda_ext'."
    )

# Compile the .cu file dynamically on script startup and load the result as
# a Python module.
cuda_module = load(
    name="torch_cuda_ext",
    sources=["extension.cu"],
    verbose=True,  # turn on verbose logging of the load steps
)

# Test tensors created directly on the GPU device. torch.full writes the
# constant in one step instead of allocating a tensor of ones and then
# multiplying it.
x = torch.full((1000,), 10.0, device="cuda")
y = torch.full((1000,), 20.0, device="cuda")

# Both inputs already live on the GPU, so the call needs no Host <-> Device
# data transfers.
# NOTE(review): vector_add is implemented in extension.cu (not visible here);
# presumably an element-wise sum -- confirm against the kernel source.
z = cuda_module.vector_add(x, y)
print(z)  # Outputs tensor of 30s on cuda:0