
feat: sum two vecs in CUDA

master · Ramon Calvo · 1 year ago · commit 8e0248b2ec
7 changed files, 163 insertions(+):

  1. .gitignore (+4)
  2. Cargo.toml (+7)
  3. README.md (+22)
  4. build.rs (+30)
  5. pyproject.toml (+15)
  6. src/kernel.py (+33)
  7. src/main.rs (+52)

.gitignore (+4)

@@ -0,0 +1,4 @@
target
.venv
uv.lock
Cargo.lock

Cargo.toml (+7)

@@ -0,0 +1,7 @@
[package]
name = "rust-triton"
version = "0.1.0"
edition = "2021"

[dependencies]
cust = "0.3.2"

README.md (+22)

@@ -0,0 +1,22 @@
# Installation

You need to install `triton` and `torch` on your system, or create a venv with:

```bash
# With uv
uv sync

# Or manually
python -m venv .venv
source .venv/bin/activate
pip install build
pip install -e .
```

Then install the Rust nightly toolchain and build:

```bash
rustup toolchain install nightly
cargo run
```

The Cargo build script (`build.rs`) takes care of compiling the kernel to PTX with Python.
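For orientation, the core of that build script (shown in full below) reduces to a `Command` invocation plus a rerun directive. This condensed sketch assumes `python` is on `PATH` and omits the `.venv` detection that the real `build.rs` performs:

```rust
// Condensed sketch of build.rs (the full version below handles .venv detection).
use std::process::Command;

fn main() {
    // Generate add_kernel.ptx by running the Triton script.
    let status = Command::new("python")
        .arg("src/kernel.py")
        .status()
        .expect("Failed to execute Python script");
    assert!(status.success(), "CUDA kernel compilation failed");

    // Recompile the PTX whenever the kernel source changes.
    println!("cargo:rerun-if-changed=src/kernel.py");
}
```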

build.rs (+30)

@@ -0,0 +1,30 @@
use std::path::Path;
use std::process::Command;

fn main() {
    // Path to your Python script
    let python_script = "src/kernel.py";

    let python_executable = if Path::new(".venv").exists() {
        if cfg!(windows) {
            ".venv\\Scripts\\python.exe"
        } else {
            ".venv/bin/python"
        }
    } else {
        "python"
    };

    // Run the Python script
    let status = Command::new(python_executable)
        .arg(python_script)
        .status()
        .expect("Failed to execute Python script");

    if !status.success() {
        panic!("CUDA kernel compilation failed");
    }

    // Rerun the PTX generation script if it has changed
    println!("cargo:rerun-if-changed=src/kernel.py");
}
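A possible refinement, not part of this commit: `status()` lets the Python script's stderr interleave with cargo's own output, where it can scroll past unnoticed. A minimal sketch that instead captures the output and replays it through cargo's warning channel, assuming the same `python_executable` and `python_script` values as above:

```rust
// Sketch only: surface the Triton script's stderr as cargo build warnings.
use std::process::Command;

fn compile_kernel(python_executable: &str, python_script: &str) {
    let output = Command::new(python_executable)
        .arg(python_script)
        .output()
        .expect("Failed to execute Python script");

    if !output.status.success() {
        // Cargo prints lines prefixed with `cargo:warning=` in the build log.
        for line in String::from_utf8_lossy(&output.stderr).lines() {
            println!("cargo:warning={line}");
        }
        panic!("CUDA kernel compilation failed");
    }
}
```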

pyproject.toml (+15)

@@ -0,0 +1,15 @@
[project]
name = "rust-triton"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "triton>=3.0.0",
    "setuptools>=75.1.0",
    "torch>=2.4.1",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

src/kernel.py (+33)

@@ -0,0 +1,33 @@
import triton
import triton.language as tl
import torch


@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance adds one BLOCK_SIZE-wide chunk of the vectors.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements  # guard the tail when n_elements % BLOCK_SIZE != 0
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    tl.store(output_ptr + offsets, output, mask=mask)


# Dummy float32 tensors: the call below only serves to trigger JIT compilation.
N_ELEMENTS = 1024
x = torch.zeros(N_ELEMENTS).cuda()
y = torch.zeros(N_ELEMENTS).cuda()
output = torch.zeros(N_ELEMENTS).cuda()


def grid(meta):
    return (triton.cdiv(N_ELEMENTS, meta["BLOCK_SIZE"]),)


add_kernel[grid](x, y, output, N_ELEMENTS, BLOCK_SIZE=256)

# Dump the generated PTX. NOTE: `JITFunction.cache` is a Triton-internal
# structure (device index -> compiled kernels); its layout may change
# between versions.
with open("add_kernel.ptx", "w") as a:
    print(list(add_kernel.cache[0].values())[0].asm["ptx"], file=a)

src/main.rs (+52)

@@ -0,0 +1,52 @@
use cust::prelude::*;
use std::error::Error;

const SIZE: usize = 4096;
static PTX: &str = include_str!("../add_kernel.ptx");

fn run() -> Result<(), Box<dyn Error>> {
    let _ctx = cust::quick_init().expect("Could not create CUDA context");

    let x: [f32; SIZE] = std::array::from_fn(|i| i as f32 + 1.);
    let y: [f32; SIZE] = std::array::from_fn(|i| i as f32 + 1.);
    let o: [f32; SIZE] = [0.0; SIZE];

    let module = Module::from_ptx(PTX, &[]).expect("Could not create module from PTX");
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // Copy the host arrays into device buffers.
    let x_d = x.as_slice().as_dbuf()?;
    let y_d = y.as_slice().as_dbuf()?;
    let o_d = o.as_slice().as_dbuf()?;

    let func = module
        .get_function("add_kernel")
        .expect("could not find the kernel!");

    // Triton compiles with num_warps = 4 by default, i.e. 128 threads per
    // block. BLOCK_SIZE = 256 is the number of *elements* per program, so
    // this grid launches more programs than strictly needed; the mask in
    // the kernel discards the out-of-range offsets.
    let block_size = cust::function::BlockSize { x: 128, y: 1, z: 1 };
    let grid_size = cust::function::GridSize {
        x: SIZE as u32 / block_size.x,
        y: 1,
        z: 1,
    };

    unsafe {
        // 9216 bytes of dynamic shared memory, presumably taken from the
        // Triton-compiled kernel's metadata.
        launch!(func<<<grid_size, block_size, 9216, stream>>>(
            x_d.as_device_ptr(),
            y_d.as_device_ptr(),
            o_d.as_device_ptr(),
            SIZE as i32,
        ))?;
    }

    stream.synchronize().expect("failed to sync");

    // Copy the result back to the host and show the first few elements.
    let o = o_d.as_slice().as_host_vec()?;
    println!("o: {:?}", &o[..20]);

    Ok(())
}

fn main() {
    run().expect("something went wrong");
}
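Since both inputs are initialized to `i + 1`, each output element should equal `2 * (i + 1)`, and every value involved is exactly representable in `f32`. A small hedged check that could go before `Ok(())` in `run`, reusing the `o` vector from above:

```rust
// Sketch of a sanity check: x[i] = y[i] = i + 1, so o[i] must be 2 * (i + 1).
for (i, v) in o.iter().enumerate() {
    assert_eq!(*v, 2.0 * (i as f32 + 1.0), "mismatch at index {i}");
}
println!("all {} elements verified", SIZE);
```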