Skip to content

Instantly share code, notes, and snippets.

@msaroufim
Last active March 21, 2025 00:31
Show Gist options
  • Save msaroufim/079a8d08ffebd0f91a1c2247eb0ce9e0 to your computer and use it in GitHub Desktop.
"""
Minimal example that:
- Only includes <ATen/core/Tensor.h> (for at::Tensor)
and <ATen/Functions.h> (for at::empty).
- Avoids <torch/extension.h> or <torch/types.h>.
- Uses <torch/csrc/utils/pybind.h> so PyBind can cast torch.Tensor <-> at::Tensor.
- Demonstrates a custom CUDA kernel that adds x + y + 1.
- Uses no_implicit_headers=True to reduce compile overhead.
"""
import os
import shutil
from datetime import datetime
import torch
import torch.utils.cpp_extension
# If you have a custom CUDA include path, set it here:
cuda_include_dir = os.path.join(os.environ["HOME"], ".conda/envs/pt/targets/x86_64-linux/include")
BUILD_DIR = os.path.join(os.getcwd(), "minimal_tensor_build")
if os.path.exists(BUILD_DIR):
print(f"Removing build directory: {BUILD_DIR}")
shutil.rmtree(BUILD_DIR)
os.makedirs(BUILD_DIR, exist_ok=True)
print(f"Created build directory: {BUILD_DIR}")
# --------------------------------------------------------------------------
# C++ source: minimal includes
# --------------------------------------------------------------------------
# Host-side binding code, compiled verbatim by load_inline() below. It only
# validates inputs (CUDA, float32, same shape, contiguous), allocates the
# output with at::empty, sets the device guard, and forwards raw float
# pointers to the CUDA launcher declared here and defined in cuda_source.
# The string's contents are runtime data and are intentionally untouched.
cpp_source = r"""
#include <ATen/core/Tensor.h> // at::Tensor
#include <ATen/Functions.h> // at::empty(...) and other creation ops
#include <c10/cuda/CUDAGuard.h> // at::cuda::CUDAGuard
#include <pybind11/pybind11.h> // pybind11
#include <torch/csrc/utils/pybind.h> // Allows torch.Tensor <-> at::Tensor casting
// Forward-declare our CUDA kernel launcher
void launch_add_kernel(const float* x_data,
const float* y_data,
float* out_data,
int64_t num_elements);
// Simple function: x + y + 1
at::Tensor tensor_add_cpp(const at::Tensor& x, const at::Tensor& y) {
// Basic checks
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(y.is_cuda(), "y must be a CUDA tensor");
TORCH_CHECK(x.scalar_type() == at::ScalarType::Float, "x must be float32");
TORCH_CHECK(y.scalar_type() == at::ScalarType::Float, "y must be float32");
TORCH_CHECK(x.sizes() == y.sizes(), "x and y must have the same shape");
TORCH_CHECK(x.is_contiguous() && y.is_contiguous(), "x and y must be contiguous tensors");
// Create output on the same device & dtype as x
auto out = at::empty(x.sizes(), x.options());
// Ensure we're on the correct device and call our kernel
at::cuda::CUDAGuard device_guard(x.device());
launch_add_kernel(x.data_ptr<float>(),
y.data_ptr<float>(),
out.data_ptr<float>(),
x.numel());
return out;
}
// pybind11 module definition
namespace py = pybind11;
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("tensor_add_cpp",
&tensor_add_cpp,
"Add x + y + 1 (float32 CUDA).");
}
"""
# --------------------------------------------------------------------------
# CUDA source
# --------------------------------------------------------------------------
# Device-side code: a one-thread-per-element kernel computing out = x + y + 1
# (guarded by `idx < size`), plus a host launcher using 256-thread blocks.
# NOTE(review): the launcher does no cudaGetLastError()/synchronize after the
# launch and uses the default stream rather than PyTorch's current stream —
# launch failures surface only on a later CUDA call. Left as-is since the
# string's contents are runtime data compiled by load_inline().
cuda_source = r"""
#include <cuda_runtime.h>
__global__ void add_kernel(const float* x, const float* y, float* out, int64_t size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
out[idx] = x[idx] + y[idx] + 1.0f;
}
}
void launch_add_kernel(const float* x_data,
const float* y_data,
float* out_data,
int64_t num_elements) {
const int threads = 256;
const int blocks = (num_elements + threads - 1) / threads;
add_kernel<<<blocks, threads>>>(x_data, y_data, out_data, num_elements);
}
"""
def main():
    """Build the inline extension, then verify tensor_add_cpp(x, y) == x + y + 1."""
    # Check for a GPU *before* paying for the build: the extension is compiled
    # against CUDA and exercised on CUDA tensors, so without a GPU the build
    # time (tens of seconds, per the logs) would be wasted.
    if not torch.cuda.is_available():
        print("CUDA is not available. Exiting.")
        return

    # Build the extension with minimal includes.
    start_time = datetime.now()
    module = torch.utils.cpp_extension.load_inline(
        name="minimal_tensor_extension",
        cpp_sources=cpp_source,
        cuda_sources=cuda_source,
        verbose=True,
        # True matches both the inline intent ("avoid heavy auto-includes")
        # and the module docstring; the C++ source above already includes
        # everything it needs explicitly. (Was False, contradicting both.)
        no_implicit_headers=True,
        extra_include_paths=[cuda_include_dir],
        build_directory=BUILD_DIR,
        # NOTE(review): hard-codes sm_80 (Ampere); adjust for other GPU archs.
        extra_cuda_cflags=["-arch=sm_80"],
    )
    total_time = datetime.now() - start_time
    print(f"\nExtension compiled in {total_time}!\n")

    # Create small test tensors on the GPU.
    x = torch.randn(10, device="cuda", dtype=torch.float32)
    y = torch.randn(10, device="cuda", dtype=torch.float32)

    # Call our custom function; the kernel computes x + y + 1.
    result = module.tensor_add_cpp(x, y)
    expected = x + y + 1.0

    max_diff = (result - expected).abs().max()
    print(f"Max difference: {max_diff.item()}")
    if torch.allclose(result, expected):
        print("Test PASSED! ✓")
    else:
        print("Test FAILED!")


if __name__ == "__main__":
    main()
@msaroufim
Copy link
Author

(pt) ➜ examples git:(msaroufim/noheader) ✗ python tensor_base_example.py
Removing build directory: /home/marksaroufim/pytorch/examples/minimal_tensor_build
Created build directory: /home/marksaroufim/pytorch/examples/minimal_tensor_build
Detected CUDA files, patching ldflags
Emitting ninja build file /home/marksaroufim/pytorch/examples/minimal_tensor_build/build.ninja...
Building extension module minimal_tensor_extension...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /home/marksaroufim/.conda/envs/pt/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -ccbin /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-cc -DTORCH_EXTENSION_NAME=minimal_tensor_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="gcc" -DPYBIND11_STDLIB="libstdcpp" -DPYBIND11_BUILD_ABI="cxxabi1011" -I/home/marksaroufim/.conda/envs/pt/targets/x86_64-linux/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/TH -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/THC -isystem /home/marksaroufim/.conda/envs/pt/include -isystem /home/marksaroufim/.conda/envs/pt/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS -D__CUDA_NO_HALF_CONVERSIONS_ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' -arch=sm_80 -std=c++17 -c /home/marksaroufim/pytorch/examples/minimal_tensor_build/cuda.cu -o cuda.cuda.o
nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
[2/3] /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=minimal_tensor_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="_gcc" -DPYBIND11_STDLIB="_libstdcpp" -DPYBIND11_BUILD_ABI="_cxxabi1011" -I/home/marksaroufim/.conda/envs/pt/targets/x86_64-linux/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/TH -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/THC -isystem /home/marksaroufim/.conda/envs/pt/include -isystem /home/marksaroufim/.conda/envs/pt/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/marksaroufim/pytorch/examples/minimal_tensor_build/main.cpp -o main.o
[3/3] /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-c++ main.o cuda.cuda.o -shared -L/home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/home/marksaroufim/.conda/envs/pt/lib -lcudart -o minimal_tensor_extension.so
Loading extension module minimal_tensor_extension...

Extension compiled in 0:00:14.889961!

Max difference: 0.0
Test PASSED! ✓
(pt) ➜ examples git:(msaroufim/noheader) ✗ python tensor_base_example.py
Removing build directory: /home/marksaroufim/pytorch/examples/minimal_tensor_build
Created build directory: /home/marksaroufim/pytorch/examples/minimal_tensor_build
Detected CUDA files, patching ldflags
Emitting ninja build file /home/marksaroufim/pytorch/examples/minimal_tensor_build/build.ninja...
Building extension module minimal_tensor_extension...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=minimal_tensor_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="gcc" -DPYBIND11_STDLIB="libstdcpp" -DPYBIND11_BUILD_ABI="cxxabi1011" -I/home/marksaroufim/.conda/envs/pt/targets/x86_64-linux/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/TH -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/THC -isystem /home/marksaroufim/.conda/envs/pt/include -isystem /home/marksaroufim/.conda/envs/pt/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/marksaroufim/pytorch/examples/minimal_tensor_build/main.cpp -o main.o
[2/3] /home/marksaroufim/.conda/envs/pt/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -ccbin /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-cc -DTORCH_EXTENSION_NAME=minimal_tensor_extension -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE="gcc" -DPYBIND11_STDLIB="libstdcpp" -DPYBIND11_BUILD_ABI="cxxabi1011" -I/home/marksaroufim/.conda/envs/pt/targets/x86_64-linux/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/TH -isystem /home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/include/THC -isystem /home/marksaroufim/.conda/envs/pt/include -isystem /home/marksaroufim/.conda/envs/pt/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS -D__CUDA_NO_HALF_CONVERSIONS
-D__CUDA_NO_BFLOAT16_CONVERSIONS
-D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' -arch=sm_80 -std=c++17 -c /home/marksaroufim/pytorch/examples/minimal_tensor_build/cuda.cu -o cuda.cuda.o
nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
[3/3] /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-c++ main.o cuda.cuda.o -shared -L/home/marksaroufim/.conda/envs/pt/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/home/marksaroufim/.conda/envs/pt/lib -lcudart -o minimal_tensor_extension.so
Loading extension module minimal_tensor_extension...

Extension compiled in 0:01:04.020993!

Max difference: 0.0
Test PASSED! ✓
(pt) ➜ examples git:(msaroufim/noheader) ✗

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment