import torch
import os

# Metal capture must be enabled before the MPS device is initialized
os.environ["MTL_CAPTURE_ENABLED"] = "1"

# 2 x (2**31 + 5) elements: a single row is larger than INT32_MAX, exercising 64-bit index math
a = torch.ones(2, (1 << 31) + 5, dtype=torch.int8, device='mps')
index_0 = torch.tensor([0, -1, 0, 1], device=a.device)
index_1 = torch.tensor([-2, -1, 0, 1], device=a.device)
values = torch.tensor([12, 13, 10, 11], dtype=a.dtype, device=a.device)
with torch.mps.profiler.metal_capture("index_put"):
    a.index_put_((index_0, index_1), values, accumulate=True)
# Read back an element whose linear offset is above INT32_MAX
b = a[1, -2].cpu()
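If the 64-bit indexing path is correct, none of the four scattered writes lands on a[1, -2] (the index pairs are (0, -2), (1, -1), (0, 0) and (1, 1)), so a sanity check one might append to the repro (my addition, not part of the original gist):

```
assert b.item() == 1, f"a[1, -2] corrupted: {b.item()}"
```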
#!/usr/bin/env python3
import sys
import subprocess
import urllib.request
import json


def get_latest_version(package_name: str) -> str:
    """Get the latest version of a package from PyPI"""
    api_url = f"https://pypi.org/pypi/{package_name}/json"
    with urllib.request.urlopen(api_url) as resp:
        return json.loads(resp.read())["info"]["version"]
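The rest of this gist is truncated. As a minimal driver for the fragment above (hypothetical usage, mine, including the script name): PyPI's JSON API reports the newest release under `info.version`, which the function returns.

```
if __name__ == "__main__":
    # e.g. `python3 latest_version.py torch`
    package = sys.argv[1] if len(sys.argv) > 1 else "torch"
    print(package, get_latest_version(package))
```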
With cudnn-9.10.2.21:
```
$ CUDNN_LOGINFO_DBG=3 RUN_SLOW=1 python3 -m pytest -v tests/models/vit/test_modeling_vit.py::ViTModelTest::test_batching_equivalence
========================================================================================== test session starts ===========================================================================================
platform linux -- Python 3.10.12, pytest-8.4.1, pluggy-1.6.0 -- /home/ubuntu/py3.10-nightly/bin/python3
cachedir: .pytest_cache
rootdir: /home/ubuntu/transformers
configfile: pyproject.toml
plugins: xdist-3.8.0, asyncio-1.1.0, rerunfailures-15.1, order-1.3.0, timeout-2.4.0, rich-0.2.0
asyncio: mode=strict, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function
```

import torch
import torch.utils.cpp_extension as _ce
import tempfile
import os
import subprocess

# Minimal standalone C++ program that prints a c10::BFloat16 value
src = """#include <c10/util/BFloat16.h>
#include <iostream>

int main() {
  std::cout << c10::BFloat16(3.14) << std::endl;
  return 0;
}
"""
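The gist is truncated at this point. Given the imports above, a plausible continuation (a sketch under my assumptions, not the author's original code) writes the program to a temporary directory, compiles it against PyTorch's bundled headers via `torch.utils.cpp_extension.include_paths()`, and runs it; BFloat16 keeps only 8 mantissa bits, so 3.14 rounds to 3.140625.

```
with tempfile.TemporaryDirectory() as tmpdir:
    cpp = os.path.join(tmpdir, "bf16.cpp")
    exe = os.path.join(tmpdir, "bf16")
    with open(cpp, "w") as f:
        f.write(src)
    include_flags = [f"-I{p}" for p in _ce.include_paths()]
    # BFloat16 is header-only, so no linking against libtorch should be needed
    subprocess.check_call(["g++", "-std=c++17", cpp, "-o", exe] + include_flags)
    print(subprocess.check_output([exe], text=True).strip())  # expect 3.14062
```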

import ctypes
import torch
import time


def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
    libnvrtc = CDLL('libnvrtc.so')

    # nvrtcGetErrorString takes an nvrtcResult and returns a static C string,
    # so it needs a restype rather than an output parameter
    def get_error_string(result) -> str:
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode()
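`nvrtc_compile` is truncated here. The standard NVRTC flow it presumably continues with is create-program, compile, fetch PTX; a self-contained ctypes sketch of that flow (mine, following the documented NVRTC C API, not the gist's original body):

```
from ctypes import CDLL, byref, c_char_p, c_size_t, c_void_p, create_string_buffer

libnvrtc = CDLL("libnvrtc.so")
libnvrtc.nvrtcGetErrorString.restype = c_char_p

def _check(result: int) -> None:
    if result != 0:
        raise RuntimeError(libnvrtc.nvrtcGetErrorString(result).decode())

def compile_to_ptx(source: str) -> bytes:
    prog = c_void_p()
    _check(libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None))
    _check(libnvrtc.nvrtcCompileProgram(prog, 0, None))
    size = c_size_t()
    _check(libnvrtc.nvrtcGetPTXSize(prog, byref(size)))
    ptx = create_string_buffer(size.value)
    _check(libnvrtc.nvrtcGetPTX(prog, ptx))
    _check(libnvrtc.nvrtcDestroyProgram(byref(prog)))
    return ptx.value
```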
"""
Example showing how to use the no_header mode with a TensorBase CUDA extension
This example creates a CUDA extension that directly includes ATen/core/TensorBase.h
instead of torch/extension.h, resulting in faster compilation with no_header=True
"""
from datetime import datetime
import torch
import torch.utils.cpp_extension
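The example body is truncated in this capture. A rough sketch of what the docstring describes (heavily hedged: the `no_header` flag name is copied verbatim from the docstring above and may differ across PyTorch versions, and the source string and timing harness are mine, not the author's):

```
source = """
#include <ATen/core/TensorBase.h>

// Trivial function; the point is header/compile cost, not functionality
int64_t num_scalar_types() { return static_cast<int64_t>(at::ScalarType::NumOptions); }
"""

start = datetime.now()
module = torch.utils.cpp_extension.load_inline(
    name="tensorbase_no_header_demo",
    cpp_sources=[source],
    functions=["num_scalar_types"],
    no_header=True,  # assumed flag name, taken from the docstring above
)
print(module.num_scalar_types(), "compiled in", datetime.now() - start)
```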
import dis
import timeit
def list_to_dict_1(l):
    rc = {}
    for idx, v in enumerate(l):
        rc[v] = idx
    return rc
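The remaining variants this gist compares are truncated. A plausible companion and measurement harness matching the `dis`/`timeit` imports above (the comprehension variant and the driver are mine, not the author's):

```
def list_to_dict_2(l):
    # Same value -> index mapping, built with a dict comprehension
    return {v: idx for idx, v in enumerate(l)}

if __name__ == "__main__":
    dis.dis(list_to_dict_1)  # compare the bytecode of the two variants
    dis.dis(list_to_dict_2)
    data = list(range(1_000))
    for fn in (list_to_dict_1, list_to_dict_2):
        print(fn.__name__, timeit.timeit(lambda: fn(data), number=1_000))
```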

// Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (running macOS 15.3.1)
// Works on M4 (and possibly M3)
let shader_source = """
template <typename T>
float bessel_j0_forward(T x) {
  constexpr float PP[] = {
      +7.96936729297347051624e-04,
      +8.28352392107440799803e-02,
      +1.23953371646414299388e+00,
# How to reuse shared memory
# Right now MPS inductor produces the following code:
# #include <c10/metal/random.h>
# #include <c10/metal/special_math.h>
# #include <c10/metal/utils.h>
# #include <c10/metal/reduction_utils.h>
# kernel void generated_kernel(
# device float* out_ptr0,
# device float* out_ptr1,
# constant float* in_ptr0,
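# One possible shape for the reuse (a hand-written sketch, not actual inductor
# output): declare a single threadgroup buffer and let consecutive reductions
# share it, with a barrier in between so the second pass cannot race the first:
#
#   threadgroup float shared[256];
#   // ... first reduction reads/writes `shared`, result goes to out_ptr0 ...
#   threadgroup_barrier(mem_flags::mem_threadgroup);
#   // ... second reduction reuses the same `shared` for out_ptr1 ...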

let shader_source = """
struct add_functor {
  template <typename T>
  inline T operator()(const T a, const T b) {
    return static_cast<T>(a + b);
  }
};

namespace {
struct sub_functor {
  template <typename T>
  inline T operator()(const T a, const T b) {
    // reconstructed to mirror add_functor above; the gist is truncated here
    return static_cast<T>(a - b);
  }
};