RedHatAI
/

quantization

Model card Files Files and versions

quantization / cutlass_extensions /common.hpp

danieldk's picture

danieldk HF Staff

Sync to vLLM 20250627

8aa00a3 8 months ago

history blame contribute delete

1.9 kB

	#pragma once

	#include "cutlass/cutlass.h"
	#include <climits>
	#include "cuda_runtime.h"
	#include <iostream>

	/**
	* Helper function for checking CUTLASS errors
	*/
	#define CUTLASS_CHECK(status) \
	{ \
	cutlass::Status error = status; \
	TORCH_CHECK(error == cutlass::Status::kSuccess, \
	cutlassGetStatusString(error)); \
	}

	inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
	int max_shared_mem_per_block_opt_in = 0;
	cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
	cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
	return max_shared_mem_per_block_opt_in;
	}

	int32_t get_sm_version_num();

	/**
	* A wrapper for a kernel that is used to guard against compilation on
	* architectures that will never use the kernel. The purpose of this is to
	* reduce the size of the compiled binary.
	* __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
	* into code that will be executed on the device where it is defined.
	*/
	template <typename Kernel>
	struct enable_sm90_or_later : Kernel {
	template <typename... Args>
	CUTLASS_DEVICE void operator()(Args&&... args) {
	#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
	Kernel::operator()(std::forward<Args>(args)...);
	#endif
	}
	};

	template <typename Kernel>
	struct enable_sm90_only : Kernel {
	template <typename... Args>
	CUTLASS_DEVICE void operator()(Args&&... args) {
	#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
	Kernel::operator()(std::forward<Args>(args)...);
	#endif
	}
	};

	template <typename Kernel>
	struct enable_sm100_only : Kernel {
	template <typename... Args>
	CUTLASS_DEVICE void operator()(Args&&... args) {
	#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
	Kernel::operator()(std::forward<Args>(args)...);
	#endif
	}
	};