diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a41236..a312238 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ option(WITH_CPU "Enable CPU backend" OFF) option(WITH_NVIDIA "Enable CUDA backend" OFF) option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF) option(WITH_METAX "Enable MetaX backend" OFF) +option(WITH_CAMBRICON "Enable Cambricon backend" OFF) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF) @@ -55,6 +56,11 @@ if(AUTO_DETECT_DEVICES) message(STATUS "No MetaX GPU detected") endif() endif() + + if(DEFINED ENV{NEUWARE_HOME}) + set(WITH_CAMBRICON ON) + message(STATUS "Auto-detected Cambricon environment.") + endif() endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -105,7 +111,22 @@ if(WITH_METAX) find_library(MACA_BLAS_LIB NAMES mcblas HINTS "${MACA_PATH}/lib" REQUIRED) endif() -# If no GPU platform is enabled, CPU is enabled by default. +if(WITH_CAMBRICON) + add_compile_definitions(WITH_CAMBRICON=1) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + + include_directories("${NEUWARE_HOME}/include") + link_directories("${NEUWARE_HOME}/lib") + link_directories("${NEUWARE_HOME}/lib64") + + # Libraries: `cnrt` / `cnnl` / `cnnl_extra` / `cnpapi`. + find_library(CAMBRICON_RUNTIME_LIB NAMES cnrt HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_LIB NAMES cnnl HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_EXTRA_LIB NAMES cnnl_extra HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_PAPI_LIB NAMES cnpapi HINTS "${NEUWARE_HOME}/lib64" REQUIRED) +endif() + +# If all other platforms are not enabled, CPU is enabled by default. 
-if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX) +if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_CAMBRICON) add_compile_definitions(WITH_CPU=1) endif() diff --git a/examples/gemm/gemm.cc b/examples/gemm/gemm.cc index 62779e6..bb82890 100644 --- a/examples/gemm/gemm.cc +++ b/examples/gemm/gemm.cc @@ -14,6 +14,9 @@ #if WITH_METAX #include "metax/gemm/mcblas.h" #endif +#if WITH_CAMBRICON +#include "cambricon/gemm/cnblas.h" +#endif #include "runtime_api.h" #include "tensor.h" diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 896af64..b56a8fd 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -28,6 +28,15 @@ #define DEVICE_MEMCPY_HOST_TO_DEVICE mcMemcpyHostToDevice #define DEVICE_MEMCPY_DEVICE_TO_HOST mcMemcpyDevToHost #define DEFAULT_DEVICE_TYPE Device::Type::kMetax +#elif WITH_CAMBRICON +#include <cnrt.h> +#define DEVICE_MALLOC cnrtMalloc +#define DEVICE_FREE cnrtFree +#define DEVICE_MEMCPY cnrtMemcpy +#define DEVICE_MEMSET cnrtMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cnrtMemcpyHostToDev +#define DEVICE_MEMCPY_DEVICE_TO_HOST cnrtMemcpyDevToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kCambricon #elif WITH_CPU #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02fbc4d..6eef5d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -103,6 +103,15 @@ if(WITH_METAX) list(APPEND DEVICE_LIST "metax") endif() +if(WITH_CAMBRICON) + target_compile_definitions(infiniops PUBLIC WITH_CAMBRICON=1) + + target_include_directories(infiniops PUBLIC "${NEUWARE_HOME}/include") + target_link_libraries(infiniops PUBLIC ${CAMBRICON_RUNTIME_LIB} ${CAMBRICON_CNNL_LIB} ${CAMBRICON_CNNL_EXTRA_LIB} ${CAMBRICON_PAPI_LIB}) + + list(APPEND DEVICE_LIST "cambricon") +endif() + target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) if(GENERATE_PYTHON_BINDINGS) diff --git a/src/cambricon/common.h b/src/cambricon/common.h new file mode 100644 index 0000000..50775c2 --- /dev/null +++ b/src/cambricon/common.h @@ -0,0 +1,25 @@ +#ifndef INFINI_OPS_CAMBRICON_COMMON_H_
+#define INFINI_OPS_CAMBRICON_COMMON_H_ + +#include <cnnl.h> + +#include "data_type.h" + +namespace infini::ops::cnnl_utils { + +inline cnnlDataType_t GetDataType(DataType dtype) { + switch (dtype) { + case DataType::kInt32: + return CNNL_DTYPE_INT32; + case DataType::kFloat16: + return CNNL_DTYPE_HALF; + case DataType::kFloat32: + return CNNL_DTYPE_FLOAT; + default: + return CNNL_DTYPE_INVALID; + } +} + +} // namespace infini::ops::cnnl_utils + +#endif diff --git a/src/cambricon/gemm/cnblas.h b/src/cambricon/gemm/cnblas.h new file mode 100644 index 0000000..ac95bd5 --- /dev/null +++ b/src/cambricon/gemm/cnblas.h @@ -0,0 +1,159 @@ +#ifndef INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_ +#define INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_ + +#include <cassert> +#include <optional> +#include <vector> + +// clang-format off +#include <cnnl.h> +#include <cnrt.h> +// clang-format on + +#include "base/gemm.h" +#include "cambricon/common.h" + +namespace infini::ops { + +template <> +class Operator : public Gemm { + public: + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) + : Gemm{a, b, alpha, beta, trans_a, trans_b, c}, + a_rows_{a.size(-2)}, + a_cols_{a.size(-1)}, + b_rows_{b.size(-2)}, + b_cols_{b.size(-1)}, + c_rows_{c.size(-2)}, + c_cols_{c.size(-1)} { + assert(!trans_a_ && "`trans_a` is not currently supported"); + assert(!trans_b_ && "`trans_b` is not currently supported"); + + cnnlCreate(&cnnl_handle_); + + cnnlCreateTensorDescriptor(&desc_a_); + cnnlCreateTensorDescriptor(&desc_b_); + cnnlCreateTensorDescriptor(&desc_c_); + + cnnlCreateMatMulDescriptor(&matmul_desc_); + cnnlCreateMatMulAlgo(&matmul_algo_); + cnnlCreateMatMulHeuristicResult(&heuristic_result_); + + int32_t use_stride = 1; + cnnlSetMatMulDescAttr(matmul_desc_, CNNL_MATMUL_USE_STRIDE, &use_stride, + sizeof(int32_t)); + + SetupTensorDescriptor(desc_a_, a_strides_, a_type_, a_rows_, a_cols_, + batch_count_, batch_stride_a_); + SetupTensorDescriptor(desc_b_, b_strides_, b_type_,
b_rows_, b_cols_, + batch_count_, batch_stride_b_); + SetupTensorDescriptor(desc_c_, c_strides_, c_type_, c_rows_, c_cols_, + batch_count_, batch_stride_c_); + int count = 0; + cnnlGetBatchMatMulExAlgoHeuristic(cnnl_handle_, matmul_desc_, desc_a_, + desc_b_, desc_c_, NULL, 1, + &heuristic_result_, &count); + + cnrtMalloc(&default_workspace_, workspace_size_in_bytes()); + } + + Operator(const Tensor a, const Tensor b, Tensor c) + : Operator{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + c} {} + + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, Tensor c) + : Operator{a, b, alpha, beta, std::nullopt, std::nullopt, c} {} + + ~Operator() { + cnrtFree(default_workspace_); + cnnlDestroyTensorDescriptor(desc_c_); + cnnlDestroyTensorDescriptor(desc_b_); + cnnlDestroyTensorDescriptor(desc_a_); + cnnlDestroyMatMulDescriptor(matmul_desc_); + cnnlDestroyMatMulAlgo(matmul_algo_); + cnnlDestroyMatMulHeuristicResult(heuristic_result_); + cnnlDestroy(cnnl_handle_); + } + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + const auto& alpha_value{alpha.value_or(alpha_)}; + const auto& beta_value{beta.value_or(beta_)}; + + cnnlSetQueue(cnnl_handle_, (cnrtQueue_t)stream_); + + auto workspace{workspace_ ? workspace_ : default_workspace_}; + auto workspace_size{workspace_size_in_bytes_ ? 
workspace_size_in_bytes_ + : workspace_size_in_bytes()}; + + cnnlBatchMatMulEx(cnnl_handle_, matmul_desc_, matmul_algo_, &alpha_value, + desc_a_, a.data(), desc_b_, b.data(), &beta_value, + desc_c_, c.data(), workspace, workspace_size); + } + + std::size_t workspace_size_in_bytes() const override { + std::size_t size{0}; + + cnnlGetBatchMatMulExHeuristicResult(heuristic_result_, matmul_algo_, &size); + + return size; + } + + private: + void SetupTensorDescriptor(cnnlTensorDescriptor_t desc, + const Tensor::Strides& strides, DataType dtype, + Tensor::Size rows, Tensor::Size cols, + Tensor::Size batch, Tensor::Stride batch_stride) { + cnnlDataType_t cnnl_dtype = cnnl_utils::GetDataType(dtype); + + if (batch > 1) { + std::vector<int> dims = {static_cast<int>(batch), static_cast<int>(rows), + static_cast<int>(cols)}; + std::vector<int> strides_arr = { + static_cast<int>(batch_stride), + static_cast<int>(strides[strides.size() - 2]), + static_cast<int>(strides[strides.size() - 1])}; + cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype, + dims.size(), dims.data(), strides_arr.data()); + } else { + std::vector<int> dims = {static_cast<int>(rows), static_cast<int>(cols)}; + std::vector<int> strides_arr = { + static_cast<int>(strides[strides.size() - 2]), + static_cast<int>(strides[strides.size() - 1])}; + cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype, + dims.size(), dims.data(), strides_arr.data()); + } + } + + cnnlHandle_t cnnl_handle_; + + cnnlTensorDescriptor_t desc_a_; + + cnnlTensorDescriptor_t desc_b_; + + cnnlTensorDescriptor_t desc_c_; + + cnnlMatMulDescriptor_t matmul_desc_; + + cnnlMatMulAlgo_t matmul_algo_; + + cnnlMatMulHeuristicResult_t heuristic_result_; + + Tensor::Size a_rows_, a_cols_; + + Tensor::Size b_rows_, b_cols_; + + Tensor::Size c_rows_, c_cols_; + + // TODO: Remove the following member after default workspace mechanism has + // been introduced globally.
+ void* default_workspace_{nullptr}; +}; + +} // namespace infini::ops + +#endif diff --git a/tests/test_gemm.py b/tests/test_gemm.py index c091136..faee9d5 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -38,6 +38,10 @@ def test_gemm( rtol, atol, ): + # Skip transposing test cases for MLU platform as transposing is not currently supported. + if device == "mlu" and (trans_a or trans_b): + pytest.skip("transposing is not currently supported on MLU") + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device)