diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a41236..a312238 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ option(WITH_CPU "Enable CPU backend" OFF) option(WITH_NVIDIA "Enable CUDA backend" OFF) option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF) option(WITH_METAX "Enable MetaX backend" OFF) +option(WITH_CAMBRICON "Enable Cambricon backend" OFF) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF) @@ -55,6 +56,11 @@ if(AUTO_DETECT_DEVICES) message(STATUS "No MetaX GPU detected") endif() endif() + + if(DEFINED ENV{NEUWARE_HOME}) + set(WITH_CAMBRICON ON) + message(STATUS "Auto-detected Cambricon environment.") + endif() endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -105,7 +111,22 @@ if(WITH_METAX) find_library(MACA_BLAS_LIB NAMES mcblas HINTS "${MACA_PATH}/lib" REQUIRED) endif() -# If no GPU platform is enabled, CPU is enabled by default. +if(WITH_CAMBRICON) + add_compile_definitions(WITH_CAMBRICON=1) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + + include_directories("${NEUWARE_HOME}/include") + link_directories("${NEUWARE_HOME}/lib") + link_directories("${NEUWARE_HOME}/lib64") + + # Libraries: `cnrt` / `cnnl` / `cnnl_extra` / `cnpapi`. + find_library(CAMBRICON_RUNTIME_LIB NAMES cnrt HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_LIB NAMES cnnl HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_CNNL_EXTRA_LIB NAMES cnnl_extra HINTS "${NEUWARE_HOME}/lib64" REQUIRED) + find_library(CAMBRICON_PAPI_LIB NAMES cnpapi HINTS "${NEUWARE_HOME}/lib64" REQUIRED) +endif() + +# If all other platforms are not enabled, CPU is enabled by default. 
-if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX) +if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_CAMBRICON) add_compile_definitions(WITH_CPU=1) endif() diff --git a/examples/gemm/gemm.cc b/examples/gemm/gemm.cc index 62779e6..bb82890 100644 --- a/examples/gemm/gemm.cc +++ b/examples/gemm/gemm.cc @@ -14,6 +14,9 @@ #if WITH_METAX #include "metax/gemm/mcblas.h" #endif +#if WITH_CAMBRICON +#include "cambricon/gemm/cnblas.h" +#endif #include "runtime_api.h" #include "tensor.h" diff --git a/examples/runtime_api.h b/examples/runtime_api.h index 896af64..b56a8fd 100644 --- a/examples/runtime_api.h +++ b/examples/runtime_api.h @@ -28,6 +28,15 @@ #define DEVICE_MEMCPY_HOST_TO_DEVICE mcMemcpyHostToDevice #define DEVICE_MEMCPY_DEVICE_TO_HOST mcMemcpyDevToHost #define DEFAULT_DEVICE_TYPE Device::Type::kMetax +#elif WITH_CAMBRICON +#include <cnrt.h> +#define DEVICE_MALLOC cnrtMalloc +#define DEVICE_FREE cnrtFree +#define DEVICE_MEMCPY cnrtMemcpy +#define DEVICE_MEMSET cnrtMemset +#define DEVICE_MEMCPY_HOST_TO_DEVICE cnrtMemcpyHostToDev +#define DEVICE_MEMCPY_DEVICE_TO_HOST cnrtMemcpyDevToHost +#define DEFAULT_DEVICE_TYPE Device::Type::kCambricon #elif WITH_CPU #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02fbc4d..6eef5d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -103,6 +103,15 @@ if(WITH_METAX) list(APPEND DEVICE_LIST "metax") endif() +if(WITH_CAMBRICON) + target_compile_definitions(infiniops PUBLIC WITH_CAMBRICON=1) + + target_include_directories(infiniops PUBLIC "${NEUWARE_HOME}/include") + target_link_libraries(infiniops PUBLIC ${CAMBRICON_RUNTIME_LIB} ${CAMBRICON_CNNL_LIB} ${CAMBRICON_CNNL_EXTRA_LIB} ${CAMBRICON_PAPI_LIB}) + + list(APPEND DEVICE_LIST "cambricon") +endif() + target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) if(GENERATE_PYTHON_BINDINGS) diff --git a/src/cambricon/common.h b/src/cambricon/common.h new file mode 100644 index 0000000..50775c2 --- /dev/null +++ b/src/cambricon/common.h @@ -0,0 +1,25 @@ +#ifndef INFINI_OPS_CAMBRICON_COMMON_H_
+#define INFINI_OPS_CAMBRICON_COMMON_H_ + +#include <cnnl.h> + +#include "data_type.h" + +namespace infini::ops::cnnl_utils { + +inline cnnlDataType_t GetDataType(DataType dtype) { + switch (dtype) { + case DataType::kInt32: + return CNNL_DTYPE_INT32; + case DataType::kFloat16: + return CNNL_DTYPE_HALF; + case DataType::kFloat32: + return CNNL_DTYPE_FLOAT; + default: + return CNNL_DTYPE_INVALID; + } +} + +} // namespace infini::ops::cnnl_utils + +#endif diff --git a/src/cambricon/gemm/cnblas.h b/src/cambricon/gemm/cnblas.h new file mode 100644 index 0000000..ac95bd5 --- /dev/null +++ b/src/cambricon/gemm/cnblas.h @@ -0,0 +1,159 @@ +#ifndef INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_ +#define INFINI_OPS_CAMBRICON_GEMM_CNBLAS_H_ + +#include <cassert> +#include <optional> +#include <vector> + +// clang-format off +#include <cnnl.h> +#include <cnrt.h> +// clang-format on + +#include "base/gemm.h" +#include "cambricon/common.h" + +namespace infini::ops { + +template <> +class Operator : public Gemm { + public: + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) + : Gemm{a, b, alpha, beta, trans_a, trans_b, c}, + a_rows_{a.size(-2)}, + a_cols_{a.size(-1)}, + b_rows_{b.size(-2)}, + b_cols_{b.size(-1)}, + c_rows_{c.size(-2)}, + c_cols_{c.size(-1)} { + assert(!trans_a_ && "`trans_a` is not currently supported"); + assert(!trans_b_ && "`trans_b` is not currently supported"); + + cnnlCreate(&cnnl_handle_); + + cnnlCreateTensorDescriptor(&desc_a_); + cnnlCreateTensorDescriptor(&desc_b_); + cnnlCreateTensorDescriptor(&desc_c_); + + cnnlCreateMatMulDescriptor(&matmul_desc_); + cnnlCreateMatMulAlgo(&matmul_algo_); + cnnlCreateMatMulHeuristicResult(&heuristic_result_); + + int32_t use_stride = 1; + cnnlSetMatMulDescAttr(matmul_desc_, CNNL_MATMUL_USE_STRIDE, &use_stride, + sizeof(int32_t)); + + SetupTensorDescriptor(desc_a_, a_strides_, a_type_, a_rows_, a_cols_, + batch_count_, batch_stride_a_); + SetupTensorDescriptor(desc_b_, b_strides_, b_type_,
b_rows_, b_cols_, + batch_count_, batch_stride_b_); + SetupTensorDescriptor(desc_c_, c_strides_, c_type_, c_rows_, c_cols_, + batch_count_, batch_stride_c_); + int count = 0; + cnnlGetBatchMatMulExAlgoHeuristic(cnnl_handle_, matmul_desc_, desc_a_, + desc_b_, desc_c_, NULL, 1, + &heuristic_result_, &count); + + cnrtMalloc(&default_workspace_, workspace_size_in_bytes()); + } + + Operator(const Tensor a, const Tensor b, Tensor c) + : Operator{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + c} {} + + Operator(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, Tensor c) + : Operator{a, b, alpha, beta, std::nullopt, std::nullopt, c} {} + + ~Operator() { + cnrtFree(default_workspace_); + cnnlDestroyTensorDescriptor(desc_c_); + cnnlDestroyTensorDescriptor(desc_b_); + cnnlDestroyTensorDescriptor(desc_a_); + cnnlDestroyMatMulDescriptor(matmul_desc_); + cnnlDestroyMatMulAlgo(matmul_algo_); + cnnlDestroyMatMulHeuristicResult(heuristic_result_); + cnnlDestroy(cnnl_handle_); + } + + void operator()(const Tensor a, const Tensor b, std::optional alpha, + std::optional beta, std::optional trans_a, + std::optional trans_b, Tensor c) const override { + const auto& alpha_value{alpha.value_or(alpha_)}; + const auto& beta_value{beta.value_or(beta_)}; + + cnnlSetQueue(cnnl_handle_, (cnrtQueue_t)stream_); + + auto workspace{workspace_ ? workspace_ : default_workspace_}; + auto workspace_size{workspace_size_in_bytes_ ? 
workspace_size_in_bytes_ + : workspace_size_in_bytes()}; + + cnnlBatchMatMulEx(cnnl_handle_, matmul_desc_, matmul_algo_, &alpha_value, + desc_a_, a.data(), desc_b_, b.data(), &beta_value, + desc_c_, c.data(), workspace, workspace_size); + } + + std::size_t workspace_size_in_bytes() const override { + std::size_t size{0}; + + cnnlGetBatchMatMulExHeuristicResult(heuristic_result_, matmul_algo_, &size); + + return size; + } + + private: + void SetupTensorDescriptor(cnnlTensorDescriptor_t desc, + const Tensor::Strides& strides, DataType dtype, + Tensor::Size rows, Tensor::Size cols, + Tensor::Size batch, Tensor::Stride batch_stride) { + cnnlDataType_t cnnl_dtype = cnnl_utils::GetDataType(dtype); + + if (batch > 1) { + std::vector<int> dims = {static_cast<int>(batch), static_cast<int>(rows), + static_cast<int>(cols)}; + std::vector<int> strides_arr = { + static_cast<int>(batch_stride), + static_cast<int>(strides[strides.size() - 2]), + static_cast<int>(strides[strides.size() - 1])}; + cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype, + dims.size(), dims.data(), strides_arr.data()); + } else { + std::vector<int> dims = {static_cast<int>(rows), static_cast<int>(cols)}; + std::vector<int> strides_arr = { + static_cast<int>(strides[strides.size() - 2]), + static_cast<int>(strides[strides.size() - 1])}; + cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, cnnl_dtype, + dims.size(), dims.data(), strides_arr.data()); + } + } + + cnnlHandle_t cnnl_handle_; + + cnnlTensorDescriptor_t desc_a_; + + cnnlTensorDescriptor_t desc_b_; + + cnnlTensorDescriptor_t desc_c_; + + cnnlMatMulDescriptor_t matmul_desc_; + + cnnlMatMulAlgo_t matmul_algo_; + + cnnlMatMulHeuristicResult_t heuristic_result_; + + Tensor::Size a_rows_, a_cols_; + + Tensor::Size b_rows_, b_cols_; + + Tensor::Size c_rows_, c_cols_; + + // TODO: Remove the following member after default workspace mechanism has + // been introduced globally.
+ void* default_workspace_{nullptr}; +}; + +} // namespace infini::ops + +#endif diff --git a/tests/test_gemm.py b/tests/test_gemm.py index c091136..faee9d5 100644 --- a/tests/test_gemm.py +++ b/tests/test_gemm.py @@ -38,6 +38,10 @@ def test_gemm( rtol, atol, ): + # Skip transposing test cases for MLU platform as transposing is not currently supported. + if device == "mlu" and (trans_a or trans_b): + pytest.skip("transposing is not currently supported on MLU") + a = randn_strided(a_shape, a_strides, dtype=dtype, device=device) b = randn_strided(b_shape, b_strides, dtype=dtype, device=device)