
Commit 09b18ee

Update fp8 conv3d to use mslk (#3530)

Summary: fbgemm_gpu_genai has been renamed to MSLK (https://github.com/meta-pytorch/MSLK/tree/main), so update the fp8 conv dependency to mslk for now (we can migrate the other ops in the future).

Next: we'll remove the permute and test the new functionality added in the fp8 conv op.

Test Plan:
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_fp8_conv_variants

Reviewers:

Subscribers:

Tasks:

Tags:

1 parent d6bbb67 commit 09b18ee

File tree

3 files changed: +19 -9 lines changed

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 3 additions & 2 deletions
@@ -31,6 +31,7 @@
 from torchao.testing.utils import TorchAOIntegrationTestCase
 from torchao.utils import (
     _is_fbgemm_gpu_genai_available,
+    _is_mslk_available,
     is_sm_at_least_89,
     is_sm_at_least_90,
     is_sm_at_least_100,
@@ -329,8 +330,8 @@ def _test_fp8_matmul_model(
         not is_sm_at_least_100(), "Requires GPU with compute capability >= 10.0"
     )
     @unittest.skipIf(
-        not _is_fbgemm_gpu_genai_available(),
-        "Requires fbgemm_gpu_genai to be installed",
+        not _is_mslk_available(),
+        "Requires mslk to be installed",
     )
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
     @common_utils.parametrize("compile", [True, False])

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 9 additions & 7 deletions
@@ -41,6 +41,7 @@
 from torchao.utils import (
     TorchAOBaseTensor,
     _is_fbgemm_gpu_genai_available,
+    _is_mslk_available,
     fill_defaults,
     is_sm_at_least_90,
     is_sm_at_least_100,
@@ -506,9 +507,7 @@ def _quantize_and_scaled_conv3d(
     assert input_tensor.dim() == 5 and weight_tensor.dim() == 5, (
         "Only support 3D conv currently"
     )
-    assert _is_fbgemm_gpu_genai_available(), (
-        "quantized fp8 conv3d requires fbgemm_gpu_genai to be available"
-    )
+    assert _is_mslk_available(), "quantized fp8 conv3d requires mslk to be available"
     act_quant_kwargs = weight_tensor.act_quant_kwargs
     # quantize activation, if `act_quant_kwargs` is specified
     if act_quant_kwargs is not None:
@@ -519,8 +518,8 @@ def _quantize_and_scaled_conv3d(
     if isinstance(input_tensor, Float8Tensor):
         kernel_choice = None
         if weight_tensor.kernel_preference == KernelPreference.AUTO:
-            if _is_fbgemm_gpu_genai_available() and is_sm_at_least_100():
-                kernel_choice = "fbgemm"
+            if _is_mslk_available() and is_sm_at_least_100():
+                kernel_choice = "mslk"
         else:
             raise NotImplementedError(
                 f"No available kernel choice for {weight_tensor.kernel_preference}"
@@ -532,7 +531,7 @@ def _quantize_and_scaled_conv3d(
                 f"No available kernel choice for {weight_tensor.kernel_preference}"
             )

-    assert kernel_choice == "fbgemm", "Only fbgemm kernel choice is supported currently"
+    assert kernel_choice == "mslk", "Only mslk kernel choice is supported currently"
     input_qdata = input_tensor.qdata
     weight_qdata = weight_tensor.qdata

@@ -560,7 +559,10 @@ def _quantize_and_scaled_conv3d(

     input_scale = input_tensor.scale
     weight_scale = weight_tensor.scale
-    output = torch.ops.fbgemm.f8f8bf16_conv(
+
+    import mslk.conv  # noqa: F401
+
+    output = torch.ops.mslk.f8f8bf16_conv(
         input_qdata,
         weight_qdata,
         input_scale * weight_scale,
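Note: `import mslk.conv  # noqa: F401` is a registration import: loading the module is what makes the custom op visible under the `torch.ops.mslk` namespace, and the `noqa` silences the unused-import warning. A hedged sketch of the call-site pattern follows; the wrapper name is hypothetical, and since the hunk above truncates the argument list after the combined scale, the remaining conv arguments passed by the real call site are elided here.

import torch


def _fp8_conv3d_via_mslk(input_qdata, weight_qdata, input_scale, weight_scale):
    # Importing mslk.conv registers torch.ops.mslk.f8f8bf16_conv with the
    # PyTorch dispatcher as a side effect; the module name itself is unused.
    import mslk.conv  # noqa: F401

    # The two scales are folded into a single rescaling factor before the
    # call, so the kernel only needs one scale argument.
    return torch.ops.mslk.f8f8bf16_conv(
        input_qdata,
        weight_qdata,
        input_scale * weight_scale,
        # ...the real call site passes further conv parameters not shown in
        # the hunk above.
    )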

torchao/utils.py

Lines changed: 7 additions & 0 deletions
@@ -1146,6 +1146,13 @@ def _is_fbgemm_gpu_genai_available():
     return True


+def _is_mslk_available():
+    if is_fbcode():
+        return True
+
+    return importlib.util.find_spec("mslk") is not None
+
+
 class DummyModule(torch.nn.Module):
     """This is used because the TorchAO quantization functions tend to operate on modules so to apply the transform to a tensor, we can load a
     DummyModule with the target tensor and then apply the transformation to the module and then extract the transformed tensor.
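Note: `importlib.util.find_spec` only asks the import machinery whether the package can be found; it does not execute the package, so the probe is cheap and side-effect free, which is why it is preferred over a try/except import for a capability check (the `is_fbcode()` branch simply assumes the dependency is always present in Meta-internal builds). A small stdlib-only illustration of the distinction:

import importlib
import importlib.util

# Probe: no code from the package runs, safe even for broken installs.
if importlib.util.find_spec("mslk") is not None:
    # Import: only now does the package (and any op registration it
    # performs at import time) actually execute.
    mslk = importlib.import_module("mslk")
else:
    mslk = None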
