Fix NVFP4 QAT mixed precision (#3501)

andrewor14 · web-flow · commit e4bfca1fb2bf · 2025-12-22T17:21:08.000-05:00
**Summary:** This commit adds support for bf16 activations +
fp32 weights mixed precision for NVFP4 QAT, which previously
threw a dtype assertion error:
```
File "ao/torchao/prototype/qat/nvfp4.py", line 159, in forward
  assert fq.dtype == x.dtype
```

**Test Plan:**
```
python test/quantization/test_qat.py -k test_nvfp4_fake_quantized_linear_mixed_precision
```
diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py
@@ -2182,6 +2182,42 @@ def test_qat_nvfp4_training(self, use_per_tensor_scale: bool):
             self.assertNotEqual(torch.count_nonzero(new_weight.grad), 0)
             self.assertFalse(torch.equal(new_weight, prev_weight))
 
+    @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    def test_nvfp4_fake_quantized_linear_mixed_precision(self):
+        """
+        Test `NVFP4FakeQuantizedLinear` with bf16 input activations and fp32 weights.
+        """
+        from torchao.prototype.qat.nvfp4 import (
+            NVFP4FakeQuantizeConfig,
+            NVFP4FakeQuantizedLinear,
+        )
+
+        activation_dtype = torch.bfloat16
+        weight_dtype = torch.float32
+        linear = torch.nn.Linear(128, 512, dtype=weight_dtype).cuda()
+        activation_config = NVFP4FakeQuantizeConfig(use_per_tensor_scale=True)
+        weight_config = NVFP4FakeQuantizeConfig(use_per_tensor_scale=True)
+        linear = NVFP4FakeQuantizedLinear.from_linear(
+            linear, activation_config, weight_config
+        )
+
+        # simulate 1 step of training with the forward pass under bf16 autocast
+        optimizer = torch.optim.SGD(linear.parameters())
+        loss_fn = torch.nn.CrossEntropyLoss()
+        target = torch.randn(1, 512).float().cuda()
+        x = torch.randn(1, 128, dtype=activation_dtype).cuda()
+        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+            out = linear(x)
+            self.assertEqual(linear.weight.dtype, weight_dtype)
+            self.assertEqual(x.dtype, activation_dtype)
+            self.assertEqual(out.dtype, activation_dtype)
+        loss = loss_fn(out, target)
+        loss.backward()
+        self.assertEqual(linear.weight.grad.dtype, weight_dtype)
+        optimizer.step()
+        optimizer.zero_grad()
+
     @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
     @unittest.skipIf(
         not _is_fbgemm_gpu_genai_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
diff --git a/torchao/prototype/mx_formats/nvfp4_tensor.py b/torchao/prototype/mx_formats/nvfp4_tensor.py
@@ -492,7 +492,7 @@ def _addmm_nvfp4_dispatch(
 
     # Add bias after scaling if needed
     if should_add_bias_separately:
-        result = result + bias
+        result = result + bias.to(a._orig_dtype)
 
     return result
 
diff --git a/torchao/prototype/qat/nvfp4.py b/torchao/prototype/qat/nvfp4.py
@@ -39,6 +39,7 @@ class _NVFP4QuantizedForwardFakeQuantizedBackward(torch.autograd.Function):
     """
 
     @staticmethod
+    @torch.amp.custom_fwd(device_type="cuda")
     def forward(
         ctx,
         _input: torch.Tensor,
@@ -87,6 +88,7 @@ def forward(
         )
 
     @staticmethod
+    @torch.amp.custom_bwd(device_type="cuda")
     def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
         _input, weight = ctx.saved_tensors
         assert isinstance(_input, NVFP4Tensor)