25 changes: 25 additions & 0 deletions test/quantization/test_qat.py
@@ -2182,6 +2182,31 @@ def test_qat_nvfp4_training(self, use_per_tensor_scale: bool):
        self.assertNotEqual(torch.count_nonzero(new_weight.grad), 0)
        self.assertFalse(torch.equal(new_weight, prev_weight))

    @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
    def test_nvfp4_fake_quantized_linear_mixed_precision(self):
        """
        Test `NVFP4FakeQuantizedLinear` with bf16 input activations and fp32 weights.
        """
        from torchao.prototype.qat.nvfp4 import (
            NVFP4FakeQuantizeConfig,
            NVFP4FakeQuantizedLinear,
        )

        activation_dtype = torch.bfloat16
        weight_dtype = torch.float32
        linear = torch.nn.Linear(128, 512, dtype=weight_dtype).cuda()
        activation_config = NVFP4FakeQuantizeConfig(use_per_tensor_scale=True)
        weight_config = NVFP4FakeQuantizeConfig(use_per_tensor_scale=True)
        linear = NVFP4FakeQuantizedLinear.from_linear(
            linear, activation_config, weight_config
        )
        x = torch.randn(1, 128, dtype=activation_dtype).cuda()
        out = linear(x)
        self.assertEqual(linear.weight.dtype, weight_dtype)
        self.assertEqual(x.dtype, activation_dtype)
        self.assertEqual(out.dtype, activation_dtype)

    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
    @unittest.skipIf(
        not _is_fbgemm_gpu_genai_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
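The new test only asserts dtypes on the forward pass. As a hedged sketch (not part of this PR), the same conversion can also be exercised in an ordinary training step: the NVFP4FakeQuantizeConfig and NVFP4FakeQuantizedLinear.from_linear calls are the ones used in the test above, while the loss and optimizer wiring is plain PyTorch and assumes a CUDA device with sm89+, matching the test's skip conditions.

import torch

from torchao.prototype.qat.nvfp4 import (
    NVFP4FakeQuantizeConfig,
    NVFP4FakeQuantizedLinear,
)

# fp32 weights, converted to a fake-quantized linear (same calls as the test above)
linear = torch.nn.Linear(128, 512, dtype=torch.float32).cuda()
linear = NVFP4FakeQuantizedLinear.from_linear(
    linear,
    NVFP4FakeQuantizeConfig(use_per_tensor_scale=True),
    NVFP4FakeQuantizeConfig(use_per_tensor_scale=True),
)
optimizer = torch.optim.SGD(linear.parameters(), lr=1e-3)

# bf16 activations; the output should come back in bf16, as the new test asserts
x = torch.randn(4, 128, dtype=torch.bfloat16).cuda()
out = linear(x)
out.float().sum().backward()  # fake-quantized backward populates linear.weight.grad
optimizer.step()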
4 changes: 2 additions & 2 deletions torchao/prototype/qat/nvfp4.py
@@ -53,6 +53,7 @@ def forward(
            per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
        else:
            per_tensor_scale = None
        input_dtype = _input.dtype
        _input = NVFP4Tensor.to_nvfp4(
            _input,
            per_tensor_scale=per_tensor_scale,
@@ -84,7 +85,7 @@ def forward(
            weight.t(),
            None,  # aten_op, not used
            bias,
        )
        ).to(input_dtype)

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
@@ -156,7 +157,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
        fq = _NVFP4QuantizedForwardFakeQuantizedBackward.apply(
Contributor (review comment): should the cast be inside _NVFP4QuantizedForwardFakeQuantizedBackward?

Contributor (author reply): moved it in
            x, self.weight, self.bias, self.activation_config, self.weight_config
        )
        assert fq.dtype == x.dtype
        if batch_size is not None:
            return fq.view(batch_size, -1, fq.shape[-1])
        else:
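Taken together, the two hunks in torchao/prototype/qat/nvfp4.py change _NVFP4QuantizedForwardFakeQuantizedBackward.forward so that the caller's activation dtype is captured before the input is converted to an NVFP4Tensor, and restored on the output of the quantized matmul. Below is a hedged sketch of the resulting flow; anything the diff does not show (the exact signature, the guard around the per-tensor scale, the weight quantization, and the matmul entry point, here given the hypothetical name _nvfp4_mm) is an assumption and marked as such.

# Sketch only: reconstructed from the hunks above, not the verbatim source.
# Assumes the imports already present in torchao/prototype/qat/nvfp4.py
# (torch, NVFP4Tensor, per_tensor_amax_to_scale).
class _NVFP4QuantizedForwardFakeQuantizedBackward(torch.autograd.Function):
    @staticmethod
    def forward(ctx, _input, weight, bias, activation_config, weight_config):  # assumed signature
        if activation_config.use_per_tensor_scale:            # assumed guard around the shown lines
            tensor_amax = torch.max(torch.abs(_input))        # assumed amax computation
            per_tensor_scale = per_tensor_amax_to_scale(tensor_amax)
        else:
            per_tensor_scale = None
        input_dtype = _input.dtype                            # added in this PR: remember the activation dtype
        _input = NVFP4Tensor.to_nvfp4(
            _input,
            per_tensor_scale=per_tensor_scale,
        )
        # ... weight is quantized to NVFP4 here as well, and ctx bookkeeping for the
        # fake-quantized backward happens here (not shown in the hunks) ...
        return _nvfp4_mm(                                     # hypothetical name; the diff shows only the arguments
            _input,
            weight.t(),
            None,  # aten_op, not used
            bias,
        ).to(input_dtype)                                     # added in this PR: cast back to the activation dtype

With the cast inside the autograd Function, NVFP4FakeQuantizedLinear.forward can simply assert fq.dtype == x.dtype, which is what the third hunk and the new mixed-precision test check.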