2 changes: 1 addition & 1 deletion test/dtypes/test_nf4.py
@@ -757,7 +757,7 @@ def world_size(self) -> int:
         return 2

     @skip_if_lt_x_gpu(2)
-    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
+    @unittest.skipIf(not torch.cuda.is_available(), "Need GPU available")
     def test_comm(self):
         self.run_subtests(
             {"input_size": [512, 2048]},
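The nf4 hunk pins the multi-GPU FSDP comm test back to a CUDA-only guard, while the int8 tests below move the other way, from torch.cuda checks to the device-generic torch.accelerator API. A minimal sketch of the two guard styles (the test classes here are hypothetical, not from this PR):

import unittest

import torch

# CUDA-only guard: skips unless a CUDA device is present.
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
class CudaOnlyTest(unittest.TestCase):
    def test_matmul(self):
        a = torch.randn(4, 4, device="cuda")
        self.assertEqual((a @ a).shape, (4, 4))

# Device-generic guard: torch.accelerator.is_available() (recent PyTorch)
# reports whichever accelerator backend is active (CUDA, XPU, MPS, ...).
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
class AnyAcceleratorTest(unittest.TestCase):
    def test_matmul(self):
        a = torch.randn(4, 4, device=torch.accelerator.current_accelerator())
        self.assertEqual((a @ a).shape, (4, 4))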
29 changes: 15 additions & 14 deletions test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -26,7 +26,7 @@
 from torchao.quantization.utils import compute_error, get_block_size
 from torchao.testing.model_architectures import ToyTwoLinearModel
 from torchao.testing.utils import TorchAOIntegrationTestCase
-from torchao.utils import torch_version_at_least
+from torchao.utils import get_current_accelerator_device, torch_version_at_least

 INT8_TEST_CONFIGS = [
     Int8WeightOnlyConfig(version=2, granularity=PerTensor()),
@@ -38,9 +38,10 @@
         version=2, granularity=PerRow(), act_mapping_type=MappingType.SYMMETRIC
     ),
 ]
+_DEVICE = get_current_accelerator_device()


-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @common_utils.instantiate_parametrized_tests
 class TestInt8Tensor(TorchAOIntegrationTestCase):
     def setUp(self):
@@ -60,7 +61,7 @@ def test_creation_and_attributes(self, config):
             self.test_shape[0],
             bias=False,
             dtype=self.dtype,
-            device="cuda",
+            device=_DEVICE,
         )
         quantize_(linear, config)

@@ -99,8 +100,8 @@ def test_int8_linear_variants(
         torch.compiler.reset()

         M, N, K = sizes
-        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
-        model = ToyTwoLinearModel(K, N, K, dtype=dtype, device="cuda").eval()
+        input_tensor = torch.randn(*M, K, dtype=dtype, device=_DEVICE)
+        model = ToyTwoLinearModel(K, N, K, dtype=dtype, device=_DEVICE).eval()
         model_q = copy.deepcopy(model)

         quantize_(model_q, config)
@@ -128,7 +129,7 @@ def test_int8_linear_variants(
         )

     @common_utils.parametrize("config", INT8_TEST_CONFIGS)
-    @common_utils.parametrize("device", ["cpu", "cuda"])
+    @common_utils.parametrize("device", ["cpu", _DEVICE])
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
     def test_slice(self, config, device, dtype):
         """Test tensor slicing with per-row quantization"""
@@ -159,8 +160,8 @@ def test_slice(self, config, device, dtype):
     def test_index_select(self, config):
         """test that `x_0 = x[0]` works when `x` is a 2D quantized tensor."""
         N, K = 256, 512
-        x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
-        linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")
+        x = torch.randn(N, K, device=_DEVICE, dtype=torch.bfloat16)
+        linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device=_DEVICE)
         linear.weight.data = x

         quantize_(linear, config)
@@ -187,7 +188,7 @@ def test_index_select(self, config):
     def test_dequantization_accuracy(self, config):
         """Test dequantization accuracy separately"""
         linear = torch.nn.Linear(
-            256, 512, bias=False, dtype=torch.bfloat16, device="cuda"
+            256, 512, bias=False, dtype=torch.bfloat16, device=_DEVICE
        )
         weight_fp = copy.deepcopy(linear.weight)
         quantize_(linear, config)
@@ -208,14 +209,14 @@ def test_available_gpu_kernels(self):

         M, K, N = 128, 256, 512
         m = torch.nn.Sequential(
-            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
+            torch.nn.Linear(K, N, device=_DEVICE, dtype=torch.bfloat16)
         )

         config = Int8DynamicActivationInt8WeightConfig(version=2)
         quantize_(m, config)

         m = torch.compile(m)
-        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+        x = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)

         out, code = run_and_get_code(m, x)

@@ -248,7 +249,7 @@ def test_pin_memory(self, config):
         )


-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @common_utils.instantiate_parametrized_tests
 class TestInt8StaticQuant(TorchAOIntegrationTestCase):
     @common_utils.parametrize("granularity", [PerRow(), PerTensor()])
@@ -257,9 +258,9 @@ def test_static_activation_per_row_int8_weight(self, granularity, dtype):
         torch.compiler.reset()

         M, N, K = 32, 32, 32
-        input_tensor = torch.randn(M, K, dtype=dtype, device="cuda")
+        input_tensor = torch.randn(M, K, dtype=dtype, device=_DEVICE)

-        model = torch.nn.Linear(K, N, bias=False).eval().to(device="cuda", dtype=dtype)
+        model = torch.nn.Linear(K, N, bias=False).eval().to(device=_DEVICE, dtype=dtype)
         model_static_quant = copy.deepcopy(model)
         model_dynamic_quant = copy.deepcopy(model)
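The int8 tests replace hard-coded "cuda" strings with a module-level _DEVICE resolved once at import time, so parametrized cases such as test_slice run on non-CUDA accelerators without touching each call site. A minimal sketch of what a helper like get_current_accelerator_device could look like, assuming it falls back to CPU when no accelerator is present (torchao's actual implementation may differ):

import torch

def get_current_accelerator_device() -> torch.device:
    # torch.accelerator.current_accelerator() returns the active backend
    # (e.g. cuda, xpu, mps) as a torch.device, or None when no accelerator
    # is available; fall back to CPU in that case.
    device = torch.accelerator.current_accelerator()
    return device if device is not None else torch.device("cpu")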