From 9e2820779de596976f6bc54b14c186edf810c210 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Tue, 2 Dec 2025 09:42:37 +0800
Subject: [PATCH 1/2] extend TorchAoTest::test_model_memory_usage to other platforms

Signed-off-by: Wang, Yi
---
 tests/quantization/torchao/test_torchao.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 38997de17b12..9786eb10a036 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -35,6 +35,7 @@
 from diffusers.quantizers import PipelineQuantizationConfig
 
 from ...testing_utils import (
+    Expectations,
     backend_empty_cache,
     backend_synchronize,
     enable_full_determinism,
@@ -497,8 +498,14 @@ def test_memory_footprint(self):
 
     def test_model_memory_usage(self):
         model_id = "hf-internal-testing/tiny-flux-pipe"
-        expected_memory_saving_ratio = 2.0
-
+        expected_memory_saving_ratios = Expectations(
+            {
+                ("xpu", None): 1.15,
+                ("cuda", 8): 1.02,
+                ("cuda", 9): 2.0,
+            }
+        )
+        expected_memory_saving_ratio = expected_memory_saving_ratios.get_expectation()
         inputs = self.get_dummy_tensor_inputs(device=torch_device)
 
         transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"]

From 029f67995bb4265dc809a6add3c65d11572bcea2 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Wed, 17 Dec 2025 15:04:01 +0800
Subject: [PATCH 2/2] add some comments

Signed-off-by: Wang, Yi
---
 tests/quantization/torchao/test_torchao.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py
index 9786eb10a036..e6bfc2530a5a 100644
--- a/tests/quantization/torchao/test_torchao.py
+++ b/tests/quantization/torchao/test_torchao.py
@@ -500,8 +500,17 @@ def test_model_memory_usage(self):
         model_id = "hf-internal-testing/tiny-flux-pipe"
         expected_memory_saving_ratios = Expectations(
             {
+                # XPU: For this tiny model, per-tensor overheads (alignment, fragmentation, metadata) become visible.
+                # While XPU doesn't have the large fixed cuBLAS workspace of A100, these small overheads prevent reaching the ideal 2.0 ratio.
+                # Observed model-size ratio is ~1.27x (158k vs 124k).
+                # The runtime memory overhead is ~88k for both bf16 and int8wo. Adding this to model size: (158k+88k)/(124k+88k) ≈ 1.15.
                 ("xpu", None): 1.15,
+                # On Ampere, the cuBLAS kernels used for matrix multiplication often allocate a fixed-size workspace.
+                # Since the tiny-flux model weights are likely smaller than or comparable to this workspace, the total memory is dominated by the workspace.
                 ("cuda", 8): 1.02,
+                # On Hopper, TorchAO uses newer, highly optimized kernels (via Triton or CUTLASS 3.x) that are designed to be workspace-free or use negligible extra memory.
+                # Additionally, Triton kernels often handle unaligned memory better, avoiding the padding overhead seen on other backends for tiny tensors.
+                # This allows the near-ideal 2.0x compression ratio to be reached.
                 ("cuda", 9): 2.0,
             }
         )
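
For context, the patches hinge on an Expectations-style lookup keyed on (device type, major compute capability). The sketch below is a minimal, stand-alone approximation of that selection plus the memory-saving-ratio assertion; current_device_key() and peak_memory_mb are hypothetical stand-ins for illustration and are not the real diffusers Expectations API or test utilities.

import torch


def current_device_key():
    """Return a (device_type, major_compute_capability) key for the running accelerator."""
    if torch.cuda.is_available():
        return ("cuda", torch.cuda.get_device_capability()[0])
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return ("xpu", None)
    return ("cpu", None)


# Values taken from the patch above: the ratio of bf16 memory usage to int8
# weight-only (int8wo) memory usage that each backend is expected to reach.
expected_memory_saving_ratios = {
    ("xpu", None): 1.15,
    ("cuda", 8): 1.02,
    ("cuda", 9): 2.0,
}

expected_ratio = expected_memory_saving_ratios.get(current_device_key(), 2.0)

# The test then asserts that quantization actually saves memory at run time:
#   peak_memory_mb(bf16_model) / peak_memory_mb(int8wo_model) >= expected_ratio

On an Ampere card (compute capability 8) this picks 1.02, reflecting the fixed cuBLAS workspace described in the patch comments; on Hopper (9) it picks the near-ideal 2.0, and on XPU the 1.15 value derived from (158k+88k)/(124k+88k).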