From e419ae3e96bacb03a305623cc24cc563f2796067 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:10:16 -0700
Subject: [PATCH 1/7] refactor vq tests

---
 tests/models/autoencoders/test_models_vq.py | 81 ++++++++++++---------
 1 file changed, 48 insertions(+), 33 deletions(-)

diff --git a/tests/models/autoencoders/test_models_vq.py b/tests/models/autoencoders/test_models_vq.py
index b88d24d1f2d8..ce1606f0e859 100644
--- a/tests/models/autoencoders/test_models_vq.py
+++ b/tests/models/autoencoders/test_models_vq.py
@@ -13,43 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import VQModel
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import backend_manual_seed, enable_full_determinism, floats_tensor, torch_device
-from ..test_modeling_common import ModelTesterMixin
-from .testing_utils import AutoencoderTesterMixin
+from ...testing_utils import backend_manual_seed, enable_full_determinism, torch_device
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
 enable_full_determinism()
 
 
-class VQModelTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
-    model_class = VQModel
-    main_input_name = "sample"
-
+class VQModelTesterConfig(BaseModelTesterConfig):
     @property
-    def dummy_input(self, sizes=(32, 32)):
-        batch_size = 4
-        num_channels = 3
-
-        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+    def model_class(self):
+        return VQModel
 
-        return {"sample": image}
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
 
     @property
-    def input_shape(self):
+    def output_shape(self) -> tuple:
         return (3, 32, 32)
 
     @property
-    def output_shape(self):
-        return (3, 32, 32)
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
+    def get_init_dict(self) -> dict:
+        return {
             "block_out_channels": [8, 16],
             "norm_num_groups": 8,
             "in_channels": 3,
@@ -58,24 +54,23 @@ def prepare_init_args_and_inputs_for_common(self):
             "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
             "latent_channels": 3,
         }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    @unittest.skip("Test not supported.")
-    def test_forward_signature(self):
-        pass
+    def get_dummy_inputs(self) -> dict:
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+        image = randn_tensor((batch_size, num_channels, *sizes), generator=self.generator, device=torch_device)
+        return {"sample": image}
 
-    @unittest.skip("Test not supported.")
-    def test_training(self):
-        pass
 
+class TestVQModel(VQModelTesterConfig, ModelTesterMixin):
     def test_from_pretrained_hub(self):
         model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertEqual(len(loading_info["missing_keys"]), 0)
+        assert model is not None
+        assert len(loading_info["missing_keys"]) == 0
 
         model.to(torch_device)
-        image = model(**self.dummy_input)
+        image = model(**self.get_dummy_inputs())
 
         assert image is not None, "Make sure output is not None"
 
@@ -95,7 +90,7 @@ def test_output_pretrained(self):
         # fmt: off
         expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143])
         # fmt: on
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-3)
 
     def test_loss_pretrained(self):
         model = VQModel.from_pretrained("fusing/vqgan-dummy")
@@ -111,4 +106,24 @@ def test_loss_pretrained(self):
         # fmt: off
         expected_output = torch.tensor([0.1936])
         # fmt: on
-        self.assertTrue(torch.allclose(output, expected_output, atol=1e-3))
+        assert torch.allclose(output, expected_output, atol=1e-3)
+
+
+class TestVQModelTraining(VQModelTesterConfig, TrainingTesterMixin):
+    """Training-mixin tests for VQModel; both overrides below are unconditionally skipped."""
+
+    @pytest.mark.skip(reason="Test not supported.")
+    def test_training(self):
+        super().test_training()
+
+    @pytest.mark.skip(reason="Test not supported.")
+    def test_training_with_ema(self):
+        super().test_training_with_ema()
+
+
+class TestVQModelMemory(VQModelTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for VQModel."""
+
+
+class TestVQModelSlicingTiling(VQModelTesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for VQModel."""

From 17f1bdc908538a405e1cbb1a90a162841c83ef39 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:10:16 -0700
Subject: [PATCH 2/7] refactor autoencoder_kl_kvae_video tests

---
 .../test_models_autoencoder_kl_kvae_video.py  | 123 +++++++++---------
 1 file changed, 64 insertions(+), 59 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py b/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
index 7e9eebb87cf4..1e04c6be5a5b 100644
--- a/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
+++ b/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
@@ -13,24 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
+import pytest
+import torch
 
 from diffusers import AutoencoderKLKVAEVideo
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
-from ..test_modeling_common import ModelTesterMixin
-from .testing_utils import AutoencoderTesterMixin
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
 enable_full_determinism()
 
 
-class AutoencoderKLKVAEVideoTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
-    model_class = AutoencoderKLKVAEVideo
-    main_input_name = "sample"
-    base_precision = 1e-2
+def _run_nondeterministic(fn):
+    """Run `fn` with deterministic mode off (no deterministic CUDA reflection_pad3d backward), then restore it."""
+    was_deterministic = torch.are_deterministic_algorithms_enabled()
+    torch.use_deterministic_algorithms(False)
+    try:
+        fn()
+    finally:
+        torch.use_deterministic_algorithms(was_deterministic)
+
+
+class AutoencoderKLKVAEVideoTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AutoencoderKLKVAEVideo
+
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
+
+    @property
+    def output_shape(self) -> tuple:
+        return (3, 3, 16, 16)
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
-    def get_autoencoder_kl_kvae_video_config(self):
+    def get_init_dict(self) -> dict:
         return {
             "ch": 32,
             "ch_mult": (1, 2),
@@ -41,78 +65,59 @@ def get_autoencoder_kl_kvae_video_config(self):
             "temporal_compress_times": 2,
         }
 
-    @property
-    def dummy_input(self):
+    def get_dummy_inputs(self) -> dict:
         batch_size = 2
         num_frames = 3  # satisfies (T-1) % temporal_compress_times == 0 with temporal_compress_times=2
         num_channels = 3
         sizes = (16, 16)
-
-        video = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
-
+        video = randn_tensor(
+            (batch_size, num_channels, num_frames, *sizes), generator=self.generator, device=torch_device
+        )
         return {"sample": video}
 
-    @property
-    def input_shape(self):
-        return (3, 3, 16, 16)
-
-    @property
-    def output_shape(self):
-        return (3, 3, 16, 16)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = self.get_autoencoder_kl_kvae_video_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {
-            "KVAECachedEncoder3D",
-            "KVAECachedDecoder3D",
-        }
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+class TestAutoencoderKLKVAEVideo(AutoencoderKLKVAEVideoTesterConfig, ModelTesterMixin):
+    base_precision = 1e-2
 
-    @unittest.skip("Unsupported test.")
+    @pytest.mark.skip("Unsupported test.")
     def test_outputs_equivalence(self):
-        pass
+        super().test_outputs_equivalence()
 
-    @unittest.skip(
+    @pytest.mark.skip(
         "Multi-GPU inference is not supported due to the stateful cache_dict passing through the forward pass."
     )
     def test_model_parallelism(self):
-        pass
+        super().test_model_parallelism()
 
-    @unittest.skip(
-        "Multi-GPU inference is not supported due to the stateful cache_dict passing through the forward pass."
-    )
-    def test_sharded_checkpoints_device_map(self):
-        pass
 
-    def _run_nondeterministic(self, fn):
-        # reflection_pad3d_backward_out_cuda has no deterministic CUDA implementation;
-        # temporarily relax the requirement for training tests that do backward passes.
-        import torch
+class TestAutoencoderKLKVAEVideoTraining(AutoencoderKLKVAEVideoTesterConfig, TrainingTesterMixin):
+    """Training tests for AutoencoderKLKVAEVideo."""
 
-        torch.use_deterministic_algorithms(False)
-        try:
-            fn()
-        finally:
-            torch.use_deterministic_algorithms(True)
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"KVAECachedEncoder3D", "KVAECachedDecoder3D"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 
     def test_training(self):
-        self._run_nondeterministic(super().test_training)
+        _run_nondeterministic(super().test_training)
 
-    def test_ema_training(self):
-        self._run_nondeterministic(super().test_ema_training)
+    def test_training_with_ema(self):
+        _run_nondeterministic(super().test_training_with_ema)
 
-    @unittest.skip(
+    @pytest.mark.skip(
         "Gradient checkpointing recomputes the forward pass, but the model uses a stateful cache_dict "
         "that is mutated during the first forward. On recomputation the cache is already populated, "
-        "causing a different execution path and numerically different gradients. "
-        "GC still reduces peak memory usage; gradient correctness in the presence of GC is a known limitation."
+        "causing a different execution path and numerically different gradients."
     )
-    def test_effective_gradient_checkpointing(self):
-        pass
+    def test_gradient_checkpointing_equivalence(self):
+        super().test_gradient_checkpointing_equivalence()
 
     def test_layerwise_casting_training(self):
-        self._run_nondeterministic(super().test_layerwise_casting_training)
+        _run_nondeterministic(super().test_layerwise_casting_training)
+
+
+class TestAutoencoderKLKVAEVideoMemory(AutoencoderKLKVAEVideoTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for AutoencoderKLKVAEVideo."""
+
+
+class TestAutoencoderKLKVAEVideoSlicingTiling(AutoencoderKLKVAEVideoTesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for AutoencoderKLKVAEVideo."""

From 331fa74514e77ec2f2f38886c181e4e584f7be2c Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:10:16 -0700
Subject: [PATCH 3/7] refactor autoencoder_oobleck tests

---
 .../test_models_autoencoder_oobleck.py        | 141 ++++++++----------
 1 file changed, 59 insertions(+), 82 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
index d10e8ba33a12..96b70bc29bb4 100644
--- a/tests/models/autoencoders/test_models_autoencoder_oobleck.py
+++ b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
@@ -16,34 +16,47 @@
 import gc
 import unittest
 
+import pytest
 import torch
 from datasets import load_dataset
 from parameterized import parameterized
 
 from diffusers import AutoencoderOobleck
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
-    floats_tensor,
     slow,
     torch_all_close,
     torch_device,
 )
-from ..test_modeling_common import ModelTesterMixin
-from .testing_utils import AutoencoderTesterMixin
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
 enable_full_determinism()
 
 
-class AutoencoderOobleckTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
-    model_class = AutoencoderOobleck
-    main_input_name = "sample"
-    base_precision = 1e-2
+class AutoencoderOobleckTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AutoencoderOobleck
+
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
+
+    @property
+    def output_shape(self) -> tuple:
+        return (2, 24)
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
-    def get_autoencoder_oobleck_config(self, block_out_channels=None):
-        init_dict = {
+    def get_init_dict(self) -> dict:
+        return {
             "encoder_hidden_size": 12,
             "decoder_channels": 12,
             "decoder_input_channels": 6,
@@ -51,33 +64,46 @@ def get_autoencoder_oobleck_config(self, block_out_channels=None):
             "downsampling_ratios": [2, 4],
             "channel_multiples": [1, 2],
         }
-        return init_dict
 
-    @property
-    def dummy_input(self):
+    def get_dummy_inputs(self) -> dict:
         batch_size = 4
         num_channels = 2
         seq_len = 24
+        waveform = randn_tensor((batch_size, num_channels, seq_len), generator=self.generator, device=torch_device)
+        return {"sample": waveform, "sample_posterior": False}
 
-        waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device)
 
-        return {"sample": waveform, "sample_posterior": False}
+class TestAutoencoderOobleck(AutoencoderOobleckTesterConfig, ModelTesterMixin):
+    base_precision = 1e-2
 
-    @property
-    def input_shape(self):
-        return (2, 24)
 
-    @property
-    def output_shape(self):
-        return (2, 24)
+class TestAutoencoderOobleckTraining(AutoencoderOobleckTesterConfig, TrainingTesterMixin):
+    """Training tests for AutoencoderOobleck."""
+
+
+class TestAutoencoderOobleckMemory(AutoencoderOobleckTesterConfig, MemoryTesterMixin):
+    """Memory-mixin tests for AutoencoderOobleck; the two layerwise-casting overrides below are skipped."""
+
+    @pytest.mark.skip(
+        reason="Test not supported because of 'weight_norm_fwd_first_dim_kernel' not implemented for 'Float8_e4m3fn'"
+    )
+    def test_layerwise_casting_training(self):
+        super().test_layerwise_casting_training()
+
+    @pytest.mark.skip(
+        reason="The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. "
+        "This causes the hook's pre_forward to not cast the module weights to compute_dtype."
+    )
+    def test_layerwise_casting_memory(self):
+        super().test_layerwise_casting_memory()
+
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = self.get_autoencoder_oobleck_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
+class TestAutoencoderOobleckSlicingTiling(AutoencoderOobleckTesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for AutoencoderOobleck."""
 
     def test_enable_disable_slicing(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict).to(torch_device)
@@ -91,55 +117,23 @@ def test_enable_disable_slicing(self):
         model.enable_slicing()
         output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertLess(
-            (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(),
-            0.5,
-            "VAE slicing should not affect the inference results",
-        )
+        assert (
+            output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()
+        ).max() < 0.5, "VAE slicing should not affect the inference results"
 
         torch.manual_seed(0)
         model.disable_slicing()
         output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertEqual(
-            output_without_slicing.detach().cpu().numpy().all(),
-            output_without_slicing_2.detach().cpu().numpy().all(),
-            "Without slicing outputs should match with the outputs when slicing is manually disabled.",
-        )
-
-    @unittest.skip("No attention module used in this model")
-    def test_set_attn_processor_for_determinism(self):
-        return
-
-    @unittest.skip(
-        "Test not supported because of 'weight_norm_fwd_first_dim_kernel' not implemented for 'Float8_e4m3fn'"
-    )
-    def test_layerwise_casting_training(self):
-        return super().test_layerwise_casting_training()
-
-    @unittest.skip(
-        "The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. This causes the hook's pre_forward to not "
-        "cast the module weights to compute_dtype (as required by forward pass). As a result, forward pass errors out. To fix:\n"
-        "1. Make sure `nn::Module::to` works with `torch.nn.utils.weight_norm` wrapped convolution layer.\n"
-        "2. Unskip this test."
-    )
-    def test_layerwise_casting_inference(self):
-        pass
-
-    @unittest.skip(
-        "The convolution layers of AutoencoderOobleck are wrapped with torch.nn.utils.weight_norm. This causes the hook's pre_forward to not "
-        "cast the module weights to compute_dtype (as required by forward pass). As a result, forward pass errors out. To fix:\n"
-        "1. Make sure `nn::Module::to` works with `torch.nn.utils.weight_norm` wrapped convolution layer.\n"
-        "2. Unskip this test."
-    )
-    def test_layerwise_casting_memory(self):
-        pass
+        assert (
+            output_without_slicing.detach().cpu().numpy().all()
+            == output_without_slicing_2.detach().cpu().numpy().all()
+        ), "Without slicing outputs should match with the outputs when slicing is manually disabled."
 
 
 @slow
 class AutoencoderOobleckIntegrationTests(unittest.TestCase):
     def tearDown(self):
-        # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
         backend_empty_cache(torch_device)
@@ -148,9 +142,7 @@ def _load_datasamples(self, num_samples):
         ds = load_dataset(
             "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
         )
-        # automatic decoding with librispeech
         speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
         return torch.nn.utils.rnn.pad_sequence(
             [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True
         )
@@ -158,25 +150,14 @@ def _load_datasamples(self, num_samples):
     def get_audio(self, audio_sample_size=2097152, fp16=False):
         dtype = torch.float16 if fp16 else torch.float32
         audio = self._load_datasamples(2).to(torch_device).to(dtype)
-
-        # pad / crop to audio_sample_size
         audio = torch.nn.functional.pad(audio[:, :audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1]))
-
-        # todo channel
         audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device)
-
         return audio
 
     def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False):
         torch_dtype = torch.float16 if fp16 else torch.float32
-
-        model = AutoencoderOobleck.from_pretrained(
-            model_id,
-            subfolder="vae",
-            torch_dtype=torch_dtype,
-        )
+        model = AutoencoderOobleck.from_pretrained(model_id, subfolder="vae", torch_dtype=torch_dtype)
         model.to(torch_device)
-
         return model
 
     def get_generator(self, seed=0):
@@ -206,7 +187,6 @@ def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_dif
 
         output_slice = sample[-1, 1, 5:10].cpu()
         expected_output_slice = torch.tensor(expected_slice)
-
         assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)
 
     def test_stable_diffusion_mode(self):
@@ -237,13 +217,10 @@ def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mea
             z = posterior.sample(generator=generator)
             sample = model.decode(z).sample
 
-        # (batch_size, latent_dim, sequence_length)
         assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024)
-
         assert sample.shape == audio.shape
         assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6
 
         output_slice = sample[-1, 1, 5:10].cpu()
         expected_output_slice = torch.tensor(expected_slice)
-
         assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)

From cae36fb9477c5c54d1874255e9283275fcfda365 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:10:16 -0700
Subject: [PATCH 4/7] refactor consistency_decoder_vae tests

---
 .../test_models_consistency_decoder_vae.py    | 86 +++++++++----------
 1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
index ef04d151ecd1..ef37797bad12 100644
--- a/tests/models/autoencoders/test_models_consistency_decoder_vae.py
+++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -30,22 +30,33 @@
     torch_all_close,
     torch_device,
 )
-from ..test_modeling_common import ModelTesterMixin
-from .testing_utils import AutoencoderTesterMixin
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
 enable_full_determinism()
 
 
-class ConsistencyDecoderVAETests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
-    model_class = ConsistencyDecoderVAE
-    main_input_name = "sample"
-    base_precision = 1e-2
-    forward_requires_fresh_args = True
+class ConsistencyDecoderVAETesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return ConsistencyDecoderVAE
+
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
+
+    @property
+    def output_shape(self) -> tuple:
+        return (3, 32, 32)
 
-    def get_consistency_vae_config(self, block_out_channels=None, norm_num_groups=None):
-        block_out_channels = block_out_channels or [2, 4]
-        norm_num_groups = norm_num_groups or 2
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict:
+        block_out_channels = [2, 4]
+        norm_num_groups = 2
         return {
             "encoder_block_out_channels": block_out_channels,
             "encoder_in_channels": 3,
@@ -69,48 +80,43 @@ def get_consistency_vae_config(self, block_out_channels=None, norm_num_groups=No
             "latent_channels": 4,
         }
 
-    def inputs_dict(self, seed=None):
-        if seed is None:
-            generator = torch.Generator("cpu").manual_seed(0)
-        else:
-            generator = torch.Generator("cpu").manual_seed(seed)
-        image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device))
-
+    def get_dummy_inputs(self) -> dict:
+        generator = torch.Generator("cpu").manual_seed(0)
+        image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch_device)
         return {"sample": image, "generator": generator}
 
-    @property
-    def input_shape(self):
-        return (3, 32, 32)
 
-    @property
-    def output_shape(self):
-        return (3, 32, 32)
+class TestConsistencyDecoderVAE(ConsistencyDecoderVAETesterConfig, ModelTesterMixin):
+    base_precision = 1e-2
+
+
+class TestConsistencyDecoderVAETraining(ConsistencyDecoderVAETesterConfig, TrainingTesterMixin):
+    """Training tests for ConsistencyDecoderVAE."""
 
-    @property
-    def init_dict(self):
-        return self.get_consistency_vae_config()
 
-    def prepare_init_args_and_inputs_for_common(self):
-        return self.init_dict, self.inputs_dict()
+class TestConsistencyDecoderVAEMemory(ConsistencyDecoderVAETesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for ConsistencyDecoderVAE."""
+
+
+class TestConsistencyDecoderVAESlicingTiling(ConsistencyDecoderVAETesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for ConsistencyDecoderVAE."""
 
 
 @slow
 class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
     def setUp(self):
-        # clean up the VRAM before each test
         super().setUp()
         gc.collect()
         backend_empty_cache(torch_device)
 
     def tearDown(self):
-        # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
         backend_empty_cache(torch_device)
 
     @torch.no_grad()
     def test_encode_decode(self):
-        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")
         vae.to(torch_device)
 
         image = load_image(
@@ -122,16 +128,14 @@ def test_encode_decode(self):
         )
 
         latent = vae.encode(image).latent_dist.mean
-
         sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
 
         actual_output = sample[0, :2, :2, :2].flatten().cpu()
         expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024])
-
         assert torch_all_close(actual_output, expected_output, atol=5e-3)
 
     def test_sd(self):
-        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")  # TODO - update
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder")
         pipe = StableDiffusionPipeline.from_pretrained(
             "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None
         )
@@ -146,13 +150,10 @@ def test_sd(self):
 
         actual_output = out[:2, :2, :2].flatten().cpu()
         expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759])
-
         assert torch_all_close(actual_output, expected_output, atol=5e-3)
 
     def test_encode_decode_f16(self):
-        vae = ConsistencyDecoderVAE.from_pretrained(
-            "openai/consistency-decoder", torch_dtype=torch.float16
-        )  # TODO - update
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
         vae.to(torch_device)
 
         image = load_image(
@@ -166,7 +167,6 @@ def test_encode_decode_f16(self):
         )
 
         latent = vae.encode(image).latent_dist.mean
-
         sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample
 
         actual_output = sample[0, :2, :2, :2].flatten().cpu()
@@ -174,13 +174,10 @@ def test_encode_decode_f16(self):
             [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471],
             dtype=torch.float16,
         )
-
         assert torch_all_close(actual_output, expected_output, atol=5e-3)
 
     def test_sd_f16(self):
-        vae = ConsistencyDecoderVAE.from_pretrained(
-            "openai/consistency-decoder", torch_dtype=torch.float16
-        )  # TODO - update
+        vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
         pipe = StableDiffusionPipeline.from_pretrained(
             "stable-diffusion-v1-5/stable-diffusion-v1-5",
             torch_dtype=torch.float16,
@@ -201,7 +198,6 @@ def test_sd_f16(self):
             [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035],
             dtype=torch.float16,
         )
-
         assert torch_all_close(actual_output, expected_output, atol=5e-3)
 
     def test_vae_tiling(self):
@@ -219,7 +215,6 @@ def test_vae_tiling(self):
             generator=torch.Generator("cpu").manual_seed(0),
         ).images[0]
 
-        # make sure tiled vae decode yields the same result
         pipe.enable_vae_tiling()
         out_2 = pipe(
             "horse",
@@ -230,7 +225,6 @@ def test_vae_tiling(self):
 
         assert torch_all_close(out_1, out_2, atol=5e-3)
 
-        # test that tiled decode works with various shapes
         shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
         with torch.no_grad():
             for shape in shapes:

From 3168001cc67fae70fa1070639e7f1aecae2d849b Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:10:16 -0700
Subject: [PATCH 5/7] refactor autoencoder_tiny tests

---
 .../test_models_autoencoder_tiny.py           | 128 ++++++++----------
 1 file changed, 57 insertions(+), 71 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py
index 68232aa12fdf..741a31a15f65 100644
--- a/tests/models/autoencoders/test_models_autoencoder_tiny.py
+++ b/tests/models/autoencoders/test_models_autoencoder_tiny.py
@@ -17,35 +17,48 @@
 import gc
 import unittest
 
+import pytest
 import torch
 from parameterized import parameterized
 
 from diffusers import AutoencoderTiny
+from diffusers.utils.torch_utils import randn_tensor
 
 from ...testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
-    floats_tensor,
     load_hf_numpy,
     slow,
     torch_all_close,
     torch_device,
 )
-from ..test_modeling_common import ModelTesterMixin
-from .testing_utils import AutoencoderTesterMixin
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
 enable_full_determinism()
 
 
-class AutoencoderTinyTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
-    model_class = AutoencoderTiny
-    main_input_name = "sample"
-    base_precision = 1e-2
+class AutoencoderTinyTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AutoencoderTiny
+
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
+
+    @property
+    def output_shape(self) -> tuple:
+        return (3, 32, 32)
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
-    def get_autoencoder_tiny_config(self, block_out_channels=None):
-        block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32]
-        init_dict = {
+    def get_init_dict(self) -> dict:
+        block_out_channels = [32, 32]
+        return {
             "in_channels": 3,
             "out_channels": 3,
             "encoder_block_out_channels": block_out_channels,
@@ -53,53 +66,36 @@ def get_autoencoder_tiny_config(self, block_out_channels=None):
             "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels],
             "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)],
         }
-        return init_dict
 
-    @property
-    def dummy_input(self):
+    def get_dummy_inputs(self) -> dict:
         batch_size = 4
         num_channels = 3
         sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
-
+        image = randn_tensor((batch_size, num_channels, *sizes), generator=self.generator, device=torch_device)
         return {"sample": image}
 
-    @property
-    def input_shape(self):
-        return (3, 32, 32)
-
-    @property
-    def output_shape(self):
-        return (3, 32, 32)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = self.get_autoencoder_tiny_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
 
-    @unittest.skip("Model doesn't yet support smaller resolution.")
-    def test_enable_disable_tiling(self):
-        pass
+class TestAutoencoderTiny(AutoencoderTinyTesterConfig, ModelTesterMixin):
+    base_precision = 1e-2
 
-    @unittest.skip("Test not supported.")
+    @pytest.mark.skip("Test not supported.")
     def test_outputs_equivalence(self):
-        pass
+        super().test_outputs_equivalence()
 
-    @unittest.skip("Test not supported.")
-    def test_forward_with_norm_groups(self):
-        pass
+
+class TestAutoencoderTinyTraining(AutoencoderTinyTesterConfig, TrainingTesterMixin):
+    """Training tests for AutoencoderTiny."""
 
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"DecoderTiny", "EncoderTiny"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
 
-    def test_effective_gradient_checkpointing(self):
+    def test_gradient_checkpointing_equivalence(self):
         if not self.model_class._supports_gradient_checkpointing:
-            return  # Skip test if model does not support gradient checkpointing
+            pytest.skip("Model does not support gradient checkpointing.")  # report SKIPPED, not a silent pass
 
-        # enable deterministic behavior for gradient checkpointing
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
         inputs_dict_copy = copy.deepcopy(inputs_dict)
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
@@ -108,18 +104,13 @@ def test_effective_gradient_checkpointing(self):
         assert not model.is_gradient_checkpointing and model.training
 
         out = model(**inputs_dict).sample
-        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
-        # we won't calculate the loss and rather backprop on out.sum()
         model.zero_grad()
-
         labels = torch.randn_like(out)
         loss = (out - labels).mean()
         loss.backward()
 
-        # re-instantiate the model now enabling gradient checkpointing
         torch.manual_seed(0)
         model_2 = self.model_class(**init_dict)
-        # clone model
         model_2.load_state_dict(model.state_dict())
         model_2.to(torch_device)
         model_2.enable_gradient_checkpointing()
@@ -127,43 +118,45 @@ def test_effective_gradient_checkpointing(self):
         assert model_2.is_gradient_checkpointing and model_2.training
 
         out_2 = model_2(**inputs_dict_copy).sample
-        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
-        # we won't calculate the loss and rather backprop on out.sum()
         model_2.zero_grad()
         loss_2 = (out_2 - labels).mean()
         loss_2.backward()
 
-        # compare the output and parameters gradients
-        self.assertTrue((loss - loss_2).abs() < 1e-3)
+        assert (loss - loss_2).abs() < 1e-3
         named_params = dict(model.named_parameters())
         named_params_2 = dict(model_2.named_parameters())
-
         for name, param in named_params.items():
             if "encoder.layers" in name:
                 continue
-            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=3e-2))
+            assert torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=3e-2)
 
-    @unittest.skip(
-        "The forward pass of AutoencoderTiny creates a torch.float32 tensor. This causes inference in compute_dtype=torch.bfloat16 to fail. To fix:\n"
-        "1. Change the forward pass to be dtype agnostic.\n"
-        "2. Unskip this test."
-    )
-    def test_layerwise_casting_inference(self):
-        pass
 
-    @unittest.skip(
-        "The forward pass of AutoencoderTiny creates a torch.float32 tensor. This causes inference in compute_dtype=torch.bfloat16 to fail. To fix:\n"
-        "1. Change the forward pass to be dtype agnostic.\n"
-        "2. Unskip this test."
+class TestAutoencoderTinyMemory(AutoencoderTinyTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for AutoencoderTiny."""
+
+    @pytest.mark.skip(
+        "The forward pass of AutoencoderTiny creates a torch.float32 tensor. "
+        "This causes inference in compute_dtype=torch.bfloat16 to fail."
     )
     def test_layerwise_casting_memory(self):
-        pass
+        super().test_layerwise_casting_memory()
+
+
+class TestAutoencoderTinySlicingTiling(AutoencoderTinyTesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for AutoencoderTiny."""
+
+    @pytest.mark.skip("Model does not yet support smaller resolution.")
+    def test_enable_disable_tiling(self):
+        super().test_enable_disable_tiling()
+
+    @pytest.mark.skip("Test not supported.")
+    def test_forward_with_norm_groups(self):
+        super().test_forward_with_norm_groups()
 
 
 @slow
 class AutoencoderTinyIntegrationTests(unittest.TestCase):
     def tearDown(self):
-        # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
         backend_empty_cache(torch_device)
@@ -178,7 +171,6 @@ def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
 
     def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False):
         torch_dtype = torch.float16 if fp16 else torch.float32
-
         model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype)
         model.to(torch_device).eval()
         return model
@@ -211,26 +203,20 @@ def test_stable_diffusion(self):
 
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382])
-
         assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
 
     @parameterized.expand([(True,), (False,)])
     def test_tae_roundtrip(self, enable_tiling):
-        # load the autoencoder
         model = self.get_sd_vae_model()
         if enable_tiling:
             model.enable_tiling()
 
-        # make a black image with a white square in the middle,
-        # which is large enough to split across multiple tiles
         image = -torch.ones(1, 3, 1024, 1024, device=torch_device)
         image[..., 256:768, 256:768] = 1.0
 
-        # round-trip the image through the autoencoder
         with torch.no_grad():
             sample = model(image).sample
 
-        # the autoencoder reconstruction should match original image, sorta
         def downscale(x):
             return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor)
 

From ce58c5ba1ae4952853a0866ba833f5a6dbe27bcd Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Sun, 31 May 2026 12:18:26 -0700
Subject: [PATCH 6/7] refactor autoencoder_vidtok tests

---
 .../test_models_autoencoder_vidtok.py         | 145 +++++++++---------
 1 file changed, 75 insertions(+), 70 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_vidtok.py b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
index 70932f2b55aa..0b1bd49b5ba7 100644
--- a/tests/models/autoencoders/test_models_autoencoder_vidtok.py
+++ b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
@@ -13,26 +13,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
+import pytest
 import torch
 
 from diffusers import AutoencoderVidTok
-from diffusers.utils.testing_utils import (
-    floats_tensor,
-    torch_device,
-)
+from diffusers.utils.torch_utils import randn_tensor
 
-from ...testing_utils import IS_GITHUB_ACTIONS
-from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
+from ...testing_utils import IS_GITHUB_ACTIONS, enable_full_determinism, torch_device
+from ..testing_utils import BaseModelTesterConfig, MemoryTesterMixin, ModelTesterMixin, TrainingTesterMixin
+from .testing_utils import NewAutoencoderTesterMixin
 
 
-class AutoencoderVidTokTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
-    model_class = AutoencoderVidTok
-    main_input_name = "sample"
-    base_precision = 1e-2
+enable_full_determinism()
+
+
+class AutoencoderVidTokTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AutoencoderVidTok
+
+    @property
+    def main_input_name(self) -> str:
+        return "sample"
+
+    @property
+    def output_shape(self) -> tuple:
+        return (3, 16, 32, 32)
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
 
-    def get_autoencoder_vidtok_config(self):
+    def get_init_dict(self) -> dict:
         return {
             "is_causal": False,
             "in_channels": 3,
@@ -46,32 +58,47 @@ def get_autoencoder_vidtok_config(self):
             "codebook_size": 262144,
         }
 
-    @property
-    def dummy_input(self):
+    def get_dummy_inputs(self) -> dict:
         batch_size = 4
         num_frames = 16
         num_channels = 3
         sizes = (32, 32)
+        image = randn_tensor(
+            (batch_size, num_channels, num_frames, *sizes), generator=self.generator, device=torch_device
+        )
+        return {"sample": image}
 
-        image = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
 
-        return {"sample": image}
+class TestAutoencoderVidTok(AutoencoderVidTokTesterConfig, ModelTesterMixin):
+    base_precision = 1e-2
 
-    @property
-    def input_shape(self):
-        return (3, 16, 32, 32)
+    @pytest.mark.skip("Unsupported test.")
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
 
-    @property
-    def output_shape(self):
-        return (3, 16, 32, 32)
 
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = self.get_autoencoder_vidtok_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
+class TestAutoencoderVidTokTraining(AutoencoderVidTokTesterConfig, TrainingTesterMixin):
+    """Training tests for AutoencoderVidTok."""
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"VidTokEncoder3D", "VidTokDecoder3D"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @pytest.mark.skipif(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
+    def test_layerwise_casting_training(self):
+        super().test_layerwise_casting_training()
+
+
+class TestAutoencoderVidTokMemory(AutoencoderVidTokTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for AutoencoderVidTok."""
+
+
+class TestAutoencoderVidTokSlicingTiling(AutoencoderVidTokTesterConfig, NewAutoencoderTesterMixin):
+    """Slicing and tiling tests for AutoencoderVidTok."""
 
     def test_enable_disable_tiling(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict).to(torch_device)
@@ -83,28 +110,24 @@ def test_enable_disable_tiling(self):
         model.enable_tiling()
         output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertLess(
-            (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(),
-            0.5,
-            "VAE tiling should not affect the inference results",
-        )
+        assert (
+            (output_without_tiling - output_with_tiling).abs().max().item() < 0.5  # magnitude, not signed max
+        ), "VAE tiling should not affect the inference results"
 
         torch.manual_seed(0)
         model.disable_tiling()
         output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertEqual(
-            output_without_tiling.detach().cpu().numpy().all(),
-            output_without_tiling_2.detach().cpu().numpy().all(),
-            "Without tiling outputs should match with the outputs when tiling is manually disabled.",
-        )
+        assert torch.allclose(  # element-wise comparison of the two untiled outputs
+            output_without_tiling, output_without_tiling_2
+        ), "Without tiling outputs should match with the outputs when tiling is manually disabled."
 
     def test_enable_disable_slicing(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict).to(torch_device)
-
         inputs_dict.update({"return_dict": False})
 
         torch.manual_seed(0)
@@ -114,50 +137,32 @@ def test_enable_disable_slicing(self):
         model.enable_slicing()
         output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertLess(
-            (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(),
-            0.5,
-            "VAE slicing should not affect the inference results",
-        )
+        assert (
+            (output_without_slicing - output_with_slicing).abs().max().item() < 0.5  # magnitude, not signed max
+        ), "VAE slicing should not affect the inference results"
 
         torch.manual_seed(0)
         model.disable_slicing()
         output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0]
 
-        self.assertEqual(
-            output_without_slicing.detach().cpu().numpy().all(),
-            output_without_slicing_2.detach().cpu().numpy().all(),
-            "Without slicing outputs should match with the outputs when slicing is manually disabled.",
-        )
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {
-            "VidTokEncoder3D",
-            "VidTokDecoder3D",
-        }
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+        assert torch.allclose(  # element-wise comparison of the two unsliced outputs
+            output_without_slicing,
+            output_without_slicing_2,
+        ), "Without slicing outputs should match when slicing is manually disabled."
 
     def test_forward_with_norm_groups(self):
-        r"""VidTok uses layernorm instead of groupnorm."""
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        """VidTok uses layernorm instead of groupnorm."""
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
         model = self.model_class(**init_dict)
         model.to(torch_device)
         model.eval()
 
         with torch.no_grad():
             output = model(**inputs_dict)
-
             if isinstance(output, dict):
                 output = output.to_tuple()[0]
 
-        self.assertIsNotNone(output)
+        assert output is not None
         expected_shape = inputs_dict["sample"].shape
-        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
-
-    @unittest.skip("Unsupported test.")
-    def test_outputs_equivalence(self):
-        pass
-
-    @unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
-    def test_layerwise_casting_training(self):
-        super().test_layerwise_casting_training()
+        assert output.shape == expected_shape, "Input and output shapes do not match"

From e531f623acafe19af91090182a47b1931b963f9b Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshankrithick305@gmail.com>
Date: Mon, 1 Jun 2026 19:15:51 -0700
Subject: [PATCH 7/7] remove unused base_precision and test_outputs_equivalence
 skips

---
 .../autoencoders/test_models_autoencoder_kl_kvae_video.py   | 6 ------
 .../models/autoencoders/test_models_autoencoder_oobleck.py  | 2 +-
 tests/models/autoencoders/test_models_autoencoder_tiny.py   | 6 +-----
 tests/models/autoencoders/test_models_autoencoder_vidtok.py | 4 +---
 .../autoencoders/test_models_consistency_decoder_vae.py     | 2 +-
 5 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py b/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
index 1e04c6be5a5b..d810145c4da8 100644
--- a/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
+++ b/tests/models/autoencoders/test_models_autoencoder_kl_kvae_video.py
@@ -77,12 +77,6 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderKLKVAEVideo(AutoencoderKLKVAEVideoTesterConfig, ModelTesterMixin):
-    base_precision = 1e-2
-
-    @pytest.mark.skip("Unsupported test.")
-    def test_outputs_equivalence(self):
-        super().test_outputs_equivalence()
-
     @pytest.mark.skip(
         "Multi-GPU inference is not supported due to the stateful cache_dict passing through the forward pass."
     )
diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
index 96b70bc29bb4..ccd67bc7c704 100644
--- a/tests/models/autoencoders/test_models_autoencoder_oobleck.py
+++ b/tests/models/autoencoders/test_models_autoencoder_oobleck.py
@@ -74,7 +74,7 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderOobleck(AutoencoderOobleckTesterConfig, ModelTesterMixin):
-    base_precision = 1e-2
+    pass
 
 
 class TestAutoencoderOobleckTraining(AutoencoderOobleckTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py
index 741a31a15f65..7fdab4aeb910 100644
--- a/tests/models/autoencoders/test_models_autoencoder_tiny.py
+++ b/tests/models/autoencoders/test_models_autoencoder_tiny.py
@@ -76,11 +76,7 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderTiny(AutoencoderTinyTesterConfig, ModelTesterMixin):
-    base_precision = 1e-2
-
-    @pytest.mark.skip("Test not supported.")
-    def test_outputs_equivalence(self):
-        super().test_outputs_equivalence()
+    pass
 
 
 class TestAutoencoderTinyTraining(AutoencoderTinyTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/autoencoders/test_models_autoencoder_vidtok.py b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
index 0b1bd49b5ba7..eb2863121a21 100644
--- a/tests/models/autoencoders/test_models_autoencoder_vidtok.py
+++ b/tests/models/autoencoders/test_models_autoencoder_vidtok.py
@@ -70,9 +70,7 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderVidTok(AutoencoderVidTokTesterConfig, ModelTesterMixin):
-    base_precision = 1e-2
-
-    @pytest.mark.skip("Unsupported test.")
+    @pytest.mark.skip("VidTok output structure not compatible with recursive output check.")
     def test_outputs_equivalence(self):
         super().test_outputs_equivalence()
 
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
index ef37797bad12..0edb713d9a1f 100644
--- a/tests/models/autoencoders/test_models_consistency_decoder_vae.py
+++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -87,7 +87,7 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestConsistencyDecoderVAE(ConsistencyDecoderVAETesterConfig, ModelTesterMixin):
-    base_precision = 1e-2
+    pass
 
 
 class TestConsistencyDecoderVAETraining(ConsistencyDecoderVAETesterConfig, TrainingTesterMixin):