From 4d6a891baf2224cfa81bfe7632cf08be50812216 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Thu, 10 Aug 2023 14:56:13 +0000 Subject: [PATCH] Remove `set_default_dtype` from nn tests (#105775) Part of #68972 Pull Request resolved: https://github.com/pytorch/pytorch/pull/105775 Approved by: https://github.com/ezyang --- test/test_jit.py | 175 ++-- test/test_nn.py | 265 +++--- torch/testing/_internal/common_nn.py | 846 +++++++++++++------- torch/testing/_internal/hypothesis_utils.py | 6 +- 4 files changed, 817 insertions(+), 475 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index df9bc735ee1721..03fc3679e2c3d6 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -16106,6 +16106,10 @@ def do_test(self): if not kwargs.get('check_jit', True): raise unittest.SkipTest('module test skipped on JIT') + default_dtype = torch.get_default_dtype() + if 'default_dtype' in kwargs and kwargs['default_dtype'] is not None: + default_dtype = kwargs['default_dtype'] + module_name = get_nn_module_name_from_kwargs(**kwargs) if 'constructor' in kwargs: @@ -16116,89 +16120,96 @@ def do_test(self): if "FunctionalModule" in str(nn_module): return - if 'constructor_args_fn' in kwargs: - constructor_args = kwargs['constructor_args_fn']() - else: - constructor_args = kwargs.get('constructor_args', ()) - - def create_script_module(*args, **kwargs): - """Construct a script module that passes arguments through to self.submodule""" - formals, tensors, actuals = get_script_args(args) - - method_args = ', '.join(['self'] + actuals) - call_args_str = ', '.join(actuals) - call = f"self.submodule({call_args_str})" - script = script_method_template.format(method_args, call) - - submodule_constants = [] - if kwargs.get('is_constant'): - submodule_constants = ['submodule'] - - # Create module to use the script method - class TheModule(torch.jit.ScriptModule): - __constants__ = submodule_constants + with set_default_dtype(default_dtype): + if 'constructor_args_fn' in kwargs: + constructor_args = kwargs['constructor_args_fn']() + else: + constructor_args = kwargs.get('constructor_args', ()) + + def create_script_module(*args, **kwargs): + """Construct a script module that passes arguments through to self.submodule""" + formals, tensors, actuals = get_script_args(args) + + method_args = ', '.join(['self'] + actuals) + call_args_str = ', '.join(actuals) + call = f"self.submodule({call_args_str})" + script = script_method_template.format(method_args, call) + + submodule_constants = [] + if kwargs.get('is_constant'): + submodule_constants = ['submodule'] + + # Create module to use the script method + class TheModule(torch.jit.ScriptModule): + __constants__ = submodule_constants + + def __init__(self): + super().__init__() + self.submodule = nn_module(*constructor_args) + + def make_module(script): + module = TheModule() + # check __repr__ + str(module) + module.define(script) + return module + + module = make_module(script) + self.assertExportImportModule(module, tensors) + create_script_module.last_graph = module.graph + mod = module(*args) + return mod + + # Construct a normal nn module to stay consistent with create_script_module + # and make use of a single global rng_state in module initialization + def create_nn_module(*args, **kwargs): + module = nn_module(*constructor_args) + return module(*args) + + # Set up inputs from tuple of sizes or constructor fn + dtype = torch.get_default_dtype() + if 'input_fn' in kwargs: + input = kwargs['input_fn']() + if isinstance(input, Tensor): + input = (input,) + + if 
all(tensor.is_complex() for tensor in input): + if dtype == torch.float: + dtype = torch.cfloat + elif dtype == torch.double: + dtype = torch.cdouble + else: + raise AssertionError(f"default_dtype {default_dtype} is not supported") - def __init__(self): - super().__init__() - self.submodule = nn_module(*constructor_args) - - def make_module(script): - module = TheModule() - # check __repr__ - str(module) - module.define(script) - return module - - module = make_module(script) - self.assertExportImportModule(module, tensors) - create_script_module.last_graph = module.graph - mod = module(*args) - return mod - - # Construct a normal nn module to stay consistent with create_script_module - # and make use of a single global rng_state in module initialization - def create_nn_module(*args, **kwargs): - module = nn_module(*constructor_args) - return module(*args) - - # Set up inputs from tuple of sizes or constructor fn - dtype = torch.float - if 'input_fn' in kwargs: - input = kwargs['input_fn']() - if isinstance(input, Tensor): - input = (input,) - - if all(tensor.is_complex() for tensor in input): - dtype = torch.cfloat - else: - input = (kwargs['input_size'],) - - if 'target_size' in kwargs: - input = input + (kwargs['target_size'],) - elif 'target_fn' in kwargs: - if torch.is_tensor(input): - input = (input,) - input = input + (kwargs['target_fn'](),) - elif 'target' in kwargs: - input = input + (kwargs['target'],) - - # Extra parameters to forward() - if 'extra_args' in kwargs: - input = input + kwargs['extra_args'] - - args_variable, kwargs_variable = create_input(input, dtype=dtype) - f_args_variable = deepcopy(unpack_variables(args_variable)) - - # TODO(issue#52052) Neither this nor no_grad should be required - # if check_against_reference() is updated to check gradients - # w.r.t. weights and then only check w.r.t. inputs if any - # inputs require it. - any_requires_grad = any(input.requires_grad for input in f_args_variable) - - # Check against Python module as reference - check_against_reference(self, create_script_module, create_nn_module, - lambda x: x, f_args_variable, - no_grad=no_grad or not any_requires_grad) + else: + input = (kwargs['input_size'],) + + if 'target_size' in kwargs: + input = input + (kwargs['target_size'],) + elif 'target_fn' in kwargs: + if torch.is_tensor(input): + input = (input,) + input = input + (kwargs['target_fn'](),) + elif 'target' in kwargs: + input = input + (kwargs['target'],) + + # Extra parameters to forward() + if 'extra_args' in kwargs: + input = input + kwargs['extra_args'] + + args_variable, kwargs_variable = create_input(input, dtype=dtype) + f_args_variable = deepcopy(unpack_variables(args_variable)) + + # TODO(issue#52052) Neither this nor no_grad should be required + # if check_against_reference() is updated to check gradients + # w.r.t. weights and then only check w.r.t. inputs if any + # inputs require it. 
+ any_requires_grad = any(input.requires_grad for input in f_args_variable) + + # Check against Python module as reference + check_against_reference(self, create_script_module, create_nn_module, + lambda x: x, f_args_variable, + no_grad=no_grad or not any_requires_grad) if 'slowTest' in kwargs: do_test = slowTest(do_test) diff --git a/test/test_nn.py b/test/test_nn.py index 7d86573683aaf7..93c51a5bf88b2d 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -16,11 +16,6 @@ from unittest import SkipTest import torch - -# TODO: remove this global setting -# NN tests use double as the default dtype -torch.set_default_dtype(torch.double) - from torch import inf, nan import torch.autograd.forward_ad as fwAD import torch.backends.cudnn as cudnn @@ -39,7 +34,7 @@ download_file, get_function_arglist, load_tests, skipIfMps,\ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ - skipIfTorchDynamo, IS_WINDOWS, gcIfJetson + skipIfTorchDynamo, IS_WINDOWS, gcIfJetson, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ @@ -1180,7 +1175,7 @@ def test_ParameterList_meta(self): Parameter containing: tensor(..., device='meta', size=(1,), requires_grad=True)""") pl = torch.nn.ParameterList([p]) - self.assertExpectedInline(str(pl), """ParameterList( (0): Parameter containing: [torch.float64 of size 1])""") + self.assertExpectedInline(str(pl), """ParameterList( (0): Parameter containing: [torch.float32 of size 1])""") def test_ParameterList_replication(self): # The actual replication code from DP cannot be used on CPU so doing it manually here @@ -1827,6 +1822,7 @@ def test_weight_norm_pickle(self): self.assertIsInstance(m, nn.Linear) @skipIfTorchDynamo("TorchDynamo fails here for unknown reasons") + @set_default_dtype(torch.double) def test_spectral_norm(self): input = torch.randn(3, 5) m = nn.Linear(5, 7) @@ -2185,7 +2181,7 @@ def test_nested_tensor_from_mask_error(self): self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) def test_normalize(self): - inputs = torch.randn(1, 3, 4, 4, requires_grad=True) + inputs = torch.randn(1, 3, 4, 4, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=2, dim=-2), (inputs,))) @@ -2196,9 +2192,9 @@ def test_normalize(self): # Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190 @skipIfRocm def test_broadcast_double_backwards_gpu(self): - tensors = (torch.randn(4, 4, device='cuda', requires_grad=True), - torch.randn(4, 4, device='cuda', requires_grad=True), - torch.randn(4, 4, device='cuda', requires_grad=True)) + tensors = (torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double), + torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double), + torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double)) # TODO(#50743): the following segfaults with check_batched_grad=True _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), tensors, check_batched_grad=False) @@ -2991,7 +2987,7 @@ def test_CTCLoss_long_targets(self): batch_size = 4 target_length = 1200 - log_probs = torch.randn(input_length, batch_size, vocab_size).log_softmax(2).requires_grad_() + log_probs = 
torch.randn(input_length, batch_size, vocab_size, dtype=torch.double).log_softmax(2).requires_grad_() targets = torch.randint(low=1, high=vocab_size - 1, size=(batch_size, target_length), dtype=torch.long) input_lengths = batch_size * [input_length] target_lengths = batch_size * [target_length] @@ -3130,10 +3126,11 @@ def test_Transformer_cell(self): [(bsz, tgt_length, d_model), (tgt_length, bsz, d_model)]): transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, - dim_feedforward, dropout, batch_first=batch_first) - src = torch.randn(src_size) + dim_feedforward, dropout, batch_first=batch_first, + dtype=torch.double) + src = torch.randn(src_size, dtype=torch.double) src_mask = transformer.generate_square_subsequent_mask(seq_length).double() - tgt = torch.randn(tgt_size) + tgt = torch.randn(tgt_size, dtype=torch.double) tgt_mask = transformer.generate_square_subsequent_mask(tgt_length).double() memory_mask = torch.randn(tgt_length, seq_length).double() src_key_padding_mask = torch.rand(bsz, seq_length) >= 0.5 @@ -4518,6 +4515,7 @@ def test_partial_flat_weights(self): @unittest.skipIf(not (TEST_CUDNN and (TEST_CUDNN_VERSION if TEST_CUDNN_VERSION else 0) >= 5103), "needs cudnn >= 5.1") + @set_default_dtype(torch.double) def test_RNN_dropout(self): # checking the assumption that cuDNN sticks dropout in between # RNN layers @@ -4560,6 +4558,7 @@ def test_RNN_dropout(self): self.assertEqual(hy.data[0][0][0], 10) self.assertEqual(hy.data[1][0][0], output_val) + @set_default_dtype(torch.double) def test_error_RNN_seq_len_zero(self): # checking error message when RNN has seq_len = 0 for module in (nn.RNN, nn.LSTM, nn.GRU): @@ -4628,6 +4627,7 @@ def test_RNN_dropout_state(self): self.assertNotEqual(hy1, hy3) @unittest.skipIf(not (TEST_CUDNN and (TEST_CUDNN_VERSION if TEST_CUDNN_VERSION else 0) >= 5103), "needs cudnn >= 5.1") + @set_default_dtype(torch.double) def test_RNN_change_dropout(self): for train, cuda in product((True, False), repeat=2): rnn = nn.RNN(100, 100, 2, dropout=0, nonlinearity='relu') @@ -4775,6 +4775,7 @@ def test_pixel_shuffle_unshuffle_5D(): test_pixel_shuffle_unshuffle_4D() test_pixel_shuffle_unshuffle_5D() + @set_default_dtype(torch.double) def test_pixel_shuffle_nhwc_cpu(self): input = torch.randn(3, 18, 4, 4, device='cpu') input = input.contiguous(memory_format=torch.channels_last).requires_grad_() @@ -4799,7 +4800,7 @@ def test_pixel_shuffle_nhwc_cpu(self): # These tests should be OpInfo'd def test_elu_inplace_on_view(self): - v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) + v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4812,7 +4813,7 @@ def func(root): gradgradcheck(func, [v]) def test_elu_inplace_gradgrad(self): - v = torch.randn(8, requires_grad=True) + v = torch.randn(8, requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4822,7 +4823,7 @@ def func(root): gradgradcheck(func, [v]) def test_relu_inplace_on_view(self): - v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) + v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4940,8 +4941,8 @@ def test_bce_with_logits_gives_same_result_as_sigmoid_and_bce_loss_large_tensors self.assertEqual(output_sig.grad, output_logits.grad) def test_bce_with_logits_has_correct_forward_grad(self): - output = torch.randn(3, 5, requires_grad=True) - target = torch.randn(3, 5) + output = torch.randn(3, 5, requires_grad=True, 
dtype=torch.double) + target = torch.randn(3, 5, dtype=torch.double) for reduction in ('sum', 'mean', 'none'): gradcheck(lambda self, target: nn.BCEWithLogitsLoss(reduction=reduction)(self, target), (output, target), check_forward_ad=True) @@ -5038,7 +5039,7 @@ def test_bce_loss_broadcasts_weights(self): self.assertEqual(out1, out2) def test_hardtanh_inplace_gradgrad(self): - v = torch.randn(8, requires_grad=True) + v = torch.randn(8, requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -5297,8 +5298,8 @@ def test_batchnorm_nhwc_cuda(self): self.assertTrue(torch.equal(out1, out2)) def test_pairwise_distance(self): - input1 = torch.randn(4, 4, requires_grad=True) - input2 = torch.randn(4, 4, requires_grad=True) + input1 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) + input2 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x, y: F.pairwise_distance(x, y), (input1, input2))) # TODO: Create an OpInfo for pdist @@ -5411,18 +5412,18 @@ def test_kl_div_log_softmax_target(self): ) def test_cosine_embedding_loss_no_reduce(self): - input1 = torch.randn(15, 10, requires_grad=True) - input2 = torch.randn(15, 10, requires_grad=True) - target = torch.randn(15).sign() + input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.cosine_embedding_loss( x, y, z, reduction='none'), (input1, input2, target))) self.assertEqual(F.cosine_embedding_loss(input1, input2, target, reduction='none'), loss_reference_fns['CosineEmbeddingLoss'](input1, input2, target, reduction='none')) def test_cosine_embedding_loss_margin_no_reduce(self): - input1 = torch.randn(15, 10, requires_grad=True) - input2 = torch.randn(15, 10, requires_grad=True) - target = torch.randn(15).sign() + input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.cosine_embedding_loss( x, y, z, margin=0.5, reduction='none'), (input1, input2, target))) self.assertEqual(F.cosine_embedding_loss(input1, input2, target, margin=0.5, reduction='none'), @@ -5444,54 +5445,54 @@ def test_cosine_embedding_loss_invalid_shape(self): F.cosine_embedding_loss(torch.randn(2, 5), torch.randn(2, 5), torch.randn(())) def test_margin_ranking_loss_no_reduce(self): - input1 = torch.randn(15).mul_(10).requires_grad_() - input2 = torch.randn(15).mul_(10).requires_grad_() - target = torch.randn(15).sign() + input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, reduction='none'), (input1, input2, target))) self.assertEqual(F.margin_ranking_loss(input1, input2, target, reduction='none'), loss_reference_fns['MarginRankingLoss'](input1, input2, target, reduction='none')) def test_margin_ranking_loss_margin_no_reduce(self): - input1 = torch.randn(15).mul_(10).requires_grad_() - input2 = torch.randn(15).mul_(10).requires_grad_() - target = torch.randn(15).sign() + input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + target = torch.randn(15, 
dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, margin=0.5, reduction='none'), (input1, input2, target))) self.assertEqual(F.margin_ranking_loss(input1, input2, target, margin=0.5, reduction='none'), loss_reference_fns['MarginRankingLoss'](input1, input2, target, margin=0.5, reduction='none')) def test_triplet_margin_loss(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3), loss_reference_fns['TripletMarginLoss'](input1, input2, input3)) def test_triplet_margin_loss_swap(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, swap=True), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True)) def test_triplet_margin_loss_no_reduce(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, reduction='none'), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, reduction='none')) def test_triplet_margin_loss_swap_no_reduce(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, swap=True, reduction='none'), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), @@ -5511,11 +5512,11 @@ def test_pointwise_loss_broadcast(self): 'huber_loss': lambda x, y, r: F.huber_loss(x, y, reduction=r), } - input = torch.randn(2, 1, requires_grad=True) + input = torch.randn(2, 1, requires_grad=True, dtype=torch.double) for fn in losses.values(): for requires_grad in [True, False]: # When target.requires_grad=True, its impl is in Python, while the other is in TH. 
- target = torch.randn(2, 10, requires_grad=requires_grad) + target = torch.randn(2, 10, requires_grad=requires_grad, dtype=torch.double) for reduction in ['none', 'mean', 'sum']: l = fn(input, target, reduction) if reduction == 'none': @@ -5571,6 +5572,7 @@ def test_huber_loss_zero_delta(): test_huber_loss_negative_delta() test_huber_loss_zero_delta() + @set_default_dtype(torch.double) def test_cosine_similarity(self): # Check cosine_similarity input/output shapes input_size = (1, 3, 2, 1) @@ -5715,6 +5717,7 @@ def test_affine_grid_error_checking(self): with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): F.affine_grid(theta, torch.Size([1, 1, 2, 2, 2, 2]), align_corners=False) + @set_default_dtype(torch.double) def test_grid_sample(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases. @@ -6075,6 +6078,7 @@ def get_grid(device='cpu', data=None): with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) + @set_default_dtype(torch.double) def test_grid_sample_3d(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases. @@ -6365,6 +6369,7 @@ def normalize_indices(indices_unnormalized: torch.Tensor, dim_size: int, align_c ) self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_3d_z[0, 0, 0, 0, :], atol=0, rtol=0) + @set_default_dtype(torch.double) def test_affine_grid(self): # test known input on CPU input = torch.arange(1., 7).view(1, 2, 3) @@ -6413,6 +6418,7 @@ def test_affine_grid(self): self.assertEqual(out_cpu, out_cuda) self.assertEqual(input_cpu.grad, input_gpu.grad) + @set_default_dtype(torch.double) def test_affine_grid_3d(self): # test known input on CPU input = torch.arange(1., 13).view(1, 3, 4) @@ -6472,6 +6478,7 @@ def test_channel_shuffle_return_alias_of_self(self): output = torch.nn.ChannelShuffle(groups)(input_tensor) torch.testing.assert_close(output, input_tensor) + @set_default_dtype(torch.double) def test_upsamplingLinear1d(self): for align_corners in [True, False]: for recompute_scale_factor in [True, False]: @@ -6502,6 +6509,7 @@ def test_upsamplingLinear1d_spatial_invariance(self): out_t_5 = m(in_t_9[:, :, :5]) self.assertEqual(out_t_9[:, :, :15], out_t_5) + @set_default_dtype(torch.double) def test_upsampling_not_recompute_scale_factor(self): # test output against known input: result must match opencv in_t = torch.arange(8.).view(1, 2, 2, 2) @@ -6568,7 +6576,7 @@ def test_upsamplingTrilinear3d_spatial_invariance(self): def test_upsampling_small_scale(self): m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") - in_t = torch.arange(1, 5, dtype=torch.float64).reshape(1, 1, 2, 2) + in_t = torch.arange(1, 5, dtype=torch.get_default_dtype()).reshape(1, 1, 2, 2) out_t = m(in_t) expected_out_t = torch.tensor([[[[2.5]]]]) self.assertEqual(expected_out_t, out_t) @@ -6706,6 +6714,7 @@ def helper(size, dtype, mode, device, is_channels_last): helper(size, dtype, mode, device, is_channels_last) + @set_default_dtype(torch.double) def test_interpolate(self): def _test_interpolate_non_integer_size_warning(in_t, out_size, dim, **kwargs): test_sizes = [float(out_size), @@ -6858,15 +6867,15 @@ def run(input1_tp, input2_tp): self.assertEqual(g2, g2_nc) def test_bilinear_no_bias(self): - module = nn.Bilinear(10, 10, 8) - module_no_bias = nn.Bilinear(10, 10, 8, False) + module = nn.Bilinear(10, 10, 8, dtype=torch.double) + module_no_bias = 
nn.Bilinear(10, 10, 8, False, dtype=torch.double) module.bias.data.zero_() module.weight.data.copy_(module_no_bias.weight) - input1 = torch.randn(4, 10, requires_grad=True) - input2 = torch.randn(4, 10, requires_grad=True) - grad_output = torch.randn(4, 8) + input1 = torch.randn(4, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(4, 10, requires_grad=True, dtype=torch.double) + grad_output = torch.randn(4, 8, dtype=torch.double) def run(net): input1.grad = input2.grad = None @@ -7296,20 +7305,21 @@ def test_max_pool1d_invalid_output_size(self): res = arg_class(*arg_4) class TestFusionEval(TestCase): - @given(X=hu.tensor(shapes=((5, 3, 5, 5),)), - running_mean=hu.tensor(shapes=(6,)), - running_var=hu.tensor(shapes=(6,))) + @set_default_dtype(torch.double) + @given(X=hu.tensor(shapes=((5, 3, 5, 5),), dtype=np.double), + running_mean=hu.tensor(shapes=(6,), dtype=np.double), + running_var=hu.tensor(shapes=(6,), dtype=np.double)) def test_fuse_module_eval_numerics(self, X, running_mean, running_var): inputs, _ = X iC, oC = inputs.shape[1], len(running_mean[0]) - inputs = torch.from_numpy(inputs).to(torch.double) + inputs = torch.from_numpy(inputs) kernel_size = (3, 3) conv_ref = torch.nn.Conv2d(iC, oC, bias=True, kernel_size=kernel_size) bn_ref = torch.nn.BatchNorm2d(oC) - bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) - bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + bn_ref.running_mean = torch.from_numpy(running_mean[0]) + bn_ref.running_var = torch.from_numpy(running_var[0]) conv_ref.eval() bn_ref.eval() @@ -7322,8 +7332,8 @@ def test_fuse_module_eval_numerics(self, X, running_mean, running_var): self.assertEqual(Y_ref, Y_hat, msg="Conv+BN fusion results are off") na_bn_ref = torch.nn.BatchNorm2d(oC, affine=False) - na_bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) - na_bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + na_bn_ref.running_mean = torch.from_numpy(running_mean[0]) + na_bn_ref.running_var = torch.from_numpy(running_var[0]) na_bn_ref.eval() Y_ref = na_bn_ref(conv_ref(inputs)) @@ -7566,20 +7576,23 @@ def forward(self, input): nn.MaxPool1d(2, return_indices=True), nn.MaxUnpool1d(2)), input_size=(1, 1, 4), - fullname='MaxUnpool1d_net',)) + fullname='MaxUnpool1d_net', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool2d(2, return_indices=True), nn.MaxUnpool2d(2)), input_size=(1, 1, 2, 4), - fullname='MaxUnpool2d_net',)) + fullname='MaxUnpool2d_net', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool3d(2, return_indices=True), nn.MaxUnpool3d(2)), input_size=(1, 1, 2, 4, 6), fullname='MaxUnpool3d_net', - check_gradgrad=False,)) + check_gradgrad=False, + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( @@ -7587,14 +7600,16 @@ def forward(self, input): nn.MaxUnpool1d(2)), input_size=(1, 4), reference_fn=single_batch_reference_fn, - fullname='MaxUnpool1d_net_no_batch_dim',)) + fullname='MaxUnpool1d_net_no_batch_dim', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool2d(2, return_indices=True), nn.MaxUnpool2d(2)), input_size=(1, 2, 4), reference_fn=single_batch_reference_fn, - fullname='MaxUnpool2d_net_no_batch_dim',)) + fullname='MaxUnpool2d_net_no_batch_dim', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( @@ -7603,7 +7618,8 @@ def forward(self, 
input): input_size=(1, 2, 4, 6), reference_fn=single_batch_reference_fn, fullname='MaxUnpool3d_net_no_batch_dim', - check_gradgrad=False)) + check_gradgrad=False, + default_dtype=torch.double,)) class _AdaptiveLogSoftmaxWithLoss(nn.AdaptiveLogSoftmaxWithLoss): def __call__(self, input): @@ -7615,7 +7631,8 @@ def __call__(self, input): input_size=(4, 16), fullname='AdaptiveLogSoftmax', with_tf32=True, - tf32_precision=0.005)) + tf32_precision=0.005, + default_dtype=torch.double)) # The following are helpers for TestNN.test_affine_* @@ -8804,8 +8821,9 @@ def test_TransformerEncoderLayer_empty(self, device): for training in (True, False): for batch_first, input_shape in [(True, (0, 10, 512)), (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + input = torch.rand(*input_shape, device=device, dtype=torch.double) + encoder_layer = nn.TransformerEncoderLayer( + d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) if not training: encoder_layer = encoder_layer.eval() with torch.no_grad(): @@ -8820,7 +8838,7 @@ def test_TransformerEncoderLayer_empty(self, device): nt = torch.nested.nested_tensor([], device=device) _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True) - nt = torch.nested.nested_tensor([torch.rand(0, 512, device=device)], device=device) + nt = torch.nested.nested_tensor([torch.rand(0, 512, device=device, dtype=torch.double)], device=device) _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True) else: _test_module_empty_input(self, encoder_layer, input, check_size=False) @@ -8830,8 +8848,8 @@ def test_TransformerEncoderLayer_empty(self, device): def test_TransformerEncoder_empty(self, device): for batch_first, input_shape in [(True, (0, 10, 512)), (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + input = torch.rand(*input_shape, device=device, dtype=torch.double) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6).to(device) _test_module_empty_input(self, transformer_encoder, input, check_size=False) @@ -8840,9 +8858,9 @@ def test_TransformerEncoder_empty(self, device): def test_TransformerDecoderLayer_empty(self, device): for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), (False, (10, 0, 512), (20, 0, 512))]: - memory = torch.rand(*memory_shape, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) - decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + memory = torch.rand(*memory_shape, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) self._test_module_empty_inputs(decoder_layer, [tgt, memory]) @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @@ -8850,9 +8868,9 @@ def test_TransformerDecoderLayer_empty(self, device): def test_TransformerDecoder_empty(self, device): for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), (False, (10, 0, 512), 
(20, 0, 512))]: - memory = torch.rand(*memory_shape, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) - decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + memory = torch.rand(*memory_shape, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6).to(device) self._test_module_empty_inputs(transformer_decoder, [tgt, memory]) @@ -8860,9 +8878,9 @@ def test_TransformerDecoder_empty(self, device): @onlyNativeDeviceTypes def test_Transformer_empty(self, device): for batch_first, src_shape, tgt_shape in [(True, (10, 0, 512), (20, 0, 512))]: - transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12).to(device) - src = torch.rand(*src_shape, requires_grad=True, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) + transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12, dtype=torch.double).to(device) + src = torch.rand(*src_shape, requires_grad=True, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) self._test_module_empty_inputs(transformer_model, [src, tgt]) @onlyNativeDeviceTypes @@ -9216,7 +9234,7 @@ def v(fn): v(lambda: F.hinge_embedding_loss(input, input, reduction=reduction)) v(lambda: F.poisson_nll_loss(input, input, reduction=reduction)) v(lambda: F.gaussian_nll_loss(input, input, var, reduction=reduction)) - v(lambda: F.binary_cross_entropy(torch.sigmoid(input), input.gt(0).double(), reduction=reduction)) + v(lambda: F.binary_cross_entropy(torch.sigmoid(input), input.gt(0).to(torch.get_default_dtype()), reduction=reduction)) v(lambda: F.binary_cross_entropy_with_logits(input, input, reduction=reduction)) zeros = torch.zeros_like(input).to(torch.int64) @@ -9363,27 +9381,27 @@ def test_upsamplingNearest1d(self, device, mode): check_forward_ad = torch.device(device).type != 'xla' m = nn.Upsample(size=4, mode=mode) - in_t = torch.ones(1, 1, 2, device=device) + in_t = torch.ones(1, 1, 2, device=device, dtype=torch.double) in_uint8_t = torch.ones(1, 1, 2, dtype=torch.uint8, device=device) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) out_uint8_t = m(in_uint8_t) - self.assertEqual(torch.ones(1, 1, 4, device=device), out_t.data) + self.assertEqual(torch.ones(1, 1, 4, device=device, dtype=torch.double), out_t.data) self.assertEqual(torch.ones(1, 1, 4, dtype=torch.uint8, device=device), out_uint8_t.data) # Checks upsampling - input = torch.randn(1, 1, 2, requires_grad=True, device=device) + input = torch.randn(1, 1, 2, requires_grad=True, device=device, dtype=torch.double) gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) # Checks downsampling - input = torch.randn(1, 1, 20, requires_grad=True, device=device) + input = torch.randn(1, 1, 20, requires_grad=True, device=device, dtype=torch.double) gradcheck(lambda x: F.interpolate(x, 11, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) # consistency CUDA/CPU check if torch.device(device).type == 'cuda': - input_cuda = 
torch.randn(1, 1, 20, device=device) + input_cuda = torch.randn(1, 1, 20, device=device, dtype=torch.double) input_cpu = input_cuda.cpu() output_cuda = F.interpolate(input_cuda, 4, mode=mode) output_cpu = F.interpolate(input_cpu, 4, mode=mode) @@ -9459,32 +9477,36 @@ def test_upsamplingNearest2d(self, device, memory_format, mode): # Forward AD does not support XLA because XLA tensors don't have storage check_forward_ad = torch.device(device).type != 'xla' - in_t = torch.ones(1, 2, 2, 2, device=device).contiguous(memory_format=memory_format) + in_t = torch.ones(1, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format) in_uint8_t = torch.ones(1, 2, 2, 2, dtype=torch.uint8, device=device).contiguous(memory_format=memory_format) with warnings.catch_warnings(record=True) as w: out_t = F.interpolate(in_t, size=4, mode=mode) out_uint8_t = F.interpolate(in_uint8_t, size=4, mode=mode) self.assertEqual(len(w), 0) - self.assertEqual(torch.ones(1, 2, 4, 4, device=device), out_t) + self.assertEqual(torch.ones(1, 2, 4, 4, device=device, dtype=torch.double), out_t) self.assertEqual(torch.ones(1, 2, 4, 4, dtype=torch.uint8, device=device), out_uint8_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) # test forward when input's height is not same as width - in_t = torch.ones(1, 2, 2, 1, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones(1, 2, 2, 1, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() out_t = F.interpolate(in_t, size=(4, 2), mode=mode) - self.assertEqual(torch.ones(1, 2, 4, 2, device=device), out_t) + self.assertEqual(torch.ones(1, 2, 4, 2, device=device, dtype=torch.double), out_t) self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) out_t.backward(torch.randn_like(out_t)) self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) # test backward when input's height is not same as width - input = torch.ones(1, 2, 2, 1, requires_grad=True, device=device).contiguous(memory_format=memory_format) + input = torch.ones( + 1, 2, 2, 1, requires_grad=True, device=device, + dtype=torch.double).contiguous(memory_format=memory_format) gradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_fwd_over_rev=check_forward_ad) - input = torch.randn(1, 2, 2, 2, requires_grad=True, device=device).contiguous(memory_format=memory_format) + input = torch.randn( + 1, 2, 2, 2, requires_grad=True, device=device, + dtype=torch.double).contiguous(memory_format=memory_format) self.assertEqual( F.interpolate(input, 4, mode=mode), F.interpolate(input, scale_factor=2, mode=mode)) @@ -9497,7 +9519,9 @@ def test_upsamplingNearest2d(self, device, memory_format, mode): for shapes, scale_factor in product([ (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) ], [0.5, 1.5, 2]): - a_cuda = torch.randn(*shapes, device=device).contiguous(memory_format=memory_format).requires_grad_() + a_cuda = torch.randn( + *shapes, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() a_cpu = a_cuda.detach().cpu().requires_grad_() out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, mode=mode) @@ -9565,14 +9589,14 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): check_forward_ad = torch.device(device).type != 'xla' m = nn.Upsample(size=4, 
mode=mode) - in_t = torch.ones(1, 2, 2, 2, 2, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones(1, 2, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() in_uint8_t = torch.ones( 1, 2, 2, 2, 2, dtype=torch.uint8, device=device ).contiguous(memory_format=memory_format) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) out_uint8_t = m(in_uint8_t) - expected_output = torch.ones(1, 2, 4, 4, 4, device=device) + expected_output = torch.ones(1, 2, 4, 4, 4, device=device, dtype=torch.double) self.assertEqual(expected_output, out_t) self.assertEqual(expected_output.to(torch.uint8), out_uint8_t) # Assert that memory format is carried through to the output @@ -9581,7 +9605,7 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) input = torch.randn( - 1, 2, 2, 2, 2, requires_grad=True, device=device + 1, 2, 2, 2, 2, requires_grad=True, device=device, dtype=torch.double ).contiguous(memory_format=memory_format) gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) @@ -9590,7 +9614,7 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): # https://github.com/pytorch/pytorch/issues/54590 if torch.device(device).type == 'cuda': a = torch.ones( - 2, 2, 2, 3, 4, device=device, requires_grad=True + 2, 2, 2, 3, 4, device=device, requires_grad=True, dtype=torch.double ).contiguous(memory_format=torch.channels_last_3d) # make the data asymmetric; ensure that cuda/cpu handle channels_last appropriately. a[1][1][1][2][2] = a[1][1][1][2][3] = 0 @@ -9670,11 +9694,13 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias) # test float scale factor up & downsampling for scale_factor in [0.5, 1.5, 2]: - in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() out_size = int(math.floor(in_t.shape[-1] * scale_factor)) with warnings.catch_warnings(record=True) as w: out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) - expected_out = torch.ones(2, 3, out_size, out_size, device=device) + expected_out = torch.ones(2, 3, out_size, out_size, device=device, dtype=torch.double) self.assertEqual(expected_out, out_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) @@ -9687,7 +9713,9 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory else: nondet_tol = 0.0 - input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + input = torch.randn( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() gradcheck( lambda x: F.interpolate(x, out_size, **kwargs), [input], @@ -9705,7 +9733,7 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) ]: a_cuda = torch.randn( - *shapes, device=device + *shapes, device=device, dtype=torch.double ).contiguous(memory_format=memory_format).requires_grad_() a_cpu = 
a_cuda.detach().cpu().requires_grad_() @@ -9922,19 +9950,19 @@ def test_upsamplingTrilinear3d(self, device, align_corners): # test float scale factor up & downsampling for scale_factor in [0.5, 1.5, 2]: m = nn.Upsample(scale_factor=scale_factor, **kwargs) - in_t = torch.ones(1, 2, 2, 2, 2, device=device) + in_t = torch.ones(1, 2, 2, 2, 2, device=device, dtype=torch.double) in_t = in_t.contiguous(memory_format=memory_format).requires_grad_() out_size = int(math.floor(in_t.shape[-1] * scale_factor)) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) - expected_out = torch.ones(1, 2, out_size, out_size, out_size, device=device) + expected_out = torch.ones(1, 2, out_size, out_size, out_size, device=device, dtype=torch.double) self.assertEqual(expected_out, out_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) out_t.backward(torch.randn_like(out_t)) self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) - input = torch.randn(1, 2, 2, 2, 2, requires_grad=True) + input = torch.randn(1, 2, 2, 2, 2, requires_grad=True, dtype=torch.double) self.assertEqual( F.interpolate(input, (out_size, out_size, out_size), **kwargs), F.interpolate(input, scale_factor=scale_factor, **kwargs)) @@ -10922,14 +10950,14 @@ def test_layernorm_weight_bias(self): self.assertEqual(out_none_bias, out_zero_bias) def test_hardsigmoid_grad(self, device): - inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10 + inputs = (torch.randn(4, 16, 16, device=device, dtype=torch.double) - 0.5) * 10 inputs.requires_grad = True self.assertTrue(gradcheck(F.hardsigmoid, (inputs,))) # currently fails on XLA @onlyNativeDeviceTypes def test_hardswish_grad(self, device): - inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10 + inputs = (torch.randn(4, 16, 16, device=device, dtype=torch.double) - 0.5) * 10 inputs.requires_grad = True self.assertTrue(gradcheck(F.hardswish, (inputs,))) @@ -11654,6 +11682,7 @@ def test_cross_entropy_label_smoothing_errors(self, device): r"label_smoothing must be between 0\.0"): loss(*input_arg) + @set_default_dtype(torch.double) def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device): N, C = 10, 4 ks = range(5) @@ -11827,7 +11856,7 @@ def func(x): seeds = (44, 83, 71, 25, 999) for sd in seeds: torch.manual_seed(sd) - x = torch.randn(1, 12, 12, device=device, requires_grad=True) + x = torch.randn(1, 12, 12, device=device, requires_grad=True, dtype=torch.double) gradcheck(func, [x], check_forward_ad=True) gradgradcheck(func, [x], check_fwd_over_rev=True) if device == 'cpu': @@ -12103,9 +12132,9 @@ def test_triplet_margin_with_distance_loss_default_parity(self, device): itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} - anchor = torch.randn(5, 10, device=device, requires_grad=True) - positive = torch.randn(5, 10, device=device, requires_grad=True) - negative = torch.randn(5, 10, device=device, requires_grad=True) + anchor = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + positive = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + negative = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) # Test forward, functional expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) @@ -12144,9 +12173,9 @@ def cosine_distance(x, y): for distance_fn, reduction, margin, swap \ in 
itertools.product(distance_functions, reductions, margins, swaps): - anchor = torch.randn(5, 10, device=device, requires_grad=True) - positive = torch.randn(5, 10, device=device, requires_grad=True) - negative = torch.randn(5, 10, device=device, requires_grad=True) + anchor = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + positive = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + negative = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) # Test backward self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( @@ -12454,7 +12483,7 @@ def test_transformerencoderlayer_fast_path(self, device, dtype): model.eval() # Batched inputs - src = torch.rand(batch_size, src_len, 512) + src = torch.rand(batch_size, src_len, 512, dtype=dtype) # Attention mask of shape (src_len, src_len) src_mask = torch.zeros(src_len, src_len).to(torch.bool) diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 504748f7d58f90..b6898121bfa343 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -15,7 +15,7 @@ import torch.nn.functional as F from torch.nn import _reduction as _Reduction from torch.testing._internal.common_utils import TestCase, to_gpu, freeze_rng_state, is_iterable, \ - gradcheck, gradgradcheck + gradcheck, gradgradcheck, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA from torch.autograd.gradcheck import _get_numerical_jacobian, _iter_tensors from torch.autograd import Variable @@ -106,6 +106,7 @@ def get_weight(m): reference_fn=lambda i, p, _: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Linear', @@ -116,11 +117,13 @@ def get_weight(m): reference_fn=lambda i, p, _: torch.mm(i, p[0].t()), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='RReLU', input_size=(1, 2, 2), test_cuda=False, + default_dtype=torch.double, ), dict( module_name='RReLU', @@ -129,11 +132,13 @@ def get_weight(m): input_size=(4, 4, 5), desc='with_up_down', test_cuda=False, + default_dtype=torch.double, ), dict( module_name='Flatten', input_size=(2, 3, 4, 5), - reference_fn=lambda i, *_: torch.flatten(i, 1) + reference_fn=lambda i, *_: torch.flatten(i, 1), + default_dtype=torch.double, ), # TODO: reference function dict( @@ -144,6 +149,7 @@ def get_weight(m): check_gradgrad=False, # TODO(#50743): Figure out the error. 
"RuntimeError: Unrecognized tensor type ID: Batched" check_batched_grad=False, + default_dtype=torch.double, ), ] @@ -174,11 +180,12 @@ def poissonnllloss_no_reduce_test(): input_fn=lambda: torch.rand(10, 10), cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: i.exp() - t.mul(i), - pickle=False) + pickle=False, + default_dtype=torch.double) def bceloss_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) return dict( fullname='BCELoss_no_reduce', constructor=wrap_functional( @@ -189,11 +196,12 @@ def bceloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), pickle=False, - precision=7e-4) + precision=7e-4, + default_dtype=torch.double) def bceloss_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) + t = torch.randn(()).gt(0).to(torch.double) return dict( fullname='BCELoss_no_reduce_scalar', constructor=wrap_functional( @@ -203,12 +211,13 @@ def bceloss_no_reduce_scalar_test(): input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - pickle=False) + pickle=False, + default_dtype=torch.double) def bceloss_weights_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) - weights = torch.rand(10) + t = Variable(torch.randn(15, 10, dtype=torch.double).gt(0).to(torch.double)) + weights = torch.rand(10, dtype=torch.double) return dict( fullname='BCELoss_weights_no_reduce', constructor=wrap_functional( @@ -221,13 +230,14 @@ def bceloss_weights_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, reference_fn=lambda i, p, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, pickle=False, - precision=3e-4 + precision=3e-4, + default_dtype=torch.double, ) def bceloss_weights_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) - weights = torch.rand(()) + t = torch.randn(()).gt(0).to(torch.double) + weights = torch.rand((), dtype=torch.double) return dict( fullname='BCELoss_weights_no_reduce_scalar', constructor=wrap_functional( @@ -239,12 +249,13 @@ def bceloss_weights_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - pickle=False + pickle=False, + default_dtype=torch.double, ) def bce_with_logistic_legacy_enum_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_legacy_enum', @@ -257,11 +268,12 @@ def bce_with_logistic_legacy_enum_test(): reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, pickle=False, + default_dtype=torch.double, ) def bce_with_logistic_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_no_reduce', @@ -274,11 +286,12 @@ def bce_with_logistic_no_reduce_test(): reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, pickle=False, + default_dtype=torch.double, ) 
def bce_with_logistic_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) + t = torch.randn(()).gt(0).to(torch.double) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_no_reduce_scalar', @@ -290,12 +303,13 @@ def bce_with_logistic_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False + pickle=False, + default_dtype=torch.double, ) def kldivloss_with_target_no_reduce_test(): - t = torch.rand(10, 10) + t = torch.rand(10, 10, dtype=torch.double) return dict( fullname='KLDivLoss_with_target_no_reduce', constructor=wrap_functional( @@ -306,11 +320,12 @@ def kldivloss_with_target_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_no_reduce_test(): - t = torch.rand(10, 10) + t = torch.rand(10, 10, dtype=torch.double) return dict( fullname='KLDivLoss_no_reduce', constructor=wrap_functional( @@ -322,11 +337,12 @@ def kldivloss_no_reduce_test(): loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, pickle=False, + default_dtype=torch.double, ) def kldivloss_no_reduce_scalar_test(): - t = torch.rand(()) + t = torch.rand((), dtype=torch.double) return dict( fullname='KLDivLoss_no_reduce_scalar', constructor=wrap_functional( @@ -337,11 +353,12 @@ def kldivloss_no_reduce_scalar_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_with_log_target_no_reduce_test(): - t = torch.rand(10, 10).log() + t = torch.rand(10, 10, dtype=torch.double).log() return dict( fullname='KLDivLoss_with_log_target_no_reduce', constructor=wrap_functional( @@ -352,11 +369,12 @@ def kldivloss_with_log_target_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_no_reduce_log_target_test(): - t = torch.rand(10, 10).log() + t = torch.rand(10, 10, dtype=torch.double).log() return dict( fullname='KLDivLoss_no_reduce_log_target', constructor=wrap_functional( @@ -368,11 +386,12 @@ def kldivloss_no_reduce_log_target_test(): loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, pickle=False, + default_dtype=torch.double, ) def kldivloss_no_reduce_scalar_log_target_test(): - t = torch.rand(()).log() + t = torch.rand((), dtype=torch.double).log() return dict( fullname='KLDivLoss_no_reduce_scalar_log_target', constructor=wrap_functional( @@ -383,11 +402,12 @@ def kldivloss_no_reduce_scalar_log_target_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def l1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='L1Loss_no_reduce', constructor=wrap_functional( @@ -397,7 +417,8 @@ def l1loss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), supports_forward_ad=True, - pickle=False) + pickle=False, + 
default_dtype=torch.double) def l1loss_no_reduce_complex_test(): @@ -415,7 +436,7 @@ def l1loss_no_reduce_complex_test(): def l1loss_no_reduce_scalar_test(): - t = torch.randn(()) + t = torch.randn((), dtype=torch.double) return dict( fullname='L1Loss_no_reduce_scalar', constructor=wrap_functional( @@ -425,12 +446,13 @@ def l1loss_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def mseloss_no_reduce_test(): input_size = (2, 3, 4, 5) - target = torch.randn(*input_size) + target = torch.randn(*input_size, dtype=torch.double) return dict( fullname='MSELoss_no_reduce', constructor=wrap_functional( @@ -440,12 +462,13 @@ def mseloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 'target': target}, reference_fn=lambda i, *_: (i - target).pow(2), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def mseloss_no_reduce_scalar_test(): input_size = () - target = torch.randn(input_size) + target = torch.randn(input_size, dtype=torch.double) return dict( fullname='MSELoss_no_reduce_scalar', constructor=wrap_functional( @@ -455,7 +478,8 @@ def mseloss_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 'target': target}, reference_fn=lambda i, *_: (i - target).pow(2), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_test(): @@ -471,7 +495,8 @@ def nllloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_ignore_index_test(): @@ -488,7 +513,8 @@ def nllloss_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_test(): @@ -509,7 +535,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_ignore_index_test(): @@ -531,7 +558,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_ignore_index_neg_test(): @@ -549,11 +577,12 @@ def kwargs(i): cpp_function_call='''F::nll_loss( i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone).ignore_index(-1))''', - input=torch.rand(15, 10).add(1e-2).log(), + input=torch.rand(15, 10, dtype=torch.double).add(1e-2).log(), cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss2d_no_reduce_test(): @@ -569,7 +598,8 @@ def nllloss2d_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def 
nllloss2d_no_reduce_ignore_index_test(): @@ -586,7 +616,8 @@ def nllloss2d_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss2d_no_reduce_weights_test(): @@ -607,7 +638,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_test(): @@ -623,7 +655,8 @@ def nlllossNd_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_ignore_index_test(): @@ -640,7 +673,8 @@ def nlllossNd_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_weights_test(): @@ -661,11 +695,12 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_no_reduce', constructor=wrap_functional( @@ -677,11 +712,12 @@ def smoothl1loss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_no_reduce_scalar_test(): - t = torch.randn(()) + t = torch.randn((), dtype=torch.double) return dict( fullname='SmoothL1Loss_no_reduce_scalar', constructor=wrap_functional( @@ -693,11 +729,12 @@ def smoothl1loss_no_reduce_scalar_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_beta_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_beta', constructor=wrap_functional( @@ -709,11 +746,12 @@ def smoothl1loss_beta_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none', beta=0.5), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_zero_beta_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_zero_beta', constructor=wrap_functional( @@ -725,7 +763,8 @@ def smoothl1loss_zero_beta_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none', beta=0), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def huberloss_delta_test(): @@ -741,7 +780,8 @@ def huberloss_delta_test(): reference_fn=lambda i, *_: loss_reference_fns['HuberLoss'](i, t.type_as(i), reduction='none', delta=0.5), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_0d_no_reduce_test(): @@ -775,7 +815,8 @@ 
def multilabelmarginloss_1d_no_reduce_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_index_neg_test(): @@ -792,7 +833,8 @@ def multilabelmarginloss_index_neg_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_no_reduce_test(): @@ -809,11 +851,12 @@ def multilabelmarginloss_no_reduce_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def hingeembeddingloss_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1)) + t = Variable(torch.randn(10).gt(0).to(torch.double).mul_(2).sub(1)) return dict( fullname='HingeEmbeddingLoss_no_reduce', constructor=wrap_functional( @@ -825,11 +868,12 @@ def hingeembeddingloss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), reduction='none'), check_sum_reduction=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def hingeembeddingloss_margin_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1)) + t = Variable(torch.randn(10).gt(0).to(torch.double).mul_(2).sub(1)) return dict( fullname='HingeEmbeddingLoss_margin_no_reduce', constructor=wrap_functional( @@ -841,11 +885,12 @@ def hingeembeddingloss_margin_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), margin=0.5, reduction='none'), check_sum_reduction=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def softmarginloss_no_reduce_test(): - t = torch.randn(5, 5) + t = torch.randn(5, 5, dtype=torch.double) return dict( fullname='SoftMarginLoss_no_reduce', constructor=wrap_functional( @@ -857,7 +902,8 @@ def softmarginloss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['SoftMarginLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelsoftmarginloss_no_reduce_test(): @@ -873,7 +919,8 @@ def multilabelsoftmarginloss_no_reduce_test(): reference_fn=lambda i, *_: (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log())).sum(dim=1) / i.size(1), check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelsoftmarginloss_weights_no_reduce_test(): @@ -893,7 +940,8 @@ def multilabelsoftmarginloss_weights_no_reduce_test(): (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * weights).sum(dim=1) / i.size(1), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_no_reduce_test(): @@ -910,7 +958,8 @@ def multimarginloss_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_1d_no_reduce_test(): @@ -927,7 +976,8 @@ def multimarginloss_1d_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, 
check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_1d_input_0d_target_no_reduce_test(): @@ -944,7 +994,8 @@ def multimarginloss_1d_input_0d_target_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_p_no_reduce_test(): @@ -961,7 +1012,8 @@ def multimarginloss_p_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), p=2, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_margin_no_reduce_test(): @@ -980,12 +1032,13 @@ def multimarginloss_margin_no_reduce_test(): margin=0.5, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_weights_no_reduce_test(): t = torch.rand(5).mul(8).floor().long() - weights = torch.rand(10) + weights = torch.rand(10, dtype=torch.double) return dict( fullname='MultiMarginLoss_weights_no_reduce', constructor=wrap_functional( @@ -1001,7 +1054,8 @@ def multimarginloss_weights_no_reduce_test(): weight=weights, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def single_batch_reference_fn(input, parameters, module): @@ -1080,6 +1134,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1090,6 +1145,7 @@ def unsqueeze_inp(inp): desc='stride', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1100,6 +1156,7 @@ def unsqueeze_inp(inp): desc='pad1', with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1110,6 +1167,7 @@ def unsqueeze_inp(inp): desc='pad2', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1120,6 +1178,7 @@ def unsqueeze_inp(inp): desc='pad1size1', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1130,6 +1189,7 @@ def unsqueeze_inp(inp): desc='pad2size1', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1148,6 +1208,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 10), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_groups', @@ -1157,6 +1218,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_valid', @@ -1166,6 +1228,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same', @@ -1175,6 +1238,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same2', @@ -1184,6 +1248,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same_dilated', @@ -1193,6 +1258,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='ConvTranspose1d', @@ -1202,6 +1268,7 @@ def unsqueeze_inp(inp): input_size=(1, 3, 7), with_tf32=True, tf32_precision=0.005, + 
default_dtype=torch.double, ), dict( module_name='ConvTranspose1d', @@ -1213,6 +1280,7 @@ def unsqueeze_inp(inp): desc='no_bias', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='ConvTranspose1d', @@ -1224,6 +1292,7 @@ def unsqueeze_inp(inp): desc='dilated', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='ConvTranspose1d_groups', @@ -1234,6 +1303,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 7), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1244,6 +1314,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1255,6 +1326,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1266,6 +1338,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1277,6 +1350,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1289,6 +1363,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1309,6 +1384,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( fullname='Conv2d_groups_thnn', @@ -1318,6 +1394,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_valid', @@ -1327,6 +1404,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_same', @@ -1336,6 +1414,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_same_dilated', @@ -1345,6 +1424,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1356,6 +1436,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1373,6 +1454,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1385,6 +1467,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='ConvTranspose2d_groups', @@ -1395,6 +1478,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise', @@ -1403,6 +1487,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_with_multiplier', @@ -1411,6 +1496,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_strided', @@ -1419,6 +1505,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + 
default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_padded', @@ -1427,6 +1514,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_dilated', @@ -1435,6 +1523,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 5, 5), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1445,6 +1534,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1457,6 +1547,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1469,6 +1560,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=False, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1480,6 +1572,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1491,6 +1584,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1511,6 +1605,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv3d_dilated', @@ -1519,6 +1614,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 5, 5, 5), with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_dilated_strided', @@ -1526,7 +1622,8 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2).stride(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_valid', @@ -1536,6 +1633,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_same', @@ -1545,6 +1643,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_same_dilated', @@ -1554,6 +1653,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ConvTranspose3d', @@ -1562,7 +1662,8 @@ def unsqueeze_inp(inp): cudnn=True, input_size=(1, 2, 4, 5, 4), with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ConvTranspose3d', @@ -1573,13 +1674,15 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 5, 4), desc='dilated', with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', constructor_args=((1, 2, 3, 3, 2, 1),), cpp_constructor_args='torch::nn::ReplicationPad3dOptions({1, 2, 3, 3, 2, 1})', input_size=(2, 3, 2, 2, 2), + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', @@ -1588,6 +1691,7 @@ def unsqueeze_inp(inp): input_size=(3, 2, 2, 2), reference_fn=single_batch_reference_fn, desc='no_batch_dim', + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', @@ -1603,6 +1707,7 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='Embedding', 
@@ -1610,7 +1715,8 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512), check_gradgrad=False, - desc='discontiguous' + desc='discontiguous', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1619,6 +1725,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='mean', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1627,6 +1734,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512), check_gradgrad=False, desc='discontiguous', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1636,6 +1744,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='sum', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1645,6 +1754,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='max', + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_mean_padding_idx', @@ -1652,6 +1762,7 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3).padding_idx(1)', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_sum_padding_idx', @@ -1660,6 +1771,7 @@ def unsqueeze_inp(inp): .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kSum).padding_idx(1)''', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_max_padding_idx', @@ -1668,17 +1780,18 @@ def unsqueeze_inp(inp): .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kMax).padding_idx(1)''', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_sparse', - constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True), + constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True, dtype=torch.double), cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3).sparse(true)', input_fn=lambda: torch.randperm(2).repeat(1, 2), check_gradgrad=False, has_sparse_gradients=True, ), dict( - constructor=lambda: nn.Embedding(4, 3, sparse=True), + constructor=lambda: nn.Embedding(4, 3, dtype=torch.double, sparse=True), cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3).sparse(true)', input_fn=lambda: torch.randperm(2).repeat(1, 2), fullname='Embedding_sparse', @@ -1690,12 +1803,14 @@ def unsqueeze_inp(inp): constructor_args=(3,), cpp_constructor_args='torch::nn::PixelShuffleOptions(3)', input_size=(1, 9, 4, 4), + default_dtype=torch.double, ), dict( module_name='PixelUnshuffle', constructor_args=(3,), cpp_constructor_args='torch::nn::PixelUnshuffleOptions(3)', input_size=(1, 1, 12, 12), + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1704,6 +1819,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_nearest_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1720,6 +1836,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3), fullname='interpolate_nearest_tuple_1d', 
pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -1728,6 +1845,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_nearest_scale_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), @@ -1739,6 +1857,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, ), scale_factor=None, mode='linear', align_corners=False), @@ -1750,6 +1869,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3), fullname='interpolate_linear_tuple_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=False), @@ -1761,6 +1881,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_scale_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), @@ -1783,6 +1904,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_1d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=True), @@ -1794,6 +1916,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_scale_1d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=2, scale_factor=None, mode='nearest'), @@ -1804,6 +1927,7 @@ def unsqueeze_inp(inp): input_size=(1, 128, 1, 1), fullname='interpolate_nearest_2d_launch_configs', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1814,6 +1938,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_nearest_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(12, 16), scale_factor=None, mode='nearest'), @@ -1824,6 +1949,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3, 4), fullname='interpolate_nearest_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -1834,6 +1960,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_nearest_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1855,6 +1982,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bilinear', align_corners=False), @@ -1878,6 +2006,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3), fullname='interpolate_bilinear_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., @@ -1890,6 +2019,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), @@ -1902,6 +2032,7 @@ def 
unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_shared_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -1914,6 +2045,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_skewed_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, mode='bilinear', align_corners=True), @@ -1925,6 +2057,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_tuple_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -1937,6 +2070,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_skewed_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), @@ -1948,6 +2082,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), @@ -1971,6 +2106,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3), fullname='interpolate_bicubic_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='bicubic', align_corners=False), @@ -1982,6 +2118,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), @@ -1994,6 +2131,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_shared_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -2006,6 +2144,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_skewed_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, mode='bicubic', align_corners=True), @@ -2017,6 +2156,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_tuple_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -2029,6 +2169,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_skewed_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -2039,6 +2180,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), fullname='interpolate_nearest_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -2059,6 +2201,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3, 4, 4), fullname='interpolate_nearest_tuple_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -2069,6 +2212,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), 
fullname='interpolate_nearest_scale_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), @@ -2080,6 +2224,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), fullname='interpolate_trilinear_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), @@ -2103,6 +2248,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3, 3), fullname='interpolate_trilinear_tuple_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=False), @@ -2116,6 +2262,7 @@ def unsqueeze_inp(inp): # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6, 6), scale_factor=None, @@ -2128,6 +2275,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3, 3), fullname='interpolate_trilinear_tuple_3d_align_corners', pickle=False, + default_dtype=torch.double ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=True), @@ -2141,6 +2289,7 @@ def unsqueeze_inp(inp): # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=-1), @@ -2148,6 +2297,7 @@ def unsqueeze_inp(inp): input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='softmax_lastdim', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), @@ -2155,7 +2305,8 @@ def unsqueeze_inp(inp): input_size=(2, 128), fullname='softmax_lastdim_dtype', pickle=False, - test_cuda=False + test_cuda=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1), @@ -2170,6 +2321,7 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), @@ -2177,7 +2329,8 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial_dtype', pickle=False, - test_cuda=False + test_cuda=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=0), @@ -2186,6 +2339,7 @@ def unsqueeze_inp(inp): fullname='softmax_functional_dim0', test_cuda=False, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=3), @@ -2194,6 +2348,7 @@ def unsqueeze_inp(inp): fullname='softmax_functional_dim3', test_cuda=False, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=-1), @@ -2209,6 +2364,7 @@ def unsqueeze_inp(inp): input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='log_softmax_lastdim', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=1), @@ -2216,6 +2372,7 @@ def unsqueeze_inp(inp): input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='log_softmax_spatial_special', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=1), @@ -2223,6 +2380,7 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='log_softmax_spatial', 
pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -2230,6 +2388,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 4, 5), fullname='log_softmax_dim0', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=3), @@ -2237,6 +2396,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 4, 5), fullname='log_softmax_dim3', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -2252,6 +2412,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 3, 3), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold', @@ -2260,6 +2421,7 @@ def unsqueeze_inp(inp): input_size=(2, 16, 4), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_no_batch_dim_input', @@ -2269,6 +2431,7 @@ def unsqueeze_inp(inp): check_gradgrad=False, ref=single_batch_reference_fn, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Unfold_int_input', @@ -2277,6 +2440,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 3, 3), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_int_input', @@ -2285,6 +2449,7 @@ def unsqueeze_inp(inp): input_size=(2, 16, 4), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_no_batch_dim_int_input', @@ -2294,6 +2459,7 @@ def unsqueeze_inp(inp): ref=single_batch_reference_fn, check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( module_name='RReLU', @@ -2302,20 +2468,24 @@ def unsqueeze_inp(inp): input_size=(), desc='with_up_down_scalar', test_cuda=False, + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 8), torch.randn(10, 8)), + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 1), torch.randn(10, 8)), - desc='broadcast_lhs' + desc='broadcast_lhs', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 8), torch.randn(1, 8)), - desc='broadcast_rhs' + desc='broadcast_rhs', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', @@ -2323,12 +2493,14 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::PairwiseDistanceOptions().p(1.5).eps(1e-05).keepdim(true)', input_fn=lambda: (torch.randn(10, 8), torch.randn(10, 8)), desc='with_non_default_args', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(8), torch.randn(8)), reference_fn=single_batch_reference_fn, desc='no_batch_dim', + default_dtype=torch.double, ), dict( module_name='TransformerEncoderLayer', @@ -2345,6 +2517,7 @@ def unsqueeze_inp(inp): # at non-singleton dimension 2 check_batched_grad=False, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='TransformerEncoderLayer', @@ -2358,6 +2531,7 @@ def unsqueeze_inp(inp): desc='gelu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='TransformerDecoderLayer', @@ -2370,6 +2544,7 @@ def unsqueeze_inp(inp): desc='relu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='TransformerDecoderLayer', @@ -2383,6 +2558,7 @@ def unsqueeze_inp(inp): desc='gelu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Transformer', @@ -2400,6 +2576,7 @@ def unsqueeze_inp(inp): desc='multilayer_coder', 
with_tf32=True, tf32_precision=0.03, + default_dtype=torch.double, ), dict( module_name='Linear', @@ -2410,6 +2587,7 @@ def unsqueeze_inp(inp): desc="no_batch_dim", with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Flatten', @@ -2418,6 +2596,7 @@ def unsqueeze_inp(inp): input_size=(3, 4, 5), reference_fn=single_batch_reference_fn, desc="no_batch_dim", + default_dtype=torch.double, ), dict( module_name='Unflatten', @@ -2426,6 +2605,7 @@ def unsqueeze_inp(inp): input_size=(3, 4, 5), reference_fn=single_batch_reference_fn, desc="no_batch_dim", + default_dtype=torch.double, ), dict( module_name='LayerNorm', @@ -2473,7 +2653,8 @@ def unsqueeze_inp(inp): cudnn=True, desc=f'{padding_mode}_stride2_pad2', with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), ) @@ -2485,13 +2666,32 @@ def unsqueeze_inp(inp): 'Tanhshrink', 'Threshold' ] non_linear_activations_extra_info: Dict[str, dict] = { - 'CELU': {'constructor_args': (2.,)}, + 'CELU': {'constructor_args': (2.,), 'default_dtype': torch.double}, 'Threshold': {'constructor_args': (2., 1.)}, - 'Hardsigmoid': {'check_gradgrad': False, 'check_jit': False}, - 'Hardswish': {'check_gradgrad': False, 'check_jit': False}, + 'Hardsigmoid': {'check_gradgrad': False, 'check_jit': False, 'default_dtype': torch.double}, + 'Hardswish': {'check_gradgrad': False, 'check_jit': False, 'default_dtype': torch.double}, # For RRelu, test that compare CPU and GPU results fail because RNG # is different between CPU and GPU - 'RReLU': {'test_cuda': False}, + 'RReLU': {'test_cuda': False, 'default_dtype': torch.double}, + 'ELU': {'default_dtype': torch.double}, + 'GELU': {'default_dtype': torch.double}, + 'GLU': {'default_dtype': torch.double}, + 'Hardshrink': {'default_dtype': torch.double}, + 'Hardtanh': {'default_dtype': torch.double}, + 'LeakyReLU': {'default_dtype': torch.double}, + 'LogSigmoid': {'default_dtype': torch.double}, + 'Mish': {'default_dtype': torch.double}, + 'PReLU': {'default_dtype': torch.double}, + 'ReLU6': {'default_dtype': torch.double}, + 'ReLU': {'default_dtype': torch.double}, + 'SELU': {'default_dtype': torch.double}, + 'SiLU': {'default_dtype': torch.double}, + 'Sigmoid': {'default_dtype': torch.double}, + 'Softplus': {'default_dtype': torch.double}, + 'Softshrink': {'default_dtype': torch.double}, + 'Softsign': {'default_dtype': torch.double}, + 'Tanh': {'default_dtype': torch.double}, + 'Tanhshrink': {'default_dtype': torch.double}, } for non_linear_activation in non_linear_activations_no_batch: activation_test_info = dict( @@ -2885,6 +3085,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, _: 1. 
/ i.numel() * sum((a - b).abs().sum() for a, b in zip(i, t)), check_complex=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2894,6 +3095,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2904,6 +3106,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, _: nllloss_reference(i, t, ignore_index=2), desc='ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2915,6 +3118,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m)), desc='weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2926,6 +3130,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m), ignore_index=2), desc='weights_ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2937,6 +3142,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m), ignore_index=-1), desc='weights_ignore_index_neg', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='KLDivLoss', @@ -2945,6 +3151,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: kldivloss_reference(i, t, get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='KLDivLoss', @@ -2956,14 +3163,16 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 kldivloss_log_target_reference(i, t, get_reduction(m)), check_sum_reduction=True, desc='log_target', + default_dtype=torch.double, ), dict( module_name='MSELoss', input_size=(2, 3, 4, 5), - target_fn=lambda: torch.randn((2, 3, 4, 5), requires_grad=True), + target_fn=lambda: torch.randn((2, 3, 4, 5), dtype=torch.double, requires_grad=True), reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() / (i.numel() if get_reduction(m) == 'mean' else 1)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='BCELoss', @@ -2972,6 +3181,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() / (i.numel() if get_reduction(m) else 1), check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='BCELoss', @@ -2983,11 +3193,13 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 (i.numel() if get_reduction(m) else 1), desc='weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', input_size=(15, 10), target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -2996,6 +3208,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 input_size=(15, 10), target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), desc='weights', + default_dtype=torch.double, ), dict( module_name='HingeEmbeddingLoss', @@ -3004,6 +3217,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: 
hingeembeddingloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='HingeEmbeddingLoss', @@ -3015,6 +3229,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 hingeembeddingloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelMarginLoss', @@ -3026,6 +3241,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, check_gradgrad=False, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelMarginLoss', @@ -3036,6 +3252,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, check_gradgrad=False, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelSoftMarginLoss', @@ -3043,6 +3260,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 target_fn=lambda: torch.rand(5, 10).mul(2).floor(), reference_fn=lambda i, t, m: -(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()).sum() / i.numel(), check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3052,6 +3270,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 multimarginloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3062,6 +3281,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='1d', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3074,6 +3294,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='p', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3087,12 +3308,13 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='margin', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', - constructor_args=(1, 1., torch.rand(10).to(torch.get_default_dtype())), + constructor_args=(1, 1., torch.rand(10, dtype=torch.double)), cpp_constructor_args='torch::nn::MultiMarginLossOptions().p(1).margin(1.).weight(torch::rand(10))', - legacy_constructor_args=(1, torch.rand(10).to(torch.get_default_dtype())), + legacy_constructor_args=(1, torch.rand(10, dtype=torch.double)), input_size=(5, 10), target_fn=lambda: torch.rand(5).mul(8).floor().long(), reference_fn=lambda i, t, m: @@ -3100,6 +3322,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='weights', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='SmoothL1Loss', @@ -3108,6 +3331,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, reference_fn=lambda i, t, m, b=1.0: smoothl1loss_reference(i, t, reduction=get_reduction(m), beta=b), + default_dtype=torch.double, ), dict( module_name='HuberLoss', @@ -3118,6 +3342,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_bfloat16=True, reference_fn=lambda i, t, m: huberloss_reference(i, t, reduction=get_reduction(m)), + default_dtype=torch.double, ), dict( 
module_name='SoftMarginLoss', @@ -3126,11 +3351,12 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: softmarginloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='CosineEmbeddingLoss', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), + input_fn=lambda: (torch.rand(15, 10, dtype=torch.double), torch.rand(15, 10, dtype=torch.double)), + target_fn=lambda: torch.randn(15, dtype=torch.double).sign(), reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, @@ -3139,8 +3365,8 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 module_name='CosineEmbeddingLoss', constructor_args=(0.7,), cpp_constructor_args='torch::nn::CosineEmbeddingLossOptions().margin(0.7)', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), + input_fn=lambda: (torch.rand(15, 10, dtype=torch.double), torch.rand(15, 10, dtype=torch.double)), + target_fn=lambda: torch.randn(15, dtype=torch.double).sign(), reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, margin=0.7, reduction=get_reduction(m)), desc='margin', @@ -3153,6 +3379,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: marginrankingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='MarginRankingLoss', @@ -3164,27 +3391,31 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 marginrankingloss_reference(i[0], i[1], t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).to(torch.get_default_dtype()), + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(10),), + constructor_args=(torch.rand(10, dtype=torch.double),), cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand(10))', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).to(torch.get_default_dtype()), desc='weights', + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(()),), + constructor_args=(torch.rand((), dtype=torch.double),), cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand({}))', input_fn=lambda: torch.rand(()).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(()).gt(0).to(torch.get_default_dtype()), - desc='scalar_weights' + desc='scalar_weights', + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3195,6 +3426,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3206,6 +3438,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['NLLLossNd'](i, t, weight=get_weight(m)), desc='2d_weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3217,6 +3450,7 @@ def ctcloss_reference(log_probs, targets, 
input_lengths, target_lengths, blank=0 loss_reference_fns['NLLLossNd'](i, t, ignore_index=1), desc='2d_ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3227,6 +3461,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='higher_dim', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3237,6 +3472,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='dim_is_3', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3247,6 +3483,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3258,6 +3495,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['CrossEntropyLoss'](i, t, weight=get_weight(m)), desc='2d_weights', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3269,6 +3507,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['CrossEntropyLoss'](i, t, ignore_index=1), desc='2d_ignore_index', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3279,6 +3518,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='higher_dim', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3289,6 +3529,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='dim_is_3', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3299,6 +3540,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3309,6 +3551,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='3d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3319,6 +3562,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='4d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing_sum_reduction', @@ -3330,6 +3574,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing', @@ -3340,6 +3585,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing_weight', @@ -3351,6 +3597,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: 
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_prob_target_smoothing_sum_reduction',
@@ -3362,6 +3609,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_prob_target_smoothing',
@@ -3372,6 +3620,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing',
@@ -3382,6 +3631,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_ignore_index',
@@ -3392,6 +3642,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction',
@@ -3402,6 +3653,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction_ignore_index',
@@ -3413,6 +3665,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing',
@@ -3423,6 +3676,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_sum_reduction',
@@ -3433,6 +3687,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_ignore_index',
@@ -3443,6 +3698,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=3),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_weight',
@@ -3454,6 +3710,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3466,6 +3723,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='2d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3478,6 +3736,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='3d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3490,6 +3749,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='4d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',  # Default is log_input=True, full=False
@@ -3497,6 +3757,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(),
         reference_fn=lambda i, t, _: (i.exp() - t.mul(i)).mean(),
         desc='no_full_loss',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3506,6 +3767,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(),
         reference_fn=lambda i, t, _: (i - t.mul((i + 1e-8).log())).mean(),
         desc='no_full_loss_no_log_input',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3516,6 +3778,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, _: (i.exp() - t.mul(i) +
                                       (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0)).mean(),
         desc='full_loss',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3527,6 +3790,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             i - t.mul((i + 1e-8).log()) + (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0)
         ).mean(),
         desc='full_loss_no_log_input',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='L1Loss',
@@ -3535,6 +3799,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, _: 1. / i.numel() * (i - t).abs().sum(),
         desc='scalar',
         check_complex=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='KLDivLoss',
@@ -3544,6 +3809,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             kldivloss_reference(i, t, get_reduction(m)),
         check_sum_reduction=True,
         desc='scalar',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='KLDivLoss',
@@ -3555,16 +3821,18 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             kldivloss_log_target_reference(i, t, get_reduction(m)),
         check_sum_reduction=True,
         desc='scalar_log_target',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MSELoss',
         input_size=(),
-        target_fn=lambda: torch.randn((), requires_grad=True),
+        target_fn=lambda: torch.randn((), requires_grad=True, dtype=torch.double),
         reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() /
                                       (i.numel() if get_reduction(m) == 'mean' else 1)),
         check_sum_reduction=True,
         desc='scalar',
         check_bfloat16=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MSELoss',
@@ -3586,6 +3854,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
                                       (i.numel() if get_reduction(m) == 'mean' else 1),
         desc='scalar_weights',
         check_bfloat16=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='HingeEmbeddingLoss',
@@ -3595,6 +3864,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(()).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1),
         desc='scalar_margin',
         check_sum_reduction=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='SmoothL1Loss',
@@ -3604,6 +3874,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m, b=1.0:
             smoothl1loss_reference(i, t, reduction=get_reduction(m), beta=b),
         desc='scalar',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MultiLabelSoftMarginLoss',
@@ -3616,6 +3887,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         desc='weights',
         check_sum_reduction=True,
         check_gradgrad=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3633,6 +3905,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths`
         test_cpp_api_parity=False,
         check_jit=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3648,6 +3921,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
     # Test is flaky
     # See https://github.com/pytorch/pytorch/issues/29380.
@@ -3680,6 +3954,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths`
         test_cpp_api_parity=False,
         check_jit=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3695,6 +3970,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3710,6 +3986,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
 ]
@@ -3760,6 +4037,7 @@ def flatten(xs):
         target_size=(3, ),
         reference_fn=single_batch_reference_criterion_fn,
         test_cpp_api_parity=False,
+        default_dtype=torch.double,
     )
     criterion_tests.append(regression_test_info)

@@ -3772,6 +4050,7 @@ def flatten(xs):
         target_fn=lambda: torch.rand((3,)),
         reference_fn=single_batch_reference_criterion_fn,
         test_cpp_api_parity=False,
+        default_dtype=torch.double,
     )
     criterion_tests.append(regression_test_info)

@@ -3779,18 +4058,30 @@ def flatten(xs):
 # Check that classification criterion work with no batch dimensions
 # List of tuples of (name, input_fn, target_fn)
 classification_criterion_no_batch = [
-    ('BCELoss', lambda: torch.sigmoid(torch.randn(9)), lambda: torch.randn(9).gt(0).to(torch.get_default_dtype())),
-    ('BCEWithLogitsLoss', lambda: torch.randn(9), lambda: torch.randn(9)),
-    ('HingeEmbeddingLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
-    ('MultiLabelMarginLoss', lambda: torch.randn(4), lambda: torch.tensor([3, 0, -1, 1])),
-    ('SoftMarginLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
-    ('NLLLoss', lambda: F.log_softmax(torch.randn(3), dim=0), lambda: torch.tensor(1)),
-    ('CosineEmbeddingLoss', lambda: (torch.randn(9), torch.randn(9)), lambda: torch.tensor(1)),
+    (
+        'BCELoss',
+        lambda: torch.sigmoid(torch.randn(9, dtype=torch.double)),
+        lambda: torch.randn(9, dtype=torch.double).gt(0).to(torch.double)
+    ),
+    ('BCEWithLogitsLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.randn(9, dtype=torch.double)),
+    ('HingeEmbeddingLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.tensor([-1, 1, 1] * 3)),
+    ('MultiLabelMarginLoss', lambda: torch.randn(4, dtype=torch.double), lambda: torch.tensor([3, 0, -1, 1])),
+    ('SoftMarginLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.tensor([-1, 1, 1] * 3)),
+    ('NLLLoss', lambda: F.log_softmax(torch.randn(3, dtype=torch.double), dim=0), lambda: torch.tensor(1)),
+    (
+        'CosineEmbeddingLoss',
+        lambda: (torch.randn(9, dtype=torch.double), torch.randn(9, dtype=torch.double)),
+        lambda: torch.tensor(1, dtype=torch.double)
+    ),
     # For MarginRankingLoss, input_fn : (x1, x2) and target_fn : target
     ('MarginRankingLoss', lambda: (torch.randn(()), torch.randn(())), lambda: torch.randn(()).sign()),
     # For TripletMarginLoss, input_fn : (anchor, positive) and target_fn : negative
-    ('TripletMarginLoss', lambda: (torch.randn(9), torch.randn(9)), lambda: torch.randn(9)),
-    ('MultiLabelSoftMarginLoss', lambda: torch.randn(9), lambda: torch.randn(9)),
+    (
+        'TripletMarginLoss',
+        lambda: (torch.randn(9, dtype=torch.double), torch.randn(9, dtype=torch.double)),
+        lambda: torch.randn(9, dtype=torch.double)
+    ),
+    ('MultiLabelSoftMarginLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.randn(9)),
 ]
 classification_criterion_no_batch_extra_info: Dict[str, dict] = {
     'MultiLabelMarginLoss': {'check_gradgrad': False},
@@ -4037,31 +4328,35 @@ def __init__(self, *args, **kwargs):
             kwargs.get('FIXME_no_cuda_gradgrad_comparison', False)
         self.precision = kwargs.get('precision', 2e-4)
         self.check_forward_only = kwargs.get('check_forward_only', False)
+        self.default_dtype = kwargs.get('default_dtype', None)
+        if self.default_dtype is None:
+            self.default_dtype = torch.get_default_dtype()

     def __call__(self, test_case):
-        module = self.constructor(*self.constructor_args)
-        input = self._get_input()
-
-        if self.reference_fn is not None:
-            out = test_case._forward(module, input)
-            ref_input = deepcopy(input)
-            ref_module = deepcopy(module)
-            expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0], ref_module)
-            test_case.assertEqual(out, expected_out, exact_dtype=False)
-        if self.check_forward_only:
-            return
-        self.test_noncontig(test_case, module, input)
-
-        if self.should_test_pickle:
-            # TODO: do this with in-memory files as soon as torch.save will support it
-            with tempfile.TemporaryFile() as f:
-                test_case._forward(module, input)
-                torch.save(module, f)
-                f.seek(0)
-                module_copy = torch.load(f)
-                test_case.assertEqual(test_case._forward(module, input), test_case._forward(module_copy, input))
-
-        self._do_test(test_case, module, input)
+        with set_default_dtype(self.default_dtype):
+            module = self.constructor(*self.constructor_args)
+            input = self._get_input()
+
+            if self.reference_fn is not None:
+                out = test_case._forward(module, input)
+                ref_input = deepcopy(input)
+                ref_module = deepcopy(module)
+                expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0], ref_module)
+                test_case.assertEqual(out, expected_out, exact_dtype=False)
+            if self.check_forward_only:
+                return
+            self.test_noncontig(test_case, module, input)
+
+            if self.should_test_pickle:
+                # TODO: do this with in-memory files as soon as torch.save will support it
+                with tempfile.TemporaryFile() as f:
+                    test_case._forward(module, input)
+                    torch.save(module, f)
+                    f.seek(0)
+                    module_copy = torch.load(f)
+                    test_case.assertEqual(test_case._forward(module, input), test_case._forward(module_copy, input))
+
+            self._do_test(test_case, module, input)

     def noncontiguize(self, obj):
         if isinstance(obj, list):
@@ -4125,92 +4420,94 @@ def test_cuda(self, test_case):
         if not TEST_CUDA or not self.should_test_cuda:
             raise unittest.SkipTest('Excluded from CUDA tests')

-        cpu_input = self._get_input()
-        type_map = {torch.double: torch.float}
-        cpu_input_tuple = cpu_input if isinstance(cpu_input, tuple) else (cpu_input,)
-
-        is_any_input_complex = any(isinstance(t, torch.Tensor) and t.dtype.is_complex for t in cpu_input_tuple)
-
-        gpu_input_tuple = to_gpu(cpu_input_tuple, type_map=type_map)
-
-        cpu_module = self.constructor(*self.constructor_args)
-        gpu_module = self.constructor(*self.constructor_args).float().cuda()
-        cpu_param = test_case._get_parameters(cpu_module)
-        gpu_param = test_case._get_parameters(gpu_module)
-        for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
-            gpu_p.data.copy_(cpu_p)
-
-        test_case._zero_grad_input(cpu_input_tuple)
-        test_case._zero_grad_input(gpu_input_tuple)
-        test_case._zero_grad_parameters(cpu_module)
-        test_case._zero_grad_parameters(gpu_module)
-        cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
-        gpu_output = test_case._forward(gpu_module, gpu_input_tuple)
-        if getattr(cpu_module, "return_indices", False):
-            cpu_output = cpu_output[0]
-            gpu_output = gpu_output[0]
-        test_case.assertEqual(cpu_output, gpu_output, atol=self.precision, rtol=0, exact_dtype=False)
-
-        # Run backwards on CPU and GPU and compare results
-        for _ in range(5):
-            cpu_gradOutput = cpu_output.clone().normal_()
-            gpu_gradOutput = cpu_gradOutput.type_as(gpu_output)
-            cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput)
-            gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput)
-            test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-            for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
-                test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0)
-
-        # Run double-backwards on CPU and GPU and compare results
-        if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
-            cpu_output = cpu_module(*cpu_input_tuple)
-            gpu_output = gpu_module(*gpu_input_tuple)
+        with set_default_dtype(self.default_dtype):
+            cpu_input = self._get_input()
+
+            type_map = {torch.double: torch.float}
+            cpu_input_tuple = cpu_input if isinstance(cpu_input, tuple) else (cpu_input,)
+
+            is_any_input_complex = any(isinstance(t, torch.Tensor) and t.dtype.is_complex for t in cpu_input_tuple)
+
+            gpu_input_tuple = to_gpu(cpu_input_tuple, type_map=type_map)
+
+            cpu_module = self.constructor(*self.constructor_args)
+            gpu_module = self.constructor(*self.constructor_args).float().cuda()
+            cpu_param = test_case._get_parameters(cpu_module)
+            gpu_param = test_case._get_parameters(gpu_module)
+            for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
+                gpu_p.data.copy_(cpu_p)
+
+            test_case._zero_grad_input(cpu_input_tuple)
+            test_case._zero_grad_input(gpu_input_tuple)
+            test_case._zero_grad_parameters(cpu_module)
+            test_case._zero_grad_parameters(gpu_module)
+            cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
+            gpu_output = test_case._forward(gpu_module, gpu_input_tuple)
             if getattr(cpu_module, "return_indices", False):
                 cpu_output = cpu_output[0]
                 gpu_output = gpu_output[0]
+            test_case.assertEqual(cpu_output, gpu_output, atol=self.precision, rtol=0, exact_dtype=False)
+
+            # Run backwards on CPU and GPU and compare results
+            for _ in range(5):
+                cpu_gradOutput = cpu_output.clone().normal_()
+                gpu_gradOutput = cpu_gradOutput.type_as(gpu_output)
+                cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput)
+                gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput)
+                test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
+                for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
+                    test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0)
+
+            # Run double-backwards on CPU and GPU and compare results
+            if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
+                cpu_output = cpu_module(*cpu_input_tuple)
+                gpu_output = gpu_module(*gpu_input_tuple)
+                if getattr(cpu_module, "return_indices", False):
+                    cpu_output = cpu_output[0]
+                    gpu_output = gpu_output[0]
+
+                cpu_gradOutput = torch.randn_like(cpu_output, requires_grad=True)
+                gpu_gradOutput = cpu_gradOutput.type_as(gpu_output).detach()
+                gpu_gradOutput.requires_grad = True
+
+                cpu_gradInputs = torch.autograd.grad(
+                    cpu_output,
+                    cpu_input_tuple + tuple(cpu_module.parameters()),
+                    cpu_gradOutput,
+                    create_graph=True)
+                gpu_gradInputs = torch.autograd.grad(
+                    gpu_output,
+                    gpu_input_tuple + tuple(gpu_module.parameters()),
+                    gpu_gradOutput,
+                    create_graph=True)
+
+                for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
+                    test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False)
+
+                # We mix output into the second backwards computation so that
+                # torch.autograd.grad doesn't complain that some inputs
+                # are unreachable (which can happen if you differentiate
+                # only on the gradient.
+                if is_any_input_complex:
+                    outputs_cpu = cpu_output.sum().abs() + sum(x.sum().abs() for x in cpu_gradInputs)
+                    outputs_gpu = gpu_output.sum().abs() + sum(x.sum().abs() for x in gpu_gradInputs)
+                else:
+                    outputs_cpu = cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs)
+                    outputs_gpu = gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs)

-            cpu_gradOutput = torch.randn_like(cpu_output, requires_grad=True)
-            gpu_gradOutput = cpu_gradOutput.type_as(gpu_output).detach()
-            gpu_gradOutput.requires_grad = True
-
-            cpu_gradInputs = torch.autograd.grad(
-                cpu_output,
-                cpu_input_tuple + tuple(cpu_module.parameters()),
-                cpu_gradOutput,
-                create_graph=True)
-            gpu_gradInputs = torch.autograd.grad(
-                gpu_output,
-                gpu_input_tuple + tuple(gpu_module.parameters()),
-                gpu_gradOutput,
-                create_graph=True)
-
-            for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
-                test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False)
-
-            # We mix output into the second backwards computation so that
-            # torch.autograd.grad doesn't complain that some inputs
-            # are unreachable (which can happen if you differentiate
-            # only on the gradient.
-            if is_any_input_complex:
-                outputs_cpu = cpu_output.sum().abs() + sum(x.sum().abs() for x in cpu_gradInputs)
-                outputs_gpu = gpu_output.sum().abs() + sum(x.sum().abs() for x in gpu_gradInputs)
-            else:
-                outputs_cpu = cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs)
-                outputs_gpu = gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs)
-
-            cpu_gg = torch.autograd.grad(
-                outputs_cpu,
-                cpu_input_tuple + (cpu_gradOutput,) + tuple(cpu_module.parameters()),
-                retain_graph=True)
-            gpu_gg = torch.autograd.grad(
-                outputs_gpu,
-                gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
-                retain_graph=True)
-            test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-            for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
-                test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False)
+                cpu_gg = torch.autograd.grad(
+                    outputs_cpu,
+                    cpu_input_tuple + (cpu_gradOutput,) + tuple(cpu_module.parameters()),
+                    retain_graph=True)
+                gpu_gg = torch.autograd.grad(
+                    outputs_gpu,
+                    gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
+                    retain_graph=True)
+                test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
+                for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
+                    test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False)

-        self.test_noncontig(test_case, gpu_module, gpu_input_tuple)
+            self.test_noncontig(test_case, gpu_module, gpu_input_tuple)


 class InputVariableMixin:
@@ -4445,42 +4742,46 @@ def __init__(self, *args, **kwargs):
         self.with_tf32 = kwargs.get('with_tf32', True)
         self.tf32_precision = kwargs.get('tf32_precision', 0.001)
         self.check_batched_grad = kwargs.get('check_batched_grad', True)
+        self.default_dtype = kwargs.get('default_dtype', None)
+        if self.default_dtype is None:
+            self.default_dtype = torch.get_default_dtype()

     def __call__(self, test_case):
-        module = self.constructor(*self.constructor_args)
-        input = self._get_input()
+        with set_default_dtype(self.default_dtype):
+            module = self.constructor(*self.constructor_args)
+            input = self._get_input()

-        # Check that these methods don't raise errors
-        module.__repr__()
-        str(module)
+            # Check that these methods don't raise errors
+            module.__repr__()
+            str(module)

-        target = self._get_target()
+            target = self._get_target()

-        if self.reference_fn is not None:
-            out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args)
-            ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,)
-            expected_out = self.reference_fn(*ref_args)
-            test_case.assertEqual(out, expected_out)
+            if self.reference_fn is not None:
+                out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args)
+                ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,)
+                expected_out = self.reference_fn(*ref_args)
+                test_case.assertEqual(out, expected_out)

-        if self.check_forward_only:
-            return
+            if self.check_forward_only:
+                return

-        params = tuple(x for x in module.parameters())
-        if not isinstance(input, tuple):
-            inputs = (input,) + params + (target,)
+            params = tuple(x for x in module.parameters())
+            if not isinstance(input, tuple):
+                inputs = (input,) + params + (target,)

-            def apply_fn(input, target, *params):
-                return module(input, target)
-        else:
-            inputs = input + params + (target,)
+                def apply_fn(input, target, *params):
+                    return module(input, target)
+            else:
+                inputs = input + params + (target,)

-            def apply_fn(input1, input2, target, *params):  # type: ignore[misc]
-                return module(input1, input2, target)
+                def apply_fn(input1, input2, target, *params):  # type: ignore[misc]
+                    return module(input1, input2, target)

-        gradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)
+            gradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)

-        if self.check_gradgrad:
-            gradgradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)
+            if self.check_gradgrad:
+                gradgradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)

     def test_cuda(self, test_case, dtype, extra_args=None):
         def convert_dtype(obj, dtype, requires_grad=False):
@@ -4494,43 +4795,44 @@ def convert_dtype(obj, dtype, requires_grad=False):
         if not TEST_CUDA or not self.should_test_cuda:
             raise unittest.SkipTest('Excluded from CUDA tests')

-        cpu_input = self._get_input()
-        cpu_target = self._get_target()
-        cpu_module = self.constructor(*self.constructor_args)
-        gpu_module = self.constructor(*self.constructor_args)
-
-        # Convert input, target and module parameters to dtype
-        cpu_input = convert_dtype(cpu_input, dtype, True)
-        if cpu_target.is_floating_point() or cpu_target.is_complex():
-            cpu_target = convert_dtype(cpu_target, dtype)
-        cpu_module.type(dtype)
-        gpu_module.type(dtype)
-
-        # GPU setup
-        gpu_input = to_gpu(cpu_input)
-        gpu_target = to_gpu(cpu_target)
-        gpu_module.cuda()
-
-        # torch.HalfTensor doesn't support most operations, converting back to default
-        if dtype in {torch.half, torch.bfloat16}:
+        with set_default_dtype(self.default_dtype):
             cpu_input = self._get_input()
             cpu_target = self._get_target()
-            # Loss modules with weights require consistent input/module weight types
             cpu_module = self.constructor(*self.constructor_args)
-
-        cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args)
-        gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args)
-        # dtype used to be able to be None, so set precision in this way instead of a precision map
-        test_case.assertEqual(cpu_output, gpu_output,
-                              atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
-
-        cpu_gradInput = test_case._backward_criterion(
-            cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
-        gpu_gradInput = test_case._backward_criterion(
-            gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
-        # dtype used to be able to be None, so set precision in this way instead of a precision map
-        test_case.assertEqual(cpu_gradInput, gpu_gradInput,
-                              atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
+            gpu_module = self.constructor(*self.constructor_args)
+
+            # Convert input, target and module parameters to dtype
+            cpu_input = convert_dtype(cpu_input, dtype, True)
+            if cpu_target.is_floating_point() or cpu_target.is_complex():
+                cpu_target = convert_dtype(cpu_target, dtype)
+            cpu_module.type(dtype)
+            gpu_module.type(dtype)
+
+            # GPU setup
+            gpu_input = to_gpu(cpu_input)
+            gpu_target = to_gpu(cpu_target)
+            gpu_module.cuda()
+
+            # torch.HalfTensor doesn't support most operations, converting back to default
+            if dtype in {torch.half, torch.bfloat16}:
+                cpu_input = self._get_input()
+                cpu_target = self._get_target()
+                # Loss modules with weights require consistent input/module weight types
+                cpu_module = self.constructor(*self.constructor_args)
+
+            cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args)
+            gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args)
+            # dtype used to be able to be None, so set precision in this way instead of a precision map
+            test_case.assertEqual(cpu_output, gpu_output,
+                                  atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
+
+            cpu_gradInput = test_case._backward_criterion(
+                cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
+            gpu_gradInput = test_case._backward_criterion(
+                gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
+            # dtype used to be able to be None, so set precision in this way instead of a precision map
+            test_case.assertEqual(cpu_gradInput, gpu_gradInput,
+                                  atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)

     def _get_target(self):
         return self._get_arg('target', False)
diff --git a/torch/testing/_internal/hypothesis_utils.py b/torch/testing/_internal/hypothesis_utils.py
index 15e7b4512a42a3..4ace78f7594e33 100644
--- a/torch/testing/_internal/hypothesis_utils.py
+++ b/torch/testing/_internal/hypothesis_utils.py
@@ -189,7 +189,7 @@ def array_shapes(draw, min_dims=1, max_dims=None, min_side=1, max_side=None, max
     (If `qparams` arg is None), returns None.
""" @st.composite -def tensor(draw, shapes=None, elements=None, qparams=None): +def tensor(draw, shapes=None, elements=None, qparams=None, dtype=np.float32): if isinstance(shapes, SearchStrategy): _shape = draw(shapes) else: @@ -197,7 +197,7 @@ def tensor(draw, shapes=None, elements=None, qparams=None): if qparams is None: if elements is None: elements = floats(-1e6, 1e6, allow_nan=False, width=32) - X = draw(stnp.arrays(dtype=np.float32, elements=elements, shape=_shape)) + X = draw(stnp.arrays(dtype=dtype, elements=elements, shape=_shape)) assume(not (np.isnan(X).any() or np.isinf(X).any())) return X, None qparams = draw(qparams) @@ -205,7 +205,7 @@ def tensor(draw, shapes=None, elements=None, qparams=None): min_value, max_value = _get_valid_min_max(qparams) elements = floats(min_value, max_value, allow_infinity=False, allow_nan=False, width=32) - X = draw(stnp.arrays(dtype=np.float32, elements=elements, shape=_shape)) + X = draw(stnp.arrays(dtype=dtype, elements=elements, shape=_shape)) # Recompute the scale and zero_points according to the X statistics. scale, zp = _calculate_dynamic_qparams(X, qparams[2]) enforced_zp = _ENFORCED_ZERO_POINT.get(qparams[2], None)