From 4d6a891baf2224cfa81bfe7632cf08be50812216 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Thu, 10 Aug 2023 14:56:13 +0000 Subject: [PATCH] Remove `set_default_dtype` from nn tests (#105775) Part of #68972 Pull Request resolved: https://github.com/pytorch/pytorch/pull/105775 Approved by: https://github.com/ezyang --- test/test_jit.py | 175 ++-- test/test_nn.py | 265 +++--- torch/testing/_internal/common_nn.py | 846 +++++++++++++------- torch/testing/_internal/hypothesis_utils.py | 6 +- 4 files changed, 817 insertions(+), 475 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index df9bc735ee1721..03fc3679e2c3d6 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -16106,6 +16106,10 @@ def do_test(self): if not kwargs.get('check_jit', True): raise unittest.SkipTest('module test skipped on JIT') + default_dtype = torch.get_default_dtype() + if 'default_dtype' in kwargs and kwargs['default_dtype'] is not None: + default_dtype = kwargs['default_dtype'] + module_name = get_nn_module_name_from_kwargs(**kwargs) if 'constructor' in kwargs: @@ -16116,89 +16120,96 @@ def do_test(self): if "FunctionalModule" in str(nn_module): return - if 'constructor_args_fn' in kwargs: - constructor_args = kwargs['constructor_args_fn']() - else: - constructor_args = kwargs.get('constructor_args', ()) - - def create_script_module(*args, **kwargs): - """Construct a script module that passes arguments through to self.submodule""" - formals, tensors, actuals = get_script_args(args) - - method_args = ', '.join(['self'] + actuals) - call_args_str = ', '.join(actuals) - call = f"self.submodule({call_args_str})" - script = script_method_template.format(method_args, call) - - submodule_constants = [] - if kwargs.get('is_constant'): - submodule_constants = ['submodule'] - - # Create module to use the script method - class TheModule(torch.jit.ScriptModule): - __constants__ = submodule_constants + with set_default_dtype(default_dtype): + if 'constructor_args_fn' in kwargs: + constructor_args = kwargs['constructor_args_fn']() + else: + constructor_args = kwargs.get('constructor_args', ()) + + def create_script_module(*args, **kwargs): + """Construct a script module that passes arguments through to self.submodule""" + formals, tensors, actuals = get_script_args(args) + + method_args = ', '.join(['self'] + actuals) + call_args_str = ', '.join(actuals) + call = f"self.submodule({call_args_str})" + script = script_method_template.format(method_args, call) + + submodule_constants = [] + if kwargs.get('is_constant'): + submodule_constants = ['submodule'] + + # Create module to use the script method + class TheModule(torch.jit.ScriptModule): + __constants__ = submodule_constants + + def __init__(self): + super().__init__() + self.submodule = nn_module(*constructor_args) + + def make_module(script): + module = TheModule() + # check __repr__ + str(module) + module.define(script) + return module + + module = make_module(script) + self.assertExportImportModule(module, tensors) + create_script_module.last_graph = module.graph + mod = module(*args) + return mod + + # Construct a normal nn module to stay consistent with create_script_module + # and make use of a single global rng_state in module initialization + def create_nn_module(*args, **kwargs): + module = nn_module(*constructor_args) + return module(*args) + + # Set up inputs from tuple of sizes or constructor fn + dtype = torch.get_default_dtype() + if 'input_fn' in kwargs: + input = kwargs['input_fn']() + if isinstance(input, Tensor): + input = (input,) + + if 
all(tensor.is_complex() for tensor in input): + if dtype == torch.float: + dtype = torch.cfloat + elif dtype == torch.double: + dtype = torch.cdouble + else: + raise AssertionError(f"default_dtype {default_dtype} is not supported") - def __init__(self): - super().__init__() - self.submodule = nn_module(*constructor_args) - - def make_module(script): - module = TheModule() - # check __repr__ - str(module) - module.define(script) - return module - - module = make_module(script) - self.assertExportImportModule(module, tensors) - create_script_module.last_graph = module.graph - mod = module(*args) - return mod - - # Construct a normal nn module to stay consistent with create_script_module - # and make use of a single global rng_state in module initialization - def create_nn_module(*args, **kwargs): - module = nn_module(*constructor_args) - return module(*args) - - # Set up inputs from tuple of sizes or constructor fn - dtype = torch.float - if 'input_fn' in kwargs: - input = kwargs['input_fn']() - if isinstance(input, Tensor): - input = (input,) - - if all(tensor.is_complex() for tensor in input): - dtype = torch.cfloat - else: - input = (kwargs['input_size'],) - - if 'target_size' in kwargs: - input = input + (kwargs['target_size'],) - elif 'target_fn' in kwargs: - if torch.is_tensor(input): - input = (input,) - input = input + (kwargs['target_fn'](),) - elif 'target' in kwargs: - input = input + (kwargs['target'],) - - # Extra parameters to forward() - if 'extra_args' in kwargs: - input = input + kwargs['extra_args'] - - args_variable, kwargs_variable = create_input(input, dtype=dtype) - f_args_variable = deepcopy(unpack_variables(args_variable)) - - # TODO(issue#52052) Neither this nor no_grad should be required - # if check_against_reference() is updated to check gradients - # w.r.t. weights and then only check w.r.t. inputs if any - # inputs require it. - any_requires_grad = any(input.requires_grad for input in f_args_variable) - - # Check against Python module as reference - check_against_reference(self, create_script_module, create_nn_module, - lambda x: x, f_args_variable, - no_grad=no_grad or not any_requires_grad) + else: + input = (kwargs['input_size'],) + + if 'target_size' in kwargs: + input = input + (kwargs['target_size'],) + elif 'target_fn' in kwargs: + if torch.is_tensor(input): + input = (input,) + input = input + (kwargs['target_fn'](),) + elif 'target' in kwargs: + input = input + (kwargs['target'],) + + # Extra parameters to forward() + if 'extra_args' in kwargs: + input = input + kwargs['extra_args'] + + args_variable, kwargs_variable = create_input(input, dtype=dtype) + f_args_variable = deepcopy(unpack_variables(args_variable)) + + # TODO(issue#52052) Neither this nor no_grad should be required + # if check_against_reference() is updated to check gradients + # w.r.t. weights and then only check w.r.t. inputs if any + # inputs require it. 
+ any_requires_grad = any(input.requires_grad for input in f_args_variable) + + # Check against Python module as reference + check_against_reference(self, create_script_module, create_nn_module, + lambda x: x, f_args_variable, + no_grad=no_grad or not any_requires_grad) if 'slowTest' in kwargs: do_test = slowTest(do_test) diff --git a/test/test_nn.py b/test/test_nn.py index 7d86573683aaf7..93c51a5bf88b2d 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -16,11 +16,6 @@ from unittest import SkipTest import torch - -# TODO: remove this global setting -# NN tests use double as the default dtype -torch.set_default_dtype(torch.double) - from torch import inf, nan import torch.autograd.forward_ad as fwAD import torch.backends.cudnn as cudnn @@ -39,7 +34,7 @@ download_file, get_function_arglist, load_tests, skipIfMps,\ IS_PPC, \ parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \ - skipIfTorchDynamo, IS_WINDOWS, gcIfJetson + skipIfTorchDynamo, IS_WINDOWS, gcIfJetson, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \ @@ -1180,7 +1175,7 @@ def test_ParameterList_meta(self): Parameter containing: tensor(..., device='meta', size=(1,), requires_grad=True)""") pl = torch.nn.ParameterList([p]) - self.assertExpectedInline(str(pl), """ParameterList( (0): Parameter containing: [torch.float64 of size 1])""") + self.assertExpectedInline(str(pl), """ParameterList( (0): Parameter containing: [torch.float32 of size 1])""") def test_ParameterList_replication(self): # The actual replication code from DP cannot be used on CPU so doing it manually here @@ -1827,6 +1822,7 @@ def test_weight_norm_pickle(self): self.assertIsInstance(m, nn.Linear) @skipIfTorchDynamo("TorchDynamo fails here for unknown reasons") + @set_default_dtype(torch.double) def test_spectral_norm(self): input = torch.randn(3, 5) m = nn.Linear(5, 7) @@ -2185,7 +2181,7 @@ def test_nested_tensor_from_mask_error(self): self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) def test_normalize(self): - inputs = torch.randn(1, 3, 4, 4, requires_grad=True) + inputs = torch.randn(1, 3, 4, 4, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=2, dim=-2), (inputs,))) @@ -2196,9 +2192,9 @@ def test_normalize(self): # Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190 @skipIfRocm def test_broadcast_double_backwards_gpu(self): - tensors = (torch.randn(4, 4, device='cuda', requires_grad=True), - torch.randn(4, 4, device='cuda', requires_grad=True), - torch.randn(4, 4, device='cuda', requires_grad=True)) + tensors = (torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double), + torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double), + torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double)) # TODO(#50743): the following segfaults with check_batched_grad=True _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), tensors, check_batched_grad=False) @@ -2991,7 +2987,7 @@ def test_CTCLoss_long_targets(self): batch_size = 4 target_length = 1200 - log_probs = torch.randn(input_length, batch_size, vocab_size).log_softmax(2).requires_grad_() + log_probs = 
torch.randn(input_length, batch_size, vocab_size, dtype=torch.double).log_softmax(2).requires_grad_() targets = torch.randint(low=1, high=vocab_size - 1, size=(batch_size, target_length), dtype=torch.long) input_lengths = batch_size * [input_length] target_lengths = batch_size * [target_length] @@ -3130,10 +3126,11 @@ def test_Transformer_cell(self): [(bsz, tgt_length, d_model), (tgt_length, bsz, d_model)]): transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, - dim_feedforward, dropout, batch_first=batch_first) - src = torch.randn(src_size) + dim_feedforward, dropout, batch_first=batch_first, + dtype=torch.double) + src = torch.randn(src_size, dtype=torch.double) src_mask = transformer.generate_square_subsequent_mask(seq_length).double() - tgt = torch.randn(tgt_size) + tgt = torch.randn(tgt_size, dtype=torch.double) tgt_mask = transformer.generate_square_subsequent_mask(tgt_length).double() memory_mask = torch.randn(tgt_length, seq_length).double() src_key_padding_mask = torch.rand(bsz, seq_length) >= 0.5 @@ -4518,6 +4515,7 @@ def test_partial_flat_weights(self): @unittest.skipIf(not (TEST_CUDNN and (TEST_CUDNN_VERSION if TEST_CUDNN_VERSION else 0) >= 5103), "needs cudnn >= 5.1") + @set_default_dtype(torch.double) def test_RNN_dropout(self): # checking the assumption that cuDNN sticks dropout in between # RNN layers @@ -4560,6 +4558,7 @@ def test_RNN_dropout(self): self.assertEqual(hy.data[0][0][0], 10) self.assertEqual(hy.data[1][0][0], output_val) + @set_default_dtype(torch.double) def test_error_RNN_seq_len_zero(self): # checking error message when RNN has seq_len = 0 for module in (nn.RNN, nn.LSTM, nn.GRU): @@ -4628,6 +4627,7 @@ def test_RNN_dropout_state(self): self.assertNotEqual(hy1, hy3) @unittest.skipIf(not (TEST_CUDNN and (TEST_CUDNN_VERSION if TEST_CUDNN_VERSION else 0) >= 5103), "needs cudnn >= 5.1") + @set_default_dtype(torch.double) def test_RNN_change_dropout(self): for train, cuda in product((True, False), repeat=2): rnn = nn.RNN(100, 100, 2, dropout=0, nonlinearity='relu') @@ -4775,6 +4775,7 @@ def test_pixel_shuffle_unshuffle_5D(): test_pixel_shuffle_unshuffle_4D() test_pixel_shuffle_unshuffle_5D() + @set_default_dtype(torch.double) def test_pixel_shuffle_nhwc_cpu(self): input = torch.randn(3, 18, 4, 4, device='cpu') input = input.contiguous(memory_format=torch.channels_last).requires_grad_() @@ -4799,7 +4800,7 @@ def test_pixel_shuffle_nhwc_cpu(self): # These tests should be OpInfo'd def test_elu_inplace_on_view(self): - v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) + v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4812,7 +4813,7 @@ def func(root): gradgradcheck(func, [v]) def test_elu_inplace_gradgrad(self): - v = torch.randn(8, requires_grad=True) + v = torch.randn(8, requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4822,7 +4823,7 @@ def func(root): gradgradcheck(func, [v]) def test_relu_inplace_on_view(self): - v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) + v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -4940,8 +4941,8 @@ def test_bce_with_logits_gives_same_result_as_sigmoid_and_bce_loss_large_tensors self.assertEqual(output_sig.grad, output_logits.grad) def test_bce_with_logits_has_correct_forward_grad(self): - output = torch.randn(3, 5, requires_grad=True) - target = torch.randn(3, 5) + output = torch.randn(3, 5, requires_grad=True, 
dtype=torch.double) + target = torch.randn(3, 5, dtype=torch.double) for reduction in ('sum', 'mean', 'none'): gradcheck(lambda self, target: nn.BCEWithLogitsLoss(reduction=reduction)(self, target), (output, target), check_forward_ad=True) @@ -5038,7 +5039,7 @@ def test_bce_loss_broadcasts_weights(self): self.assertEqual(out1, out2) def test_hardtanh_inplace_gradgrad(self): - v = torch.randn(8, requires_grad=True) + v = torch.randn(8, requires_grad=True, dtype=torch.double) def func(root): x = root.clone() @@ -5297,8 +5298,8 @@ def test_batchnorm_nhwc_cuda(self): self.assertTrue(torch.equal(out1, out2)) def test_pairwise_distance(self): - input1 = torch.randn(4, 4, requires_grad=True) - input2 = torch.randn(4, 4, requires_grad=True) + input1 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) + input2 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x, y: F.pairwise_distance(x, y), (input1, input2))) # TODO: Create an OpInfo for pdist @@ -5411,18 +5412,18 @@ def test_kl_div_log_softmax_target(self): ) def test_cosine_embedding_loss_no_reduce(self): - input1 = torch.randn(15, 10, requires_grad=True) - input2 = torch.randn(15, 10, requires_grad=True) - target = torch.randn(15).sign() + input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.cosine_embedding_loss( x, y, z, reduction='none'), (input1, input2, target))) self.assertEqual(F.cosine_embedding_loss(input1, input2, target, reduction='none'), loss_reference_fns['CosineEmbeddingLoss'](input1, input2, target, reduction='none')) def test_cosine_embedding_loss_margin_no_reduce(self): - input1 = torch.randn(15, 10, requires_grad=True) - input2 = torch.randn(15, 10, requires_grad=True) - target = torch.randn(15).sign() + input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double) + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.cosine_embedding_loss( x, y, z, margin=0.5, reduction='none'), (input1, input2, target))) self.assertEqual(F.cosine_embedding_loss(input1, input2, target, margin=0.5, reduction='none'), @@ -5444,54 +5445,54 @@ def test_cosine_embedding_loss_invalid_shape(self): F.cosine_embedding_loss(torch.randn(2, 5), torch.randn(2, 5), torch.randn(())) def test_margin_ranking_loss_no_reduce(self): - input1 = torch.randn(15).mul_(10).requires_grad_() - input2 = torch.randn(15).mul_(10).requires_grad_() - target = torch.randn(15).sign() + input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + target = torch.randn(15, dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, reduction='none'), (input1, input2, target))) self.assertEqual(F.margin_ranking_loss(input1, input2, target, reduction='none'), loss_reference_fns['MarginRankingLoss'](input1, input2, target, reduction='none')) def test_margin_ranking_loss_margin_no_reduce(self): - input1 = torch.randn(15).mul_(10).requires_grad_() - input2 = torch.randn(15).mul_(10).requires_grad_() - target = torch.randn(15).sign() + input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_() + target = torch.randn(15, 
dtype=torch.double).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, margin=0.5, reduction='none'), (input1, input2, target))) self.assertEqual(F.margin_ranking_loss(input1, input2, target, margin=0.5, reduction='none'), loss_reference_fns['MarginRankingLoss'](input1, input2, target, margin=0.5, reduction='none')) def test_triplet_margin_loss(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3), loss_reference_fns['TripletMarginLoss'](input1, input2, input3)) def test_triplet_margin_loss_swap(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, swap=True), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True)) def test_triplet_margin_loss_no_reduce(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, reduction='none'), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, reduction='none')) def test_triplet_margin_loss_swap_no_reduce(self): - input1 = torch.randn(5, 10, requires_grad=True) - input2 = torch.randn(5, 10, requires_grad=True) - input3 = torch.randn(5, 10, requires_grad=True) + input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) + input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double) self.assertTrue(gradcheck(lambda x1, x2, x3: F.triplet_margin_loss( x1, x2, x3, swap=True, reduction='none'), (input1, input2, input3))) self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), @@ -5511,11 +5512,11 @@ def test_pointwise_loss_broadcast(self): 'huber_loss': lambda x, y, r: F.huber_loss(x, y, reduction=r), } - input = torch.randn(2, 1, requires_grad=True) + input = torch.randn(2, 1, requires_grad=True, dtype=torch.double) for fn in losses.values(): for requires_grad in [True, False]: # When target.requires_grad=True, its impl is in Python, while the other is in TH. 
- target = torch.randn(2, 10, requires_grad=requires_grad) + target = torch.randn(2, 10, requires_grad=requires_grad, dtype=torch.double) for reduction in ['none', 'mean', 'sum']: l = fn(input, target, reduction) if reduction == 'none': @@ -5571,6 +5572,7 @@ def test_huber_loss_zero_delta(): test_huber_loss_negative_delta() test_huber_loss_zero_delta() + @set_default_dtype(torch.double) def test_cosine_similarity(self): # Check cosine_similarity input/output shapes input_size = (1, 3, 2, 1) @@ -5715,6 +5717,7 @@ def test_affine_grid_error_checking(self): with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): F.affine_grid(theta, torch.Size([1, 1, 2, 2, 2, 2]), align_corners=False) + @set_default_dtype(torch.double) def test_grid_sample(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases. @@ -6075,6 +6078,7 @@ def get_grid(device='cpu', data=None): with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) + @set_default_dtype(torch.double) def test_grid_sample_3d(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases. @@ -6365,6 +6369,7 @@ def normalize_indices(indices_unnormalized: torch.Tensor, dim_size: int, align_c ) self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_3d_z[0, 0, 0, 0, :], atol=0, rtol=0) + @set_default_dtype(torch.double) def test_affine_grid(self): # test known input on CPU input = torch.arange(1., 7).view(1, 2, 3) @@ -6413,6 +6418,7 @@ def test_affine_grid(self): self.assertEqual(out_cpu, out_cuda) self.assertEqual(input_cpu.grad, input_gpu.grad) + @set_default_dtype(torch.double) def test_affine_grid_3d(self): # test known input on CPU input = torch.arange(1., 13).view(1, 3, 4) @@ -6472,6 +6478,7 @@ def test_channel_shuffle_return_alias_of_self(self): output = torch.nn.ChannelShuffle(groups)(input_tensor) torch.testing.assert_close(output, input_tensor) + @set_default_dtype(torch.double) def test_upsamplingLinear1d(self): for align_corners in [True, False]: for recompute_scale_factor in [True, False]: @@ -6502,6 +6509,7 @@ def test_upsamplingLinear1d_spatial_invariance(self): out_t_5 = m(in_t_9[:, :, :5]) self.assertEqual(out_t_9[:, :, :15], out_t_5) + @set_default_dtype(torch.double) def test_upsampling_not_recompute_scale_factor(self): # test output against known input: result must match opencv in_t = torch.arange(8.).view(1, 2, 2, 2) @@ -6568,7 +6576,7 @@ def test_upsamplingTrilinear3d_spatial_invariance(self): def test_upsampling_small_scale(self): m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") - in_t = torch.arange(1, 5, dtype=torch.float64).reshape(1, 1, 2, 2) + in_t = torch.arange(1, 5, dtype=torch.get_default_dtype()).reshape(1, 1, 2, 2) out_t = m(in_t) expected_out_t = torch.tensor([[[[2.5]]]]) self.assertEqual(expected_out_t, out_t) @@ -6706,6 +6714,7 @@ def helper(size, dtype, mode, device, is_channels_last): helper(size, dtype, mode, device, is_channels_last) + @set_default_dtype(torch.double) def test_interpolate(self): def _test_interpolate_non_integer_size_warning(in_t, out_size, dim, **kwargs): test_sizes = [float(out_size), @@ -6858,15 +6867,15 @@ def run(input1_tp, input2_tp): self.assertEqual(g2, g2_nc) def test_bilinear_no_bias(self): - module = nn.Bilinear(10, 10, 8) - module_no_bias = nn.Bilinear(10, 10, 8, False) + module = nn.Bilinear(10, 10, 8, dtype=torch.double) + module_no_bias = 
nn.Bilinear(10, 10, 8, False, dtype=torch.double) module.bias.data.zero_() module.weight.data.copy_(module_no_bias.weight) - input1 = torch.randn(4, 10, requires_grad=True) - input2 = torch.randn(4, 10, requires_grad=True) - grad_output = torch.randn(4, 8) + input1 = torch.randn(4, 10, requires_grad=True, dtype=torch.double) + input2 = torch.randn(4, 10, requires_grad=True, dtype=torch.double) + grad_output = torch.randn(4, 8, dtype=torch.double) def run(net): input1.grad = input2.grad = None @@ -7296,20 +7305,21 @@ def test_max_pool1d_invalid_output_size(self): res = arg_class(*arg_4) class TestFusionEval(TestCase): - @given(X=hu.tensor(shapes=((5, 3, 5, 5),)), - running_mean=hu.tensor(shapes=(6,)), - running_var=hu.tensor(shapes=(6,))) + @set_default_dtype(torch.double) + @given(X=hu.tensor(shapes=((5, 3, 5, 5),), dtype=np.double), + running_mean=hu.tensor(shapes=(6,), dtype=np.double), + running_var=hu.tensor(shapes=(6,), dtype=np.double)) def test_fuse_module_eval_numerics(self, X, running_mean, running_var): inputs, _ = X iC, oC = inputs.shape[1], len(running_mean[0]) - inputs = torch.from_numpy(inputs).to(torch.double) + inputs = torch.from_numpy(inputs) kernel_size = (3, 3) conv_ref = torch.nn.Conv2d(iC, oC, bias=True, kernel_size=kernel_size) bn_ref = torch.nn.BatchNorm2d(oC) - bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) - bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + bn_ref.running_mean = torch.from_numpy(running_mean[0]) + bn_ref.running_var = torch.from_numpy(running_var[0]) conv_ref.eval() bn_ref.eval() @@ -7322,8 +7332,8 @@ def test_fuse_module_eval_numerics(self, X, running_mean, running_var): self.assertEqual(Y_ref, Y_hat, msg="Conv+BN fusion results are off") na_bn_ref = torch.nn.BatchNorm2d(oC, affine=False) - na_bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) - na_bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + na_bn_ref.running_mean = torch.from_numpy(running_mean[0]) + na_bn_ref.running_var = torch.from_numpy(running_var[0]) na_bn_ref.eval() Y_ref = na_bn_ref(conv_ref(inputs)) @@ -7566,20 +7576,23 @@ def forward(self, input): nn.MaxPool1d(2, return_indices=True), nn.MaxUnpool1d(2)), input_size=(1, 1, 4), - fullname='MaxUnpool1d_net',)) + fullname='MaxUnpool1d_net', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool2d(2, return_indices=True), nn.MaxUnpool2d(2)), input_size=(1, 1, 2, 4), - fullname='MaxUnpool2d_net',)) + fullname='MaxUnpool2d_net', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool3d(2, return_indices=True), nn.MaxUnpool3d(2)), input_size=(1, 1, 2, 4, 6), fullname='MaxUnpool3d_net', - check_gradgrad=False,)) + check_gradgrad=False, + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( @@ -7587,14 +7600,16 @@ def forward(self, input): nn.MaxUnpool1d(2)), input_size=(1, 4), reference_fn=single_batch_reference_fn, - fullname='MaxUnpool1d_net_no_batch_dim',)) + fullname='MaxUnpool1d_net_no_batch_dim', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( nn.MaxPool2d(2, return_indices=True), nn.MaxUnpool2d(2)), input_size=(1, 2, 4), reference_fn=single_batch_reference_fn, - fullname='MaxUnpool2d_net_no_batch_dim',)) + fullname='MaxUnpool2d_net_no_batch_dim', + default_dtype=torch.double,)) add_test(NewModuleTest( constructor=lambda: UnpoolingNet( @@ -7603,7 +7618,8 @@ def forward(self, 
input): input_size=(1, 2, 4, 6), reference_fn=single_batch_reference_fn, fullname='MaxUnpool3d_net_no_batch_dim', - check_gradgrad=False)) + check_gradgrad=False, + default_dtype=torch.double,)) class _AdaptiveLogSoftmaxWithLoss(nn.AdaptiveLogSoftmaxWithLoss): def __call__(self, input): @@ -7615,7 +7631,8 @@ def __call__(self, input): input_size=(4, 16), fullname='AdaptiveLogSoftmax', with_tf32=True, - tf32_precision=0.005)) + tf32_precision=0.005, + default_dtype=torch.double)) # The following are helpers for TestNN.test_affine_* @@ -8804,8 +8821,9 @@ def test_TransformerEncoderLayer_empty(self, device): for training in (True, False): for batch_first, input_shape in [(True, (0, 10, 512)), (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + input = torch.rand(*input_shape, device=device, dtype=torch.double) + encoder_layer = nn.TransformerEncoderLayer( + d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) if not training: encoder_layer = encoder_layer.eval() with torch.no_grad(): @@ -8820,7 +8838,7 @@ def test_TransformerEncoderLayer_empty(self, device): nt = torch.nested.nested_tensor([], device=device) _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True) - nt = torch.nested.nested_tensor([torch.rand(0, 512, device=device)], device=device) + nt = torch.nested.nested_tensor([torch.rand(0, 512, device=device, dtype=torch.double)], device=device) _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True) else: _test_module_empty_input(self, encoder_layer, input, check_size=False) @@ -8830,8 +8848,8 @@ def test_TransformerEncoderLayer_empty(self, device): def test_TransformerEncoder_empty(self, device): for batch_first, input_shape in [(True, (0, 10, 512)), (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + input = torch.rand(*input_shape, device=device, dtype=torch.double) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6).to(device) _test_module_empty_input(self, transformer_encoder, input, check_size=False) @@ -8840,9 +8858,9 @@ def test_TransformerEncoder_empty(self, device): def test_TransformerDecoderLayer_empty(self, device): for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), (False, (10, 0, 512), (20, 0, 512))]: - memory = torch.rand(*memory_shape, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) - decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + memory = torch.rand(*memory_shape, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) self._test_module_empty_inputs(decoder_layer, [tgt, memory]) @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @@ -8850,9 +8868,9 @@ def test_TransformerDecoderLayer_empty(self, device): def test_TransformerDecoder_empty(self, device): for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), (False, (10, 0, 512), 
(20, 0, 512))]: - memory = torch.rand(*memory_shape, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) - decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + memory = torch.rand(*memory_shape, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device) transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6).to(device) self._test_module_empty_inputs(transformer_decoder, [tgt, memory]) @@ -8860,9 +8878,9 @@ def test_TransformerDecoder_empty(self, device): @onlyNativeDeviceTypes def test_Transformer_empty(self, device): for batch_first, src_shape, tgt_shape in [(True, (10, 0, 512), (20, 0, 512))]: - transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12).to(device) - src = torch.rand(*src_shape, requires_grad=True, device=device) - tgt = torch.rand(*tgt_shape, requires_grad=True, device=device) + transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12, dtype=torch.double).to(device) + src = torch.rand(*src_shape, requires_grad=True, device=device, dtype=torch.double) + tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double) self._test_module_empty_inputs(transformer_model, [src, tgt]) @onlyNativeDeviceTypes @@ -9216,7 +9234,7 @@ def v(fn): v(lambda: F.hinge_embedding_loss(input, input, reduction=reduction)) v(lambda: F.poisson_nll_loss(input, input, reduction=reduction)) v(lambda: F.gaussian_nll_loss(input, input, var, reduction=reduction)) - v(lambda: F.binary_cross_entropy(torch.sigmoid(input), input.gt(0).double(), reduction=reduction)) + v(lambda: F.binary_cross_entropy(torch.sigmoid(input), input.gt(0).to(torch.get_default_dtype()), reduction=reduction)) v(lambda: F.binary_cross_entropy_with_logits(input, input, reduction=reduction)) zeros = torch.zeros_like(input).to(torch.int64) @@ -9363,27 +9381,27 @@ def test_upsamplingNearest1d(self, device, mode): check_forward_ad = torch.device(device).type != 'xla' m = nn.Upsample(size=4, mode=mode) - in_t = torch.ones(1, 1, 2, device=device) + in_t = torch.ones(1, 1, 2, device=device, dtype=torch.double) in_uint8_t = torch.ones(1, 1, 2, dtype=torch.uint8, device=device) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) out_uint8_t = m(in_uint8_t) - self.assertEqual(torch.ones(1, 1, 4, device=device), out_t.data) + self.assertEqual(torch.ones(1, 1, 4, device=device, dtype=torch.double), out_t.data) self.assertEqual(torch.ones(1, 1, 4, dtype=torch.uint8, device=device), out_uint8_t.data) # Checks upsampling - input = torch.randn(1, 1, 2, requires_grad=True, device=device) + input = torch.randn(1, 1, 2, requires_grad=True, device=device, dtype=torch.double) gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) # Checks downsampling - input = torch.randn(1, 1, 20, requires_grad=True, device=device) + input = torch.randn(1, 1, 20, requires_grad=True, device=device, dtype=torch.double) gradcheck(lambda x: F.interpolate(x, 11, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) # consistency CUDA/CPU check if torch.device(device).type == 'cuda': - input_cuda = 
torch.randn(1, 1, 20, device=device) + input_cuda = torch.randn(1, 1, 20, device=device, dtype=torch.double) input_cpu = input_cuda.cpu() output_cuda = F.interpolate(input_cuda, 4, mode=mode) output_cpu = F.interpolate(input_cpu, 4, mode=mode) @@ -9459,32 +9477,36 @@ def test_upsamplingNearest2d(self, device, memory_format, mode): # Forward AD does not support XLA because XLA tensors don't have storage check_forward_ad = torch.device(device).type != 'xla' - in_t = torch.ones(1, 2, 2, 2, device=device).contiguous(memory_format=memory_format) + in_t = torch.ones(1, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format) in_uint8_t = torch.ones(1, 2, 2, 2, dtype=torch.uint8, device=device).contiguous(memory_format=memory_format) with warnings.catch_warnings(record=True) as w: out_t = F.interpolate(in_t, size=4, mode=mode) out_uint8_t = F.interpolate(in_uint8_t, size=4, mode=mode) self.assertEqual(len(w), 0) - self.assertEqual(torch.ones(1, 2, 4, 4, device=device), out_t) + self.assertEqual(torch.ones(1, 2, 4, 4, device=device, dtype=torch.double), out_t) self.assertEqual(torch.ones(1, 2, 4, 4, dtype=torch.uint8, device=device), out_uint8_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) # test forward when input's height is not same as width - in_t = torch.ones(1, 2, 2, 1, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones(1, 2, 2, 1, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() out_t = F.interpolate(in_t, size=(4, 2), mode=mode) - self.assertEqual(torch.ones(1, 2, 4, 2, device=device), out_t) + self.assertEqual(torch.ones(1, 2, 4, 2, device=device, dtype=torch.double), out_t) self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) out_t.backward(torch.randn_like(out_t)) self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) # test backward when input's height is not same as width - input = torch.ones(1, 2, 2, 1, requires_grad=True, device=device).contiguous(memory_format=memory_format) + input = torch.ones( + 1, 2, 2, 1, requires_grad=True, device=device, + dtype=torch.double).contiguous(memory_format=memory_format) gradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_fwd_over_rev=check_forward_ad) - input = torch.randn(1, 2, 2, 2, requires_grad=True, device=device).contiguous(memory_format=memory_format) + input = torch.randn( + 1, 2, 2, 2, requires_grad=True, device=device, + dtype=torch.double).contiguous(memory_format=memory_format) self.assertEqual( F.interpolate(input, 4, mode=mode), F.interpolate(input, scale_factor=2, mode=mode)) @@ -9497,7 +9519,9 @@ def test_upsamplingNearest2d(self, device, memory_format, mode): for shapes, scale_factor in product([ (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) ], [0.5, 1.5, 2]): - a_cuda = torch.randn(*shapes, device=device).contiguous(memory_format=memory_format).requires_grad_() + a_cuda = torch.randn( + *shapes, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() a_cpu = a_cuda.detach().cpu().requires_grad_() out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, mode=mode) @@ -9565,14 +9589,14 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): check_forward_ad = torch.device(device).type != 'xla' m = nn.Upsample(size=4, 
mode=mode) - in_t = torch.ones(1, 2, 2, 2, 2, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones(1, 2, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() in_uint8_t = torch.ones( 1, 2, 2, 2, 2, dtype=torch.uint8, device=device ).contiguous(memory_format=memory_format) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) out_uint8_t = m(in_uint8_t) - expected_output = torch.ones(1, 2, 4, 4, 4, device=device) + expected_output = torch.ones(1, 2, 4, 4, 4, device=device, dtype=torch.double) self.assertEqual(expected_output, out_t) self.assertEqual(expected_output.to(torch.uint8), out_uint8_t) # Assert that memory format is carried through to the output @@ -9581,7 +9605,7 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) input = torch.randn( - 1, 2, 2, 2, 2, requires_grad=True, device=device + 1, 2, 2, 2, 2, requires_grad=True, device=device, dtype=torch.double ).contiguous(memory_format=memory_format) gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad) gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad) @@ -9590,7 +9614,7 @@ def test_upsamplingNearest3d(self, device, memory_format, mode): # https://github.com/pytorch/pytorch/issues/54590 if torch.device(device).type == 'cuda': a = torch.ones( - 2, 2, 2, 3, 4, device=device, requires_grad=True + 2, 2, 2, 3, 4, device=device, requires_grad=True, dtype=torch.double ).contiguous(memory_format=torch.channels_last_3d) # make the data asymmetric; ensure that cuda/cpu handle channels_last appropriately. a[1][1][1][2][2] = a[1][1][1][2][3] = 0 @@ -9670,11 +9694,13 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias) # test float scale factor up & downsampling for scale_factor in [0.5, 1.5, 2]: - in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + in_t = torch.ones( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() out_size = int(math.floor(in_t.shape[-1] * scale_factor)) with warnings.catch_warnings(record=True) as w: out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) - expected_out = torch.ones(2, 3, out_size, out_size, device=device) + expected_out = torch.ones(2, 3, out_size, out_size, device=device, dtype=torch.double) self.assertEqual(expected_out, out_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) @@ -9687,7 +9713,9 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory else: nondet_tol = 0.0 - input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_() + input = torch.randn( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() gradcheck( lambda x: F.interpolate(x, out_size, **kwargs), [input], @@ -9705,7 +9733,7 @@ def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) ]: a_cuda = torch.randn( - *shapes, device=device + *shapes, device=device, dtype=torch.double ).contiguous(memory_format=memory_format).requires_grad_() a_cpu = 
a_cuda.detach().cpu().requires_grad_() @@ -9922,19 +9950,19 @@ def test_upsamplingTrilinear3d(self, device, align_corners): # test float scale factor up & downsampling for scale_factor in [0.5, 1.5, 2]: m = nn.Upsample(scale_factor=scale_factor, **kwargs) - in_t = torch.ones(1, 2, 2, 2, 2, device=device) + in_t = torch.ones(1, 2, 2, 2, 2, device=device, dtype=torch.double) in_t = in_t.contiguous(memory_format=memory_format).requires_grad_() out_size = int(math.floor(in_t.shape[-1] * scale_factor)) with warnings.catch_warnings(record=True) as w: out_t = m(in_t) - expected_out = torch.ones(1, 2, out_size, out_size, out_size, device=device) + expected_out = torch.ones(1, 2, out_size, out_size, out_size, device=device, dtype=torch.double) self.assertEqual(expected_out, out_t) # Assert that memory format is carried through to the output self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) out_t.backward(torch.randn_like(out_t)) self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) - input = torch.randn(1, 2, 2, 2, 2, requires_grad=True) + input = torch.randn(1, 2, 2, 2, 2, requires_grad=True, dtype=torch.double) self.assertEqual( F.interpolate(input, (out_size, out_size, out_size), **kwargs), F.interpolate(input, scale_factor=scale_factor, **kwargs)) @@ -10922,14 +10950,14 @@ def test_layernorm_weight_bias(self): self.assertEqual(out_none_bias, out_zero_bias) def test_hardsigmoid_grad(self, device): - inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10 + inputs = (torch.randn(4, 16, 16, device=device, dtype=torch.double) - 0.5) * 10 inputs.requires_grad = True self.assertTrue(gradcheck(F.hardsigmoid, (inputs,))) # currently fails on XLA @onlyNativeDeviceTypes def test_hardswish_grad(self, device): - inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10 + inputs = (torch.randn(4, 16, 16, device=device, dtype=torch.double) - 0.5) * 10 inputs.requires_grad = True self.assertTrue(gradcheck(F.hardswish, (inputs,))) @@ -11654,6 +11682,7 @@ def test_cross_entropy_label_smoothing_errors(self, device): r"label_smoothing must be between 0\.0"): loss(*input_arg) + @set_default_dtype(torch.double) def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device): N, C = 10, 4 ks = range(5) @@ -11827,7 +11856,7 @@ def func(x): seeds = (44, 83, 71, 25, 999) for sd in seeds: torch.manual_seed(sd) - x = torch.randn(1, 12, 12, device=device, requires_grad=True) + x = torch.randn(1, 12, 12, device=device, requires_grad=True, dtype=torch.double) gradcheck(func, [x], check_forward_ad=True) gradgradcheck(func, [x], check_fwd_over_rev=True) if device == 'cpu': @@ -12103,9 +12132,9 @@ def test_triplet_margin_with_distance_loss_default_parity(self, device): itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} - anchor = torch.randn(5, 10, device=device, requires_grad=True) - positive = torch.randn(5, 10, device=device, requires_grad=True) - negative = torch.randn(5, 10, device=device, requires_grad=True) + anchor = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + positive = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + negative = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) # Test forward, functional expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) @@ -12144,9 +12173,9 @@ def cosine_distance(x, y): for distance_fn, reduction, margin, swap \ in 
itertools.product(distance_functions, reductions, margins, swaps): - anchor = torch.randn(5, 10, device=device, requires_grad=True) - positive = torch.randn(5, 10, device=device, requires_grad=True) - negative = torch.randn(5, 10, device=device, requires_grad=True) + anchor = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + positive = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) + negative = torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double) # Test backward self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( @@ -12454,7 +12483,7 @@ def test_transformerencoderlayer_fast_path(self, device, dtype): model.eval() # Batched inputs - src = torch.rand(batch_size, src_len, 512) + src = torch.rand(batch_size, src_len, 512, dtype=dtype) # Attention mask of shape (src_len, src_len) src_mask = torch.zeros(src_len, src_len).to(torch.bool) diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 504748f7d58f90..b6898121bfa343 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -15,7 +15,7 @@ import torch.nn.functional as F from torch.nn import _reduction as _Reduction from torch.testing._internal.common_utils import TestCase, to_gpu, freeze_rng_state, is_iterable, \ - gradcheck, gradgradcheck + gradcheck, gradgradcheck, set_default_dtype from torch.testing._internal.common_cuda import TEST_CUDA from torch.autograd.gradcheck import _get_numerical_jacobian, _iter_tensors from torch.autograd import Variable @@ -106,6 +106,7 @@ def get_weight(m): reference_fn=lambda i, p, _: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Linear', @@ -116,11 +117,13 @@ def get_weight(m): reference_fn=lambda i, p, _: torch.mm(i, p[0].t()), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='RReLU', input_size=(1, 2, 2), test_cuda=False, + default_dtype=torch.double, ), dict( module_name='RReLU', @@ -129,11 +132,13 @@ def get_weight(m): input_size=(4, 4, 5), desc='with_up_down', test_cuda=False, + default_dtype=torch.double, ), dict( module_name='Flatten', input_size=(2, 3, 4, 5), - reference_fn=lambda i, *_: torch.flatten(i, 1) + reference_fn=lambda i, *_: torch.flatten(i, 1), + default_dtype=torch.double, ), # TODO: reference function dict( @@ -144,6 +149,7 @@ def get_weight(m): check_gradgrad=False, # TODO(#50743): Figure out the error. 
"RuntimeError: Unrecognized tensor type ID: Batched" check_batched_grad=False, + default_dtype=torch.double, ), ] @@ -174,11 +180,12 @@ def poissonnllloss_no_reduce_test(): input_fn=lambda: torch.rand(10, 10), cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: i.exp() - t.mul(i), - pickle=False) + pickle=False, + default_dtype=torch.double) def bceloss_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) return dict( fullname='BCELoss_no_reduce', constructor=wrap_functional( @@ -189,11 +196,12 @@ def bceloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), pickle=False, - precision=7e-4) + precision=7e-4, + default_dtype=torch.double) def bceloss_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) + t = torch.randn(()).gt(0).to(torch.double) return dict( fullname='BCELoss_no_reduce_scalar', constructor=wrap_functional( @@ -203,12 +211,13 @@ def bceloss_no_reduce_scalar_test(): input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - pickle=False) + pickle=False, + default_dtype=torch.double) def bceloss_weights_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) - weights = torch.rand(10) + t = Variable(torch.randn(15, 10, dtype=torch.double).gt(0).to(torch.double)) + weights = torch.rand(10, dtype=torch.double) return dict( fullname='BCELoss_weights_no_reduce', constructor=wrap_functional( @@ -221,13 +230,14 @@ def bceloss_weights_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, reference_fn=lambda i, p, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, pickle=False, - precision=3e-4 + precision=3e-4, + default_dtype=torch.double, ) def bceloss_weights_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) - weights = torch.rand(()) + t = torch.randn(()).gt(0).to(torch.double) + weights = torch.rand((), dtype=torch.double) return dict( fullname='BCELoss_weights_no_reduce_scalar', constructor=wrap_functional( @@ -239,12 +249,13 @@ def bceloss_weights_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t, 'weights': weights}, input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - pickle=False + pickle=False, + default_dtype=torch.double, ) def bce_with_logistic_legacy_enum_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_legacy_enum', @@ -257,11 +268,12 @@ def bce_with_logistic_legacy_enum_test(): reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, pickle=False, + default_dtype=torch.double, ) def bce_with_logistic_no_reduce_test(): - t = Variable(torch.randn(15, 10).gt(0).to(torch.get_default_dtype())) + t = Variable(torch.randn(15, 10).gt(0).to(torch.double)) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_no_reduce', @@ -274,11 +286,12 @@ def bce_with_logistic_no_reduce_test(): reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, pickle=False, + default_dtype=torch.double, ) 
def bce_with_logistic_no_reduce_scalar_test(): - t = torch.randn(()).gt(0).to(torch.get_default_dtype()) + t = torch.randn(()).gt(0).to(torch.double) sigmoid = nn.Sigmoid() return dict( fullname='BCEWithLogitsLoss_no_reduce_scalar', @@ -290,12 +303,13 @@ def bce_with_logistic_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: -(t * sigmoid(i).log() + (1 - t) * (1 - sigmoid(i)).log()), check_gradgrad=False, - pickle=False + pickle=False, + default_dtype=torch.double, ) def kldivloss_with_target_no_reduce_test(): - t = torch.rand(10, 10) + t = torch.rand(10, 10, dtype=torch.double) return dict( fullname='KLDivLoss_with_target_no_reduce', constructor=wrap_functional( @@ -306,11 +320,12 @@ def kldivloss_with_target_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_no_reduce_test(): - t = torch.rand(10, 10) + t = torch.rand(10, 10, dtype=torch.double) return dict( fullname='KLDivLoss_no_reduce', constructor=wrap_functional( @@ -322,11 +337,12 @@ def kldivloss_no_reduce_test(): loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, pickle=False, + default_dtype=torch.double, ) def kldivloss_no_reduce_scalar_test(): - t = torch.rand(()) + t = torch.rand((), dtype=torch.double) return dict( fullname='KLDivLoss_no_reduce_scalar', constructor=wrap_functional( @@ -337,11 +353,12 @@ def kldivloss_no_reduce_scalar_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_with_log_target_no_reduce_test(): - t = torch.rand(10, 10).log() + t = torch.rand(10, 10, dtype=torch.double).log() return dict( fullname='KLDivLoss_with_log_target_no_reduce', constructor=wrap_functional( @@ -352,11 +369,12 @@ def kldivloss_with_log_target_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def kldivloss_no_reduce_log_target_test(): - t = torch.rand(10, 10).log() + t = torch.rand(10, 10, dtype=torch.double).log() return dict( fullname='KLDivLoss_no_reduce_log_target', constructor=wrap_functional( @@ -368,11 +386,12 @@ def kldivloss_no_reduce_log_target_test(): loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, pickle=False, + default_dtype=torch.double, ) def kldivloss_no_reduce_scalar_log_target_test(): - t = torch.rand(()).log() + t = torch.rand((), dtype=torch.double).log() return dict( fullname='KLDivLoss_no_reduce_scalar_log_target', constructor=wrap_functional( @@ -383,11 +402,12 @@ def kldivloss_no_reduce_scalar_log_target_test(): reference_fn=lambda i, *_: loss_reference_fns['KLDivLoss_log_target'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def l1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='L1Loss_no_reduce', constructor=wrap_functional( @@ -397,7 +417,8 @@ def l1loss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), supports_forward_ad=True, - pickle=False) + pickle=False, + 
default_dtype=torch.double) def l1loss_no_reduce_complex_test(): @@ -415,7 +436,7 @@ def l1loss_no_reduce_complex_test(): def l1loss_no_reduce_scalar_test(): - t = torch.randn(()) + t = torch.randn((), dtype=torch.double) return dict( fullname='L1Loss_no_reduce_scalar', constructor=wrap_functional( @@ -425,12 +446,13 @@ def l1loss_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: (i - t.type_as(i)).abs(), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def mseloss_no_reduce_test(): input_size = (2, 3, 4, 5) - target = torch.randn(*input_size) + target = torch.randn(*input_size, dtype=torch.double) return dict( fullname='MSELoss_no_reduce', constructor=wrap_functional( @@ -440,12 +462,13 @@ def mseloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 'target': target}, reference_fn=lambda i, *_: (i - target).pow(2), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def mseloss_no_reduce_scalar_test(): input_size = () - target = torch.randn(input_size) + target = torch.randn(input_size, dtype=torch.double) return dict( fullname='MSELoss_no_reduce_scalar', constructor=wrap_functional( @@ -455,7 +478,8 @@ def mseloss_no_reduce_scalar_test(): cpp_var_map={'i': '_get_input()', 'target': target}, reference_fn=lambda i, *_: (i - target).pow(2), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_test(): @@ -471,7 +495,8 @@ def nllloss_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_ignore_index_test(): @@ -488,7 +513,8 @@ def nllloss_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_test(): @@ -509,7 +535,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_ignore_index_test(): @@ -531,7 +558,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss_no_reduce_weights_ignore_index_neg_test(): @@ -549,11 +577,12 @@ def kwargs(i): cpp_function_call='''F::nll_loss( i, t.to(i.options()).to(torch::kLong), F::NLLLossFuncOptions().weight(weight.to(i.options())).reduction(torch::kNone).ignore_index(-1))''', - input=torch.rand(15, 10).add(1e-2).log(), + input=torch.rand(15, 10, dtype=torch.double).add(1e-2).log(), cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLoss'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss2d_no_reduce_test(): @@ -569,7 +598,8 @@ def nllloss2d_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def 
nllloss2d_no_reduce_ignore_index_test(): @@ -586,7 +616,8 @@ def nllloss2d_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nllloss2d_no_reduce_weights_test(): @@ -607,7 +638,8 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_test(): @@ -623,7 +655,8 @@ def nlllossNd_no_reduce_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_ignore_index_test(): @@ -640,7 +673,8 @@ def nlllossNd_no_reduce_ignore_index_test(): cpp_var_map={'i': '_get_input()', 't': t}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs), - pickle=False) + pickle=False, + default_dtype=torch.double) def nlllossNd_no_reduce_weights_test(): @@ -661,11 +695,12 @@ def kwargs(i): cpp_var_map={'i': '_get_input()', 't': t, 'weight': weight}, reference_fn=lambda i, *_: loss_reference_fns['NLLLossNd'](i, t.type_as(i).long(), **kwargs(i)), - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_no_reduce_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_no_reduce', constructor=wrap_functional( @@ -677,11 +712,12 @@ def smoothl1loss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_no_reduce_scalar_test(): - t = torch.randn(()) + t = torch.randn((), dtype=torch.double) return dict( fullname='SmoothL1Loss_no_reduce_scalar', constructor=wrap_functional( @@ -693,11 +729,12 @@ def smoothl1loss_no_reduce_scalar_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_beta_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_beta', constructor=wrap_functional( @@ -709,11 +746,12 @@ def smoothl1loss_beta_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none', beta=0.5), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def smoothl1loss_zero_beta_test(): - t = torch.randn(2, 3, 4) + t = torch.randn(2, 3, 4, dtype=torch.double) return dict( fullname='SmoothL1Loss_zero_beta', constructor=wrap_functional( @@ -725,7 +763,8 @@ def smoothl1loss_zero_beta_test(): reference_fn=lambda i, *_: loss_reference_fns['SmoothL1Loss'](i, t.type_as(i), reduction='none', beta=0), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def huberloss_delta_test(): @@ -741,7 +780,8 @@ def huberloss_delta_test(): reference_fn=lambda i, *_: loss_reference_fns['HuberLoss'](i, t.type_as(i), reduction='none', delta=0.5), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_0d_no_reduce_test(): @@ -775,7 +815,8 @@ 
def multilabelmarginloss_1d_no_reduce_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_index_neg_test(): @@ -792,7 +833,8 @@ def multilabelmarginloss_index_neg_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelmarginloss_no_reduce_test(): @@ -809,11 +851,12 @@ def multilabelmarginloss_no_reduce_test(): loss_reference_fns['MultiLabelMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def hingeembeddingloss_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1)) + t = Variable(torch.randn(10).gt(0).to(torch.double).mul_(2).sub(1)) return dict( fullname='HingeEmbeddingLoss_no_reduce', constructor=wrap_functional( @@ -825,11 +868,12 @@ def hingeembeddingloss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), reduction='none'), check_sum_reduction=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def hingeembeddingloss_margin_no_reduce_test(): - t = Variable(torch.randn(10).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1)) + t = Variable(torch.randn(10).gt(0).to(torch.double).mul_(2).sub(1)) return dict( fullname='HingeEmbeddingLoss_margin_no_reduce', constructor=wrap_functional( @@ -841,11 +885,12 @@ def hingeembeddingloss_margin_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['HingeEmbeddingLoss'](i, t.type_as(i), margin=0.5, reduction='none'), check_sum_reduction=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def softmarginloss_no_reduce_test(): - t = torch.randn(5, 5) + t = torch.randn(5, 5, dtype=torch.double) return dict( fullname='SoftMarginLoss_no_reduce', constructor=wrap_functional( @@ -857,7 +902,8 @@ def softmarginloss_no_reduce_test(): reference_fn=lambda i, *_: loss_reference_fns['SoftMarginLoss'](i, t.type_as(i), reduction='none'), supports_forward_ad=True, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelsoftmarginloss_no_reduce_test(): @@ -873,7 +919,8 @@ def multilabelsoftmarginloss_no_reduce_test(): reference_fn=lambda i, *_: (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log())).sum(dim=1) / i.size(1), check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multilabelsoftmarginloss_weights_no_reduce_test(): @@ -893,7 +940,8 @@ def multilabelsoftmarginloss_weights_no_reduce_test(): (-(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()) * weights).sum(dim=1) / i.size(1), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_no_reduce_test(): @@ -910,7 +958,8 @@ def multimarginloss_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_1d_no_reduce_test(): @@ -927,7 +976,8 @@ def multimarginloss_1d_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, 
check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_1d_input_0d_target_no_reduce_test(): @@ -944,7 +994,8 @@ def multimarginloss_1d_input_0d_target_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_p_no_reduce_test(): @@ -961,7 +1012,8 @@ def multimarginloss_p_no_reduce_test(): loss_reference_fns['MultiMarginLoss'](i, t.data.type_as(i).long(), p=2, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_margin_no_reduce_test(): @@ -980,12 +1032,13 @@ def multimarginloss_margin_no_reduce_test(): margin=0.5, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def multimarginloss_weights_no_reduce_test(): t = torch.rand(5).mul(8).floor().long() - weights = torch.rand(10) + weights = torch.rand(10, dtype=torch.double) return dict( fullname='MultiMarginLoss_weights_no_reduce', constructor=wrap_functional( @@ -1001,7 +1054,8 @@ def multimarginloss_weights_no_reduce_test(): weight=weights, reduction='none'), check_sum_reduction=True, check_gradgrad=False, - pickle=False) + pickle=False, + default_dtype=torch.double) def single_batch_reference_fn(input, parameters, module): @@ -1080,6 +1134,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1090,6 +1145,7 @@ def unsqueeze_inp(inp): desc='stride', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1100,6 +1156,7 @@ def unsqueeze_inp(inp): desc='pad1', with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1110,6 +1167,7 @@ def unsqueeze_inp(inp): desc='pad2', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1120,6 +1178,7 @@ def unsqueeze_inp(inp): desc='pad1size1', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1130,6 +1189,7 @@ def unsqueeze_inp(inp): desc='pad2size1', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv1d', @@ -1148,6 +1208,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 10), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_groups', @@ -1157,6 +1218,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_valid', @@ -1166,6 +1228,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same', @@ -1175,6 +1238,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same2', @@ -1184,6 +1248,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv1d_pad_same_dilated', @@ -1193,6 +1258,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='ConvTranspose1d', @@ -1202,6 +1268,7 @@ def unsqueeze_inp(inp): input_size=(1, 3, 7), with_tf32=True, tf32_precision=0.005, + 
default_dtype=torch.double, ), dict( module_name='ConvTranspose1d', @@ -1213,6 +1280,7 @@ def unsqueeze_inp(inp): desc='no_bias', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='ConvTranspose1d', @@ -1224,6 +1292,7 @@ def unsqueeze_inp(inp): desc='dilated', with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='ConvTranspose1d_groups', @@ -1234,6 +1303,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 7), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1244,6 +1314,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1255,6 +1326,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1266,6 +1338,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1277,6 +1350,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1289,6 +1363,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( module_name='Conv2d', @@ -1309,6 +1384,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( fullname='Conv2d_groups_thnn', @@ -1318,6 +1394,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.015, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_valid', @@ -1327,6 +1404,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_same', @@ -1336,6 +1414,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='Conv2d_pad_same_dilated', @@ -1345,6 +1424,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1356,6 +1436,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1373,6 +1454,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( module_name='ConvTranspose2d', @@ -1385,6 +1467,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='ConvTranspose2d_groups', @@ -1395,6 +1478,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.01, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise', @@ -1403,6 +1487,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_with_multiplier', @@ -1411,6 +1496,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_strided', @@ -1419,6 +1505,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + 
default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_padded', @@ -1427,6 +1514,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 6, 6), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv2d_depthwise_dilated', @@ -1435,6 +1523,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 5, 5), with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1445,6 +1534,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1457,6 +1547,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1469,6 +1560,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=False, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1480,6 +1572,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1491,6 +1584,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Conv3d', @@ -1511,6 +1605,7 @@ def unsqueeze_inp(inp): check_with_long_tensor=True, with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( fullname='Conv3d_dilated', @@ -1519,6 +1614,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 5, 5, 5), with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_dilated_strided', @@ -1526,7 +1622,8 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2).stride(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_valid', @@ -1536,6 +1633,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_same', @@ -1545,6 +1643,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( fullname='Conv3d_pad_same_dilated', @@ -1554,6 +1653,7 @@ def unsqueeze_inp(inp): cudnn=True, with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ConvTranspose3d', @@ -1562,7 +1662,8 @@ def unsqueeze_inp(inp): cudnn=True, input_size=(1, 2, 4, 5, 4), with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ConvTranspose3d', @@ -1573,13 +1674,15 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 5, 4), desc='dilated', with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', constructor_args=((1, 2, 3, 3, 2, 1),), cpp_constructor_args='torch::nn::ReplicationPad3dOptions({1, 2, 3, 3, 2, 1})', input_size=(2, 3, 2, 2, 2), + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', @@ -1588,6 +1691,7 @@ def unsqueeze_inp(inp): input_size=(3, 2, 2, 2), reference_fn=single_batch_reference_fn, desc='no_batch_dim', + default_dtype=torch.double, ), dict( module_name='ReplicationPad3d', @@ -1603,6 +1707,7 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='Embedding', 
@@ -1610,7 +1715,8 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)', input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512), check_gradgrad=False, - desc='discontiguous' + desc='discontiguous', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1619,6 +1725,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='mean', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1627,6 +1734,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512), check_gradgrad=False, desc='discontiguous', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1636,6 +1744,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='sum', + default_dtype=torch.double, ), dict( module_name='EmbeddingBag', @@ -1645,6 +1754,7 @@ def unsqueeze_inp(inp): input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4), check_gradgrad=False, desc='max', + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_mean_padding_idx', @@ -1652,6 +1762,7 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3).padding_idx(1)', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_sum_padding_idx', @@ -1660,6 +1771,7 @@ def unsqueeze_inp(inp): .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kSum).padding_idx(1)''', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_max_padding_idx', @@ -1668,17 +1780,18 @@ def unsqueeze_inp(inp): .max_norm(c10::nullopt).norm_type(2.).scale_grad_by_freq(false).mode(torch::kMax).padding_idx(1)''', input_fn=lambda: torch.stack([torch.randperm(3), torch.randperm(3)]), check_gradgrad=False, + default_dtype=torch.double, ), dict( fullname='EmbeddingBag_sparse', - constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True), + constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True, dtype=torch.double), cpp_constructor_args='torch::nn::EmbeddingBagOptions(4, 3).sparse(true)', input_fn=lambda: torch.randperm(2).repeat(1, 2), check_gradgrad=False, has_sparse_gradients=True, ), dict( - constructor=lambda: nn.Embedding(4, 3, sparse=True), + constructor=lambda: nn.Embedding(4, 3, dtype=torch.double, sparse=True), cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3).sparse(true)', input_fn=lambda: torch.randperm(2).repeat(1, 2), fullname='Embedding_sparse', @@ -1690,12 +1803,14 @@ def unsqueeze_inp(inp): constructor_args=(3,), cpp_constructor_args='torch::nn::PixelShuffleOptions(3)', input_size=(1, 9, 4, 4), + default_dtype=torch.double, ), dict( module_name='PixelUnshuffle', constructor_args=(3,), cpp_constructor_args='torch::nn::PixelUnshuffleOptions(3)', input_size=(1, 1, 12, 12), + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1704,6 +1819,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_nearest_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1720,6 +1836,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3), fullname='interpolate_nearest_tuple_1d', 
pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -1728,6 +1845,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_nearest_scale_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), @@ -1739,6 +1857,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, ), scale_factor=None, mode='linear', align_corners=False), @@ -1750,6 +1869,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3), fullname='interpolate_linear_tuple_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=False), @@ -1761,6 +1881,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_scale_1d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='linear', align_corners=False), @@ -1783,6 +1904,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_1d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='linear', align_corners=True), @@ -1794,6 +1916,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4), fullname='interpolate_linear_scale_1d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=2, scale_factor=None, mode='nearest'), @@ -1804,6 +1927,7 @@ def unsqueeze_inp(inp): input_size=(1, 128, 1, 1), fullname='interpolate_nearest_2d_launch_configs', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1814,6 +1938,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_nearest_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(12, 16), scale_factor=None, mode='nearest'), @@ -1824,6 +1949,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3, 4), fullname='interpolate_nearest_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -1834,6 +1960,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_nearest_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -1855,6 +1982,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bilinear', align_corners=False), @@ -1878,6 +2006,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3), fullname='interpolate_bilinear_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., @@ -1890,6 +2019,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), @@ -1902,6 +2032,7 @@ def 
unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_shared_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -1914,6 +2045,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_skewed_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, mode='bilinear', align_corners=True), @@ -1925,6 +2057,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_tuple_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -1937,6 +2070,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bilinear_scale_tuple_skewed_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), @@ -1948,6 +2082,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='bicubic', align_corners=False), @@ -1971,6 +2106,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3), fullname='interpolate_bicubic_tuple_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='bicubic', align_corners=False), @@ -1982,6 +2118,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 2.), @@ -1994,6 +2131,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_shared_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -2006,6 +2144,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_skewed_2d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6), scale_factor=None, mode='bicubic', align_corners=True), @@ -2017,6 +2156,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_tuple_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=(2., 1.), @@ -2029,6 +2169,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4), fullname='interpolate_bicubic_scale_tuple_skewed_2d_align_corners', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -2039,6 +2180,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), fullname='interpolate_nearest_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='nearest'), @@ -2059,6 +2201,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 3, 4, 4), fullname='interpolate_nearest_tuple_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=4., mode='nearest'), @@ -2069,6 +2212,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), 
fullname='interpolate_nearest_scale_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), @@ -2080,6 +2224,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 4, 4, 4), fullname='interpolate_trilinear_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=12, scale_factor=None, mode='trilinear', align_corners=False), @@ -2103,6 +2248,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3, 3), fullname='interpolate_trilinear_tuple_3d', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=False), @@ -2116,6 +2262,7 @@ def unsqueeze_inp(inp): # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.interpolate, size=(4, 6, 6), scale_factor=None, @@ -2128,6 +2275,7 @@ def unsqueeze_inp(inp): input_size=(1, 2, 2, 3, 3), fullname='interpolate_trilinear_tuple_3d_align_corners', pickle=False, + default_dtype=torch.double ), dict( constructor=wrap_functional(F.interpolate, size=None, scale_factor=3., mode='trilinear', align_corners=True), @@ -2141,6 +2289,7 @@ def unsqueeze_inp(inp): # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=-1), @@ -2148,6 +2297,7 @@ def unsqueeze_inp(inp): input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='softmax_lastdim', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), @@ -2155,7 +2305,8 @@ def unsqueeze_inp(inp): input_size=(2, 128), fullname='softmax_lastdim_dtype', pickle=False, - test_cuda=False + test_cuda=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1), @@ -2170,6 +2321,7 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=1, dtype=torch.float64), @@ -2177,7 +2329,8 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='softmax_spatial_dtype', pickle=False, - test_cuda=False + test_cuda=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=0), @@ -2186,6 +2339,7 @@ def unsqueeze_inp(inp): fullname='softmax_functional_dim0', test_cuda=False, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=3), @@ -2194,6 +2348,7 @@ def unsqueeze_inp(inp): fullname='softmax_functional_dim3', test_cuda=False, pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.softmax, dim=-1), @@ -2209,6 +2364,7 @@ def unsqueeze_inp(inp): input_size=(2, 128), # trigger the last-dim algo in CUDA fullname='log_softmax_lastdim', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=1), @@ -2216,6 +2372,7 @@ def unsqueeze_inp(inp): input_size=(2, 128, 2, 2), # trigger special case of spatial CUDA algo fullname='log_softmax_spatial_special', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=1), @@ -2223,6 +2380,7 @@ def unsqueeze_inp(inp): input_size=(2, 2, 4, 4), # regular spatial algorithm fullname='log_softmax_spatial', 
pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -2230,6 +2388,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 4, 5), fullname='log_softmax_dim0', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=3), @@ -2237,6 +2396,7 @@ def unsqueeze_inp(inp): input_size=(2, 3, 4, 5), fullname='log_softmax_dim3', pickle=False, + default_dtype=torch.double, ), dict( constructor=wrap_functional(F.log_softmax, dim=0), @@ -2252,6 +2412,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 3, 3), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold', @@ -2260,6 +2421,7 @@ def unsqueeze_inp(inp): input_size=(2, 16, 4), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_no_batch_dim_input', @@ -2269,6 +2431,7 @@ def unsqueeze_inp(inp): check_gradgrad=False, ref=single_batch_reference_fn, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Unfold_int_input', @@ -2277,6 +2440,7 @@ def unsqueeze_inp(inp): input_size=(2, 4, 3, 3), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_int_input', @@ -2285,6 +2449,7 @@ def unsqueeze_inp(inp): input_size=(2, 16, 4), check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( fullname='Fold_no_batch_dim_int_input', @@ -2294,6 +2459,7 @@ def unsqueeze_inp(inp): ref=single_batch_reference_fn, check_gradgrad=False, test_cuda=True, + default_dtype=torch.double, ), dict( module_name='RReLU', @@ -2302,20 +2468,24 @@ def unsqueeze_inp(inp): input_size=(), desc='with_up_down_scalar', test_cuda=False, + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 8), torch.randn(10, 8)), + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 1), torch.randn(10, 8)), - desc='broadcast_lhs' + desc='broadcast_lhs', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(10, 8), torch.randn(1, 8)), - desc='broadcast_rhs' + desc='broadcast_rhs', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', @@ -2323,12 +2493,14 @@ def unsqueeze_inp(inp): cpp_constructor_args='torch::nn::PairwiseDistanceOptions().p(1.5).eps(1e-05).keepdim(true)', input_fn=lambda: (torch.randn(10, 8), torch.randn(10, 8)), desc='with_non_default_args', + default_dtype=torch.double, ), dict( module_name='PairwiseDistance', input_fn=lambda: (torch.randn(8), torch.randn(8)), reference_fn=single_batch_reference_fn, desc='no_batch_dim', + default_dtype=torch.double, ), dict( module_name='TransformerEncoderLayer', @@ -2345,6 +2517,7 @@ def unsqueeze_inp(inp): # at non-singleton dimension 2 check_batched_grad=False, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='TransformerEncoderLayer', @@ -2358,6 +2531,7 @@ def unsqueeze_inp(inp): desc='gelu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='TransformerDecoderLayer', @@ -2370,6 +2544,7 @@ def unsqueeze_inp(inp): desc='relu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='TransformerDecoderLayer', @@ -2383,6 +2558,7 @@ def unsqueeze_inp(inp): desc='gelu_activation', with_tf32=True, tf32_precision=0.05, + default_dtype=torch.double, ), dict( module_name='Transformer', @@ -2400,6 +2576,7 @@ def unsqueeze_inp(inp): desc='multilayer_coder', 
with_tf32=True, tf32_precision=0.03, + default_dtype=torch.double, ), dict( module_name='Linear', @@ -2410,6 +2587,7 @@ def unsqueeze_inp(inp): desc="no_batch_dim", with_tf32=True, tf32_precision=0.005, + default_dtype=torch.double, ), dict( module_name='Flatten', @@ -2418,6 +2596,7 @@ def unsqueeze_inp(inp): input_size=(3, 4, 5), reference_fn=single_batch_reference_fn, desc="no_batch_dim", + default_dtype=torch.double, ), dict( module_name='Unflatten', @@ -2426,6 +2605,7 @@ def unsqueeze_inp(inp): input_size=(3, 4, 5), reference_fn=single_batch_reference_fn, desc="no_batch_dim", + default_dtype=torch.double, ), dict( module_name='LayerNorm', @@ -2473,7 +2653,8 @@ def unsqueeze_inp(inp): cudnn=True, desc=f'{padding_mode}_stride2_pad2', with_tf32=True, - tf32_precision=0.05 + tf32_precision=0.05, + default_dtype=torch.double, ), ) @@ -2485,13 +2666,32 @@ def unsqueeze_inp(inp): 'Tanhshrink', 'Threshold' ] non_linear_activations_extra_info: Dict[str, dict] = { - 'CELU': {'constructor_args': (2.,)}, + 'CELU': {'constructor_args': (2.,), 'default_dtype': torch.double}, 'Threshold': {'constructor_args': (2., 1.)}, - 'Hardsigmoid': {'check_gradgrad': False, 'check_jit': False}, - 'Hardswish': {'check_gradgrad': False, 'check_jit': False}, + 'Hardsigmoid': {'check_gradgrad': False, 'check_jit': False, 'default_dtype': torch.double}, + 'Hardswish': {'check_gradgrad': False, 'check_jit': False, 'default_dtype': torch.double}, # For RRelu, test that compare CPU and GPU results fail because RNG # is different between CPU and GPU - 'RReLU': {'test_cuda': False}, + 'RReLU': {'test_cuda': False, 'default_dtype': torch.double}, + 'ELU': {'default_dtype': torch.double}, + 'GELU': {'default_dtype': torch.double}, + 'GLU': {'default_dtype': torch.double}, + 'Hardshrink': {'default_dtype': torch.double}, + 'Hardtanh': {'default_dtype': torch.double}, + 'LeakyReLU': {'default_dtype': torch.double}, + 'LogSigmoid': {'default_dtype': torch.double}, + 'Mish': {'default_dtype': torch.double}, + 'PReLU': {'default_dtype': torch.double}, + 'ReLU6': {'default_dtype': torch.double}, + 'ReLU': {'default_dtype': torch.double}, + 'SELU': {'default_dtype': torch.double}, + 'SiLU': {'default_dtype': torch.double}, + 'Sigmoid': {'default_dtype': torch.double}, + 'Softplus': {'default_dtype': torch.double}, + 'Softshrink': {'default_dtype': torch.double}, + 'Softsign': {'default_dtype': torch.double}, + 'Tanh': {'default_dtype': torch.double}, + 'Tanhshrink': {'default_dtype': torch.double}, } for non_linear_activation in non_linear_activations_no_batch: activation_test_info = dict( @@ -2885,6 +3085,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, _: 1. 
/ i.numel() * sum((a - b).abs().sum() for a, b in zip(i, t)), check_complex=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2894,6 +3095,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2904,6 +3106,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, _: nllloss_reference(i, t, ignore_index=2), desc='ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2915,6 +3118,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m)), desc='weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2926,6 +3130,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m), ignore_index=2), desc='weights_ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -2937,6 +3142,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 nllloss_reference(i, t, weight=get_weight(m), ignore_index=-1), desc='weights_ignore_index_neg', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='KLDivLoss', @@ -2945,6 +3151,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: kldivloss_reference(i, t, get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='KLDivLoss', @@ -2956,14 +3163,16 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 kldivloss_log_target_reference(i, t, get_reduction(m)), check_sum_reduction=True, desc='log_target', + default_dtype=torch.double, ), dict( module_name='MSELoss', input_size=(2, 3, 4, 5), - target_fn=lambda: torch.randn((2, 3, 4, 5), requires_grad=True), + target_fn=lambda: torch.randn((2, 3, 4, 5), dtype=torch.double, requires_grad=True), reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() / (i.numel() if get_reduction(m) == 'mean' else 1)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='BCELoss', @@ -2972,6 +3181,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() / (i.numel() if get_reduction(m) else 1), check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='BCELoss', @@ -2983,11 +3193,13 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 (i.numel() if get_reduction(m) else 1), desc='weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', input_size=(15, 10), target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -2996,6 +3208,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 input_size=(15, 10), target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(), desc='weights', + default_dtype=torch.double, ), dict( module_name='HingeEmbeddingLoss', @@ -3004,6 +3217,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: 
hingeembeddingloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='HingeEmbeddingLoss', @@ -3015,6 +3229,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 hingeembeddingloss_reference(i, t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelMarginLoss', @@ -3026,6 +3241,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, check_gradgrad=False, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelMarginLoss', @@ -3036,6 +3252,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, check_gradgrad=False, check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='MultiLabelSoftMarginLoss', @@ -3043,6 +3260,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 target_fn=lambda: torch.rand(5, 10).mul(2).floor(), reference_fn=lambda i, t, m: -(t * i.sigmoid().log() + (1 - t) * (-i).sigmoid().log()).sum() / i.numel(), check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3052,6 +3270,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 multimarginloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3062,6 +3281,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='1d', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3074,6 +3294,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='p', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', @@ -3087,12 +3308,13 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='margin', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='MultiMarginLoss', - constructor_args=(1, 1., torch.rand(10).to(torch.get_default_dtype())), + constructor_args=(1, 1., torch.rand(10, dtype=torch.double)), cpp_constructor_args='torch::nn::MultiMarginLossOptions().p(1).margin(1.).weight(torch::rand(10))', - legacy_constructor_args=(1, torch.rand(10).to(torch.get_default_dtype())), + legacy_constructor_args=(1, torch.rand(10, dtype=torch.double)), input_size=(5, 10), target_fn=lambda: torch.rand(5).mul(8).floor().long(), reference_fn=lambda i, t, m: @@ -3100,6 +3322,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 desc='weights', check_sum_reduction=True, check_gradgrad=False, + default_dtype=torch.double, ), dict( module_name='SmoothL1Loss', @@ -3108,6 +3331,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, reference_fn=lambda i, t, m, b=1.0: smoothl1loss_reference(i, t, reduction=get_reduction(m), beta=b), + default_dtype=torch.double, ), dict( module_name='HuberLoss', @@ -3118,6 +3342,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_bfloat16=True, reference_fn=lambda i, t, m: huberloss_reference(i, t, reduction=get_reduction(m)), + default_dtype=torch.double, ), dict( 
module_name='SoftMarginLoss', @@ -3126,11 +3351,12 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: softmarginloss_reference(i, t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='CosineEmbeddingLoss', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), + input_fn=lambda: (torch.rand(15, 10, dtype=torch.double), torch.rand(15, 10, dtype=torch.double)), + target_fn=lambda: torch.randn(15, dtype=torch.double).sign(), reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, @@ -3139,8 +3365,8 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 module_name='CosineEmbeddingLoss', constructor_args=(0.7,), cpp_constructor_args='torch::nn::CosineEmbeddingLossOptions().margin(0.7)', - input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)), - target_fn=lambda: torch.randn(15).sign(), + input_fn=lambda: (torch.rand(15, 10, dtype=torch.double), torch.rand(15, 10, dtype=torch.double)), + target_fn=lambda: torch.randn(15, dtype=torch.double).sign(), reference_fn=lambda i, t, m: cosineembeddingloss_reference(i[0], i[1], t, margin=0.7, reduction=get_reduction(m)), desc='margin', @@ -3153,6 +3379,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: marginrankingloss_reference(i[0], i[1], t, reduction=get_reduction(m)), check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='MarginRankingLoss', @@ -3164,27 +3391,31 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 marginrankingloss_reference(i[0], i[1], t, margin=0.5, reduction=get_reduction(m)), desc='margin', check_sum_reduction=True, + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).to(torch.get_default_dtype()), + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(10),), + constructor_args=(torch.rand(10, dtype=torch.double),), cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand(10))', input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(15, 10).gt(0).to(torch.get_default_dtype()), desc='weights', + default_dtype=torch.double, ), dict( module_name='BCEWithLogitsLoss', - constructor_args=(torch.rand(()),), + constructor_args=(torch.rand((), dtype=torch.double),), cpp_constructor_args='torch::nn::BCEWithLogitsLossOptions().weight(torch::rand({}))', input_fn=lambda: torch.rand(()).clamp_(1e-2, 1 - 1e-2), target_fn=lambda: torch.randn(()).gt(0).to(torch.get_default_dtype()), - desc='scalar_weights' + desc='scalar_weights', + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3195,6 +3426,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3206,6 +3438,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['NLLLossNd'](i, t, weight=get_weight(m)), desc='2d_weights', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3217,6 +3450,7 @@ def ctcloss_reference(log_probs, targets, 
input_lengths, target_lengths, blank=0 loss_reference_fns['NLLLossNd'](i, t, ignore_index=1), desc='2d_ignore_index', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3227,6 +3461,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='higher_dim', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='NLLLoss', @@ -3237,6 +3472,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='dim_is_3', check_bfloat16=True, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3247,6 +3483,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3258,6 +3495,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['CrossEntropyLoss'](i, t, weight=get_weight(m)), desc='2d_weights', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3269,6 +3507,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 loss_reference_fns['CrossEntropyLoss'](i, t, ignore_index=1), desc='2d_ignore_index', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3279,6 +3518,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='higher_dim', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3289,6 +3529,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='dim_is_3', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3299,6 +3540,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='2d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3309,6 +3551,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='3d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( module_name='CrossEntropyLoss', @@ -3319,6 +3562,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 check_sum_reduction=True, desc='4d_prob_target', check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing_sum_reduction', @@ -3330,6 +3574,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing', @@ -3340,6 +3585,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15), check_bfloat16=False, + default_dtype=torch.double, ), dict( fullname='CrossEntropyLoss_2d_prob_target_smoothing_weight', @@ -3351,6 +3597,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 reference_fn=lambda i, t, m: 
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_prob_target_smoothing_sum_reduction',
@@ -3362,6 +3609,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_prob_target_smoothing',
@@ -3372,6 +3620,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing',
@@ -3382,6 +3631,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_ignore_index',
@@ -3392,6 +3642,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction',
@@ -3402,6 +3653,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction_ignore_index',
@@ -3413,6 +3665,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing',
@@ -3423,6 +3676,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_sum_reduction',
@@ -3433,6 +3687,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_ignore_index',
@@ -3443,6 +3698,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=3),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         fullname='CrossEntropyLoss_2d_indices_target_smoothing_weight',
@@ -3454,6 +3710,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m:
             loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3466,6 +3723,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='2d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3478,6 +3736,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='3d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CrossEntropyLoss',
@@ -3490,6 +3749,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         desc='4d_prob_target_weights',
         check_bfloat16=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',  # Default is log_input=True, full=False
@@ -3497,6 +3757,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(),
         reference_fn=lambda i, t, _: (i.exp() - t.mul(i)).mean(),
         desc='no_full_loss',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3506,6 +3767,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(2, 3, 4, 5).floor_().abs_(),
         reference_fn=lambda i, t, _: (i - t.mul((i + 1e-8).log())).mean(),
         desc='no_full_loss_no_log_input',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3516,6 +3778,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, _: (i.exp() - t.mul(i) +
                                       (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0)).mean(),
         desc='full_loss',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='PoissonNLLLoss',
@@ -3527,6 +3790,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             i - t.mul((i + 1e-8).log()) + (t.mul(t.log()) - t + 0.5 * (2. * pi * t).log()).masked_fill(t <= 1, 0)
         ).mean(),
         desc='full_loss_no_log_input',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='L1Loss',
@@ -3535,6 +3799,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, _: 1. / i.numel() * (i - t).abs().sum(),
         desc='scalar',
         check_complex=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='KLDivLoss',
@@ -3544,6 +3809,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             kldivloss_reference(i, t, get_reduction(m)),
         check_sum_reduction=True,
         desc='scalar',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='KLDivLoss',
@@ -3555,16 +3821,18 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
             kldivloss_log_target_reference(i, t, get_reduction(m)),
         check_sum_reduction=True,
         desc='scalar_log_target',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MSELoss',
         input_size=(),
-        target_fn=lambda: torch.randn((), requires_grad=True),
+        target_fn=lambda: torch.randn((), requires_grad=True, dtype=torch.double),
         reference_fn=lambda i, t, m: ((i - t).abs().pow(2).sum() /
                                       (i.numel() if get_reduction(m) == 'mean' else 1)),
         check_sum_reduction=True,
         desc='scalar',
         check_bfloat16=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MSELoss',
@@ -3586,6 +3854,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
                                       (i.numel() if get_reduction(m) == 'mean' else 1),
         desc='scalar_weights',
         check_bfloat16=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='HingeEmbeddingLoss',
@@ -3595,6 +3864,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         target_fn=lambda: torch.randn(()).gt(0).to(torch.get_default_dtype()).mul_(2).sub(1),
         desc='scalar_margin',
         check_sum_reduction=True,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='SmoothL1Loss',
@@ -3604,6 +3874,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         reference_fn=lambda i, t, m, b=1.0:
             smoothl1loss_reference(i, t, reduction=get_reduction(m), beta=b),
         desc='scalar',
+        default_dtype=torch.double,
     ),
     dict(
         module_name='MultiLabelSoftMarginLoss',
@@ -3616,6 +3887,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         desc='weights',
         check_sum_reduction=True,
         check_gradgrad=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3633,6 +3905,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths`
         test_cpp_api_parity=False,
         check_jit=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3648,6 +3921,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
     # Test is flaky
     # See https://github.com/pytorch/pytorch/issues/29380.
@@ -3680,6 +3954,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         # `CTCLoss` in C++ frontend doesn't accept integer list for `input_lengths` or `target_lengths`
         test_cpp_api_parity=False,
         check_jit=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3695,6 +3970,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
     dict(
         module_name='CTCLoss',
@@ -3710,6 +3986,7 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0
         check_sum_reduction=True,
         check_gradgrad=False,
         check_half=False,
+        default_dtype=torch.double,
     ),
 ]
@@ -3760,6 +4037,7 @@ def flatten(xs):
         target_size=(3, ),
         reference_fn=single_batch_reference_criterion_fn,
         test_cpp_api_parity=False,
+        default_dtype=torch.double,
     )
     criterion_tests.append(regression_test_info)

@@ -3772,6 +4050,7 @@ def flatten(xs):
         target_fn=lambda: torch.rand((3,)),
         reference_fn=single_batch_reference_criterion_fn,
         test_cpp_api_parity=False,
+        default_dtype=torch.double,
     )
     criterion_tests.append(regression_test_info)

@@ -3779,18 +4058,30 @@ def flatten(xs):
 # Check that classification criterion work with no batch dimensions
 # List of tuples of (name, input_fn, target_fn)
 classification_criterion_no_batch = [
-    ('BCELoss', lambda: torch.sigmoid(torch.randn(9)), lambda: torch.randn(9).gt(0).to(torch.get_default_dtype())),
-    ('BCEWithLogitsLoss', lambda: torch.randn(9), lambda: torch.randn(9)),
-    ('HingeEmbeddingLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
-    ('MultiLabelMarginLoss', lambda: torch.randn(4), lambda: torch.tensor([3, 0, -1, 1])),
-    ('SoftMarginLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
-    ('NLLLoss', lambda: F.log_softmax(torch.randn(3), dim=0), lambda: torch.tensor(1)),
-    ('CosineEmbeddingLoss', lambda: (torch.randn(9), torch.randn(9)), lambda: torch.tensor(1)),
+    (
+        'BCELoss',
+        lambda: torch.sigmoid(torch.randn(9, dtype=torch.double)),
+        lambda: torch.randn(9, dtype=torch.double).gt(0).to(torch.double)
+    ),
+    ('BCEWithLogitsLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.randn(9, dtype=torch.double)),
+    ('HingeEmbeddingLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.tensor([-1, 1, 1] * 3)),
+    ('MultiLabelMarginLoss', lambda: torch.randn(4, dtype=torch.double), lambda: torch.tensor([3, 0, -1, 1])),
+    ('SoftMarginLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.tensor([-1, 1, 1] * 3)),
+    ('NLLLoss', lambda: F.log_softmax(torch.randn(3, dtype=torch.double), dim=0), lambda: torch.tensor(1)),
+    (
+        'CosineEmbeddingLoss',
+        lambda: (torch.randn(9, dtype=torch.double), torch.randn(9, dtype=torch.double)),
+        lambda: torch.tensor(1, dtype=torch.double)
+    ),
     # For MarginRankingLoss, input_fn : (x1, x2) and target_fn : target
     ('MarginRankingLoss', lambda: (torch.randn(()), torch.randn(())), lambda: torch.randn(()).sign()),
     # For TripletMarginLoss, input_fn : (anchor, positive) and target_fn : negative
-    ('TripletMarginLoss', lambda: (torch.randn(9), torch.randn(9)), lambda: torch.randn(9)),
-    ('MultiLabelSoftMarginLoss', lambda: torch.randn(9), lambda: torch.randn(9)),
+    (
+        'TripletMarginLoss',
+        lambda: (torch.randn(9, dtype=torch.double), torch.randn(9, dtype=torch.double)),
+        lambda: torch.randn(9, dtype=torch.double)
+    ),
+    ('MultiLabelSoftMarginLoss', lambda: torch.randn(9, dtype=torch.double), lambda: torch.randn(9)),
 ]
 classification_criterion_no_batch_extra_info: Dict[str, dict] = {
     'MultiLabelMarginLoss': {'check_gradgrad': False},
@@ -4037,31 +4328,35 @@ def __init__(self, *args, **kwargs):
             kwargs.get('FIXME_no_cuda_gradgrad_comparison', False)
         self.precision = kwargs.get('precision', 2e-4)
         self.check_forward_only = kwargs.get('check_forward_only', False)
+        self.default_dtype = kwargs.get('default_dtype', None)
+        if self.default_dtype is None:
+            self.default_dtype = torch.get_default_dtype()

     def __call__(self, test_case):
-        module = self.constructor(*self.constructor_args)
-        input = self._get_input()
-
-        if self.reference_fn is not None:
-            out = test_case._forward(module, input)
-            ref_input = deepcopy(input)
-            ref_module = deepcopy(module)
-            expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0], ref_module)
-            test_case.assertEqual(out, expected_out, exact_dtype=False)
-        if self.check_forward_only:
-            return
-        self.test_noncontig(test_case, module, input)
-
-        if self.should_test_pickle:
-            # TODO: do this with in-memory files as soon as torch.save will support it
-            with tempfile.TemporaryFile() as f:
-                test_case._forward(module, input)
-                torch.save(module, f)
-                f.seek(0)
-                module_copy = torch.load(f)
-                test_case.assertEqual(test_case._forward(module, input), test_case._forward(module_copy, input))
-
-        self._do_test(test_case, module, input)
+        with set_default_dtype(self.default_dtype):
+            module = self.constructor(*self.constructor_args)
+            input = self._get_input()
+
+            if self.reference_fn is not None:
+                out = test_case._forward(module, input)
+                ref_input = deepcopy(input)
+                ref_module = deepcopy(module)
+                expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0], ref_module)
+                test_case.assertEqual(out, expected_out, exact_dtype=False)
+            if self.check_forward_only:
+                return
+            self.test_noncontig(test_case, module, input)
+
+            if self.should_test_pickle:
+                # TODO: do this with in-memory files as soon as torch.save will support it
+                with tempfile.TemporaryFile() as f:
+                    test_case._forward(module, input)
+                    torch.save(module, f)
+                    f.seek(0)
+                    module_copy = torch.load(f)
+                    test_case.assertEqual(test_case._forward(module, input), test_case._forward(module_copy, input))
+
+            self._do_test(test_case, module, input)

     def noncontiguize(self, obj):
         if isinstance(obj, list):
@@ -4125,92 +4420,94 @@ def test_cuda(self, test_case):
         if not TEST_CUDA or not self.should_test_cuda:
             raise unittest.SkipTest('Excluded from CUDA tests')

-        cpu_input = self._get_input()
-        type_map = {torch.double: torch.float}
-        cpu_input_tuple = cpu_input if isinstance(cpu_input, tuple) else (cpu_input,)
-
-        is_any_input_complex = any(isinstance(t, torch.Tensor) and t.dtype.is_complex for t in cpu_input_tuple)
-
-        gpu_input_tuple = to_gpu(cpu_input_tuple, type_map=type_map)
-
-        cpu_module = self.constructor(*self.constructor_args)
-        gpu_module = self.constructor(*self.constructor_args).float().cuda()
-        cpu_param = test_case._get_parameters(cpu_module)
-        gpu_param = test_case._get_parameters(gpu_module)
-        for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
-            gpu_p.data.copy_(cpu_p)
-
-        test_case._zero_grad_input(cpu_input_tuple)
-        test_case._zero_grad_input(gpu_input_tuple)
-        test_case._zero_grad_parameters(cpu_module)
-        test_case._zero_grad_parameters(gpu_module)
-        cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
-        gpu_output = test_case._forward(gpu_module, gpu_input_tuple)
-        if getattr(cpu_module, "return_indices", False):
-            cpu_output = cpu_output[0]
-            gpu_output = gpu_output[0]
-        test_case.assertEqual(cpu_output, gpu_output, atol=self.precision, rtol=0, exact_dtype=False)
-
-        # Run backwards on CPU and GPU and compare results
-        for _ in range(5):
-            cpu_gradOutput = cpu_output.clone().normal_()
-            gpu_gradOutput = cpu_gradOutput.type_as(gpu_output)
-            cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput)
-            gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput)
-            test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-            for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
-                test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0)
-
-        # Run double-backwards on CPU and GPU and compare results
-        if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
-            cpu_output = cpu_module(*cpu_input_tuple)
-            gpu_output = gpu_module(*gpu_input_tuple)
+        with set_default_dtype(self.default_dtype):
+            cpu_input = self._get_input()
+
+            type_map = {torch.double: torch.float}
+            cpu_input_tuple = cpu_input if isinstance(cpu_input, tuple) else (cpu_input,)
+
+            is_any_input_complex = any(isinstance(t, torch.Tensor) and t.dtype.is_complex for t in cpu_input_tuple)
+
+            gpu_input_tuple = to_gpu(cpu_input_tuple, type_map=type_map)
+
+            cpu_module = self.constructor(*self.constructor_args)
+            gpu_module = self.constructor(*self.constructor_args).float().cuda()
+            cpu_param = test_case._get_parameters(cpu_module)
+            gpu_param = test_case._get_parameters(gpu_module)
+            for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
+                gpu_p.data.copy_(cpu_p)
+
+            test_case._zero_grad_input(cpu_input_tuple)
+            test_case._zero_grad_input(gpu_input_tuple)
+            test_case._zero_grad_parameters(cpu_module)
+            test_case._zero_grad_parameters(gpu_module)
+            cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
+            gpu_output = test_case._forward(gpu_module, gpu_input_tuple)
             if getattr(cpu_module, "return_indices", False):
                 cpu_output = cpu_output[0]
                 gpu_output = gpu_output[0]
+            test_case.assertEqual(cpu_output, gpu_output, atol=self.precision, rtol=0, exact_dtype=False)
+
+            # Run backwards on CPU and GPU and compare results
+            for _ in range(5):
+                cpu_gradOutput = cpu_output.clone().normal_()
+                gpu_gradOutput = cpu_gradOutput.type_as(gpu_output)
+                cpu_gradInput = test_case._backward(cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput)
+                gpu_gradInput = test_case._backward(gpu_module, gpu_input_tuple, gpu_output, gpu_gradOutput)
+                test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
+                for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
+                    test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0)
+
+            # Run double-backwards on CPU and GPU and compare results
+            if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
+                cpu_output = cpu_module(*cpu_input_tuple)
+                gpu_output = gpu_module(*gpu_input_tuple)
+                if getattr(cpu_module, "return_indices", False):
+                    cpu_output = cpu_output[0]
+                    gpu_output = gpu_output[0]
+
+                cpu_gradOutput = torch.randn_like(cpu_output, requires_grad=True)
+                gpu_gradOutput = cpu_gradOutput.type_as(gpu_output).detach()
+                gpu_gradOutput.requires_grad = True
+
+                cpu_gradInputs = torch.autograd.grad(
+                    cpu_output,
+                    cpu_input_tuple + tuple(cpu_module.parameters()),
+                    cpu_gradOutput,
+                    create_graph=True)
+                gpu_gradInputs = torch.autograd.grad(
+                    gpu_output,
+                    gpu_input_tuple + tuple(gpu_module.parameters()),
+                    gpu_gradOutput,
+                    create_graph=True)
+
+                for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
+                    test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False)
+
+                # We mix output into the second backwards computation so that
+                # torch.autograd.grad doesn't complain that some inputs
+                # are unreachable (which can happen if you differentiate
+                # only on the gradient.
+                if is_any_input_complex:
+                    outputs_cpu = cpu_output.sum().abs() + sum(x.sum().abs() for x in cpu_gradInputs)
+                    outputs_gpu = gpu_output.sum().abs() + sum(x.sum().abs() for x in gpu_gradInputs)
+                else:
+                    outputs_cpu = cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs)
+                    outputs_gpu = gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs)

-            cpu_gradOutput = torch.randn_like(cpu_output, requires_grad=True)
-            gpu_gradOutput = cpu_gradOutput.type_as(gpu_output).detach()
-            gpu_gradOutput.requires_grad = True
-
-            cpu_gradInputs = torch.autograd.grad(
-                cpu_output,
-                cpu_input_tuple + tuple(cpu_module.parameters()),
-                cpu_gradOutput,
-                create_graph=True)
-            gpu_gradInputs = torch.autograd.grad(
-                gpu_output,
-                gpu_input_tuple + tuple(gpu_module.parameters()),
-                gpu_gradOutput,
-                create_graph=True)
-
-            for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
-                test_case.assertEqual(cpu_d_i, gpu_d_i, atol=self.precision, rtol=0, exact_dtype=False)
-
-            # We mix output into the second backwards computation so that
-            # torch.autograd.grad doesn't complain that some inputs
-            # are unreachable (which can happen if you differentiate
-            # only on the gradient.
-            if is_any_input_complex:
-                outputs_cpu = cpu_output.sum().abs() + sum(x.sum().abs() for x in cpu_gradInputs)
-                outputs_gpu = gpu_output.sum().abs() + sum(x.sum().abs() for x in gpu_gradInputs)
-            else:
-                outputs_cpu = cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs)
-                outputs_gpu = gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs)
-
-            cpu_gg = torch.autograd.grad(
-                outputs_cpu,
-                cpu_input_tuple + (cpu_gradOutput,) + tuple(cpu_module.parameters()),
-                retain_graph=True)
-            gpu_gg = torch.autograd.grad(
-                outputs_gpu,
-                gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
-                retain_graph=True)
-            test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
-            for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
-                test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False)
+                cpu_gg = torch.autograd.grad(
+                    outputs_cpu,
+                    cpu_input_tuple + (cpu_gradOutput,) + tuple(cpu_module.parameters()),
+                    retain_graph=True)
+                gpu_gg = torch.autograd.grad(
+                    outputs_gpu,
+                    gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
+                    retain_graph=True)
+                test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
+                for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
+                    test_case.assertEqual(cpu_d_p, gpu_d_p, atol=self.precision, rtol=0, exact_dtype=False)

-        self.test_noncontig(test_case, gpu_module, gpu_input_tuple)
+            self.test_noncontig(test_case, gpu_module, gpu_input_tuple)


 class InputVariableMixin:
@@ -4445,42 +4742,46 @@ def __init__(self, *args, **kwargs):
         self.with_tf32 = kwargs.get('with_tf32', True)
         self.tf32_precision = kwargs.get('tf32_precision', 0.001)
         self.check_batched_grad = kwargs.get('check_batched_grad', True)
+        self.default_dtype = kwargs.get('default_dtype', None)
+        if self.default_dtype is None:
+            self.default_dtype = torch.get_default_dtype()

     def __call__(self, test_case):
-        module = self.constructor(*self.constructor_args)
-        input = self._get_input()
+        with set_default_dtype(self.default_dtype):
+            module = self.constructor(*self.constructor_args)
+            input = self._get_input()

-        # Check that these methods don't raise errors
-        module.__repr__()
-        str(module)
+            # Check that these methods don't raise errors
+            module.__repr__()
+            str(module)

-        target = self._get_target()
+            target = self._get_target()

-        if self.reference_fn is not None:
-            out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args)
-            ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,)
-            expected_out = self.reference_fn(*ref_args)
-            test_case.assertEqual(out, expected_out)
+            if self.reference_fn is not None:
+                out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args)
+                ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,)
+                expected_out = self.reference_fn(*ref_args)
+                test_case.assertEqual(out, expected_out)

-        if self.check_forward_only:
-            return
+            if self.check_forward_only:
+                return

-        params = tuple(x for x in module.parameters())
-        if not isinstance(input, tuple):
-            inputs = (input,) + params + (target,)
+            params = tuple(x for x in module.parameters())
+            if not isinstance(input, tuple):
+                inputs = (input,) + params + (target,)

-            def apply_fn(input, target, *params):
-                return module(input, target)
-        else:
-            inputs = input + params + (target,)
+                def apply_fn(input, target, *params):
+                    return module(input, target)
+            else:
+                inputs = input + params + (target,)

-            def apply_fn(input1, input2, target, *params):  # type: ignore[misc]
-                return module(input1, input2, target)
+                def apply_fn(input1, input2, target, *params):  # type: ignore[misc]
+                    return module(input1, input2, target)

-        gradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)
+            gradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)

-        if self.check_gradgrad:
-            gradgradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)
+            if self.check_gradgrad:
+                gradgradcheck(apply_fn, inputs, check_batched_grad=self.check_batched_grad)

     def test_cuda(self, test_case, dtype, extra_args=None):
         def convert_dtype(obj, dtype, requires_grad=False):
@@ -4494,43 +4795,44 @@ def convert_dtype(obj, dtype, requires_grad=False):
         if not TEST_CUDA or not self.should_test_cuda:
             raise unittest.SkipTest('Excluded from CUDA tests')

-        cpu_input = self._get_input()
-        cpu_target = self._get_target()
-        cpu_module = self.constructor(*self.constructor_args)
-        gpu_module = self.constructor(*self.constructor_args)
-
-        # Convert input, target and module parameters to dtype
-        cpu_input = convert_dtype(cpu_input, dtype, True)
-        if cpu_target.is_floating_point() or cpu_target.is_complex():
-            cpu_target = convert_dtype(cpu_target, dtype)
-        cpu_module.type(dtype)
-        gpu_module.type(dtype)
-
-        # GPU setup
-        gpu_input = to_gpu(cpu_input)
-        gpu_target = to_gpu(cpu_target)
-        gpu_module.cuda()
-
-        # torch.HalfTensor doesn't support most operations, converting back to default
-        if dtype in {torch.half, torch.bfloat16}:
+        with set_default_dtype(self.default_dtype):
             cpu_input = self._get_input()
             cpu_target = self._get_target()
-            # Loss modules with weights require consistent input/module weight types
             cpu_module = self.constructor(*self.constructor_args)
-
-        cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args)
-        gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args)
-        # dtype used to be able to be None, so set precision in this way instead of a precision map
-        test_case.assertEqual(cpu_output, gpu_output,
-                              atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
-
-        cpu_gradInput = test_case._backward_criterion(
-            cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
-        gpu_gradInput = test_case._backward_criterion(
-            gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
-        # dtype used to be able to be None, so set precision in this way instead of a precision map
-        test_case.assertEqual(cpu_gradInput, gpu_gradInput,
-                              atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
+            gpu_module = self.constructor(*self.constructor_args)
+
+            # Convert input, target and module parameters to dtype
+            cpu_input = convert_dtype(cpu_input, dtype, True)
+            if cpu_target.is_floating_point() or cpu_target.is_complex():
+                cpu_target = convert_dtype(cpu_target, dtype)
+            cpu_module.type(dtype)
+            gpu_module.type(dtype)
+
+            # GPU setup
+            gpu_input = to_gpu(cpu_input)
+            gpu_target = to_gpu(cpu_target)
+            gpu_module.cuda()
+
+            # torch.HalfTensor doesn't support most operations, converting back to default
+            if dtype in {torch.half, torch.bfloat16}:
+                cpu_input = self._get_input()
+                cpu_target = self._get_target()
+                # Loss modules with weights require consistent input/module weight types
+                cpu_module = self.constructor(*self.constructor_args)
+
+            cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args)
+            gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args)
+            # dtype used to be able to be None, so set precision in this way instead of a precision map
+            test_case.assertEqual(cpu_output, gpu_output,
+                                  atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)
+
+            cpu_gradInput = test_case._backward_criterion(
+                cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
+            gpu_gradInput = test_case._backward_criterion(
+                gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
+            # dtype used to be able to be None, so set precision in this way instead of a precision map
+            test_case.assertEqual(cpu_gradInput, gpu_gradInput,
+                                  atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0, exact_dtype=False)

     def _get_target(self):
         return self._get_arg('target', False)
diff --git a/torch/testing/_internal/hypothesis_utils.py b/torch/testing/_internal/hypothesis_utils.py
index 15e7b4512a42a3..4ace78f7594e33 100644
--- a/torch/testing/_internal/hypothesis_utils.py
+++ b/torch/testing/_internal/hypothesis_utils.py
@@ -189,7 +189,7 @@ def array_shapes(draw, min_dims=1, max_dims=None, min_side=1, max_side=None, max
     (If `qparams` arg is None), returns None.
""" @st.composite -def tensor(draw, shapes=None, elements=None, qparams=None): +def tensor(draw, shapes=None, elements=None, qparams=None, dtype=np.float32): if isinstance(shapes, SearchStrategy): _shape = draw(shapes) else: @@ -197,7 +197,7 @@ def tensor(draw, shapes=None, elements=None, qparams=None): if qparams is None: if elements is None: elements = floats(-1e6, 1e6, allow_nan=False, width=32) - X = draw(stnp.arrays(dtype=np.float32, elements=elements, shape=_shape)) + X = draw(stnp.arrays(dtype=dtype, elements=elements, shape=_shape)) assume(not (np.isnan(X).any() or np.isinf(X).any())) return X, None qparams = draw(qparams) @@ -205,7 +205,7 @@ def tensor(draw, shapes=None, elements=None, qparams=None): min_value, max_value = _get_valid_min_max(qparams) elements = floats(min_value, max_value, allow_infinity=False, allow_nan=False, width=32) - X = draw(stnp.arrays(dtype=np.float32, elements=elements, shape=_shape)) + X = draw(stnp.arrays(dtype=dtype, elements=elements, shape=_shape)) # Recompute the scale and zero_points according to the X statistics. scale, zp = _calculate_dynamic_qparams(X, qparams[2]) enforced_zp = _ENFORCED_ZERO_POINT.get(qparams[2], None)