From 22c3a582cb922b2fed6641b3976dbfd07143bc08 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:08:33 +0530
Subject: [PATCH 01/13] changes to modeling to accommodate custom 4D masks

---
 src/transformers/models/gpt2/modeling_gpt2.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index df3d88eda8cad9..24cecd3bbe986f 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -815,6 +815,30 @@ def forward(
             position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
             position_ids = position_ids.unsqueeze(0)

+        # Convert 4D attention mask to 2D if necessary
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # Extract the relevant information from the 4D mask
+            # Assuming mask of shape [batch_size, 1, seq_len, seq_len]
+            # where 0 means "attend" and negative infinity means "don't attend"
+            attention_mask = (attention_mask[:, 0, 0] > -1).float()  # Convert to [batch_size, seq_len]
+            attention_mask = attention_mask.view(batch_size, -1)
+
+        # GPT2's original attention mask handling
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+            attention_mask = attention_mask.view(batch_size, -1)
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            attention_mask = attention_mask[:, None, None, :]
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and the dtype's smallest value for masked positions.
+            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
         position_embeds = self.wpe(position_ids)

From ac6636d3d318e7186b3622d65b78bf8972be0fe5 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:12:28 +0530
Subject: [PATCH 02/13] initialise test file with GPT-2 modeling imports

---
 tests/models/gpt2/test_modeling_4D_attention_mask.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 tests/models/gpt2/test_modeling_4D_attention_mask.py

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
new file mode 100644
index 00000000000000..dafc20ff115f2f
--- /dev/null
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -0,0 +1,5 @@
+import unittest
+import torch
+from transformers import AutoTokenizer
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+

From 32d9db230b7186bc261f024ce1d0f080ce0d099d Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:14:13 +0530
Subject: [PATCH 03/13] set up basic class for testing attention masks for gpt2

---
 tests/models/gpt2/test_modeling_4D_attention_mask.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index dafc20ff115f2f..ffc7f4632e493b 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -3,3 +3,9 @@
 from transformers import AutoTokenizer
 from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel

+class TestAttentionMaskIssue(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_name = "gpt2"
+        cls.model = GPT2LMHeadModel.from_pretrained(cls.model_name)
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
\ No newline at end of file

From b2f0f80ee4309b4afe94e39a442a63cacb0fe1cc Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:15:37 +0530
Subject: [PATCH 04/13] add method for preparing test data

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index ffc7f4632e493b..3b244926c33c83 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -8,4 +8,43 @@ class TestAttentionMaskIssue(unittest.TestCase):
     def setUpClass(cls):
         cls.model_name = "gpt2"
         cls.model = GPT2LMHeadModel.from_pretrained(cls.model_name)
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
\ No newline at end of file
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
+
+    def prepare_data(self, packing=False):
+        texts = [
+            "Hello, how are you?",
+            "When is the next holiday?",
+            "China is a great country.",
+        ]
+        encoded = self.tokenizer(texts)
+
+        if packing:
+            total_length = sum(len(x) for x in encoded["input_ids"])
+            input_ids = torch.zeros((1, total_length), dtype=torch.long)
+            # Create 4D attention mask with proper shape
+            attention_mask = torch.full(
+                (1, 1, total_length, total_length),
+                dtype=torch.float32,
+                fill_value=float("-inf")
+            )
+
+            offset = 0
+            for i, (ids, mask) in enumerate(zip(encoded["input_ids"], encoded["attention_mask"])):
+                length = len(ids)
+                input_ids[0, offset:offset + length] = torch.tensor(ids)
+                # Set valid attention positions to 0
+                attention_mask[0, 0, offset:offset + length, :offset + length] = 0.
+                offset += length
+
+            return input_ids, attention_mask
+        else:
+            # Regular batched processing
+            max_length = max(len(x) for x in encoded["input_ids"])
+            input_ids = torch.zeros((len(texts), max_length), dtype=torch.long)
+            attention_mask = torch.zeros((len(texts), max_length), dtype=torch.long)
+
+            for i, (ids, mask) in enumerate(zip(encoded["input_ids"], encoded["attention_mask"])):
+                input_ids[i, :len(ids)] = torch.tensor(ids)
+                attention_mask[i, :len(mask)] = torch.tensor(mask)
+
+            return input_ids, attention_mask
\ No newline at end of file

From cbc2344e5bf900fa8acddc4a94f6ab5639c4411b Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:17:16 +0530
Subject: [PATCH 05/13] add test for testing attention mask shapes

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index 3b244926c33c83..d512803f832d62 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -47,4 +47,22 @@ def prepare_data(self, packing=False):
             input_ids[i, :len(ids)] = torch.tensor(ids)
             attention_mask[i, :len(mask)] = torch.tensor(mask)

-        return input_ids, attention_mask
\ No newline at end of file
+        return input_ids, attention_mask
+
+    def test_attention_mask_shapes(self):
+        # Test both regular and packed versions
+        input_ids_regular, mask_regular = self.prepare_data(packing=False)
+        output_regular = self.model(input_ids=input_ids_regular, attention_mask=mask_regular)
+
+        input_ids_packed, mask_packed = self.prepare_data(packing=True)
+        output_packed = self.model(input_ids=input_ids_packed, attention_mask=mask_packed)
+
+        # Verify outputs have expected shapes
+        self.assertEqual(
+            output_regular.logits.shape[:-1],
+            input_ids_regular.shape
+        )
+        self.assertEqual(
+            output_packed.logits.shape[:-1],
+            input_ids_packed.shape
+        )
\ No newline at end of file

From b260df7098d53c8dd9d572be20792ff05cb3cc66 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:18:00 +0530
Subject: [PATCH 06/13] add test for testing attention patterns

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index d512803f832d62..bb30c74dd57da1 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -65,4 +65,18 @@ def test_attention_mask_shapes(self):
         self.assertEqual(
             output_packed.logits.shape[:-1],
             input_ids_packed.shape
-        )
\ No newline at end of file
+        )
+
+    def test_attention_patterns(self):
+        # Test that attention patterns are preserved
+        input_ids, mask_4d = self.prepare_data(packing=True)
+
+        # Create equivalent 2D mask
+        mask_2d = (mask_4d[:, 0, 0] > -1).float()
+
+        # Compare outputs
+        output_4d = self.model(input_ids, attention_mask=mask_4d)
+        output_2d = self.model(input_ids, attention_mask=mask_2d)
+
+        # Outputs should be nearly identical
+        torch.testing.assert_close(output_4d.logits, output_2d.logits)
\ No newline at end of file

From 43bdb10edc80eb388b64c71160b5c2ef3b50b63e Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:19:29 +0530
Subject: [PATCH 07/13] add test for testing causal attention

---
 tests/models/gpt2/test_modeling_4D_attention_mask.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index bb30c74dd57da1..bbee1e91e7396d 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -79,4 +79,14 @@ def test_attention_patterns(self):
         output_2d = self.model(input_ids, attention_mask=mask_2d)

         # Outputs should be nearly identical
-        torch.testing.assert_close(output_4d.logits, output_2d.logits)
\ No newline at end of file
+        torch.testing.assert_close(output_4d.logits, output_2d.logits)
+
+    def test_causal_attention(self):
+        # Test causal attention is preserved with 4D masks
+        input_ids, mask_4d = self.prepare_data(packing=True)
+        outputs = self.model(input_ids, attention_mask=mask_4d, output_attentions=True)
+
+        # Verify upper triangle is masked
+        attentions = outputs.attentions[0]  # First layer
+        upper_triangle = torch.triu(attentions, diagonal=1)
+        assert torch.all(upper_triangle == 0)
\ No newline at end of file

From c9bbc1717360da3b0d04e9de432e67d76351a5ea Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:35:22 +0530
Subject: [PATCH 08/13] add test for testing batch consistency

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 36 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index bbee1e91e7396d..c99ed44b3e76ed 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -89,4 +89,38 @@ def test_causal_attention(self):
         # Verify upper triangle is masked
         attentions = outputs.attentions[0]  # First layer
         upper_triangle = torch.triu(attentions, diagonal=1)
-        assert torch.all(upper_triangle == 0)
\ No newline at end of file
+        assert torch.all(upper_triangle == 0)
+
+    def test_causal_attention(self):
+        # Test causal attention is preserved with 4D masks
+        input_ids, mask_4d = self.prepare_data(packing=True)
+        outputs = self.model(input_ids, attention_mask=mask_4d, output_attentions=True)
+
+        # Verify upper triangle is masked
+        attentions = outputs.attentions[0]  # First layer
+        upper_triangle = torch.triu(attentions, diagonal=1)
+        assert torch.all(upper_triangle == 0)
+
+    def test_batch_consistency(self):
+        # Test consistency across different batch sizes
+        input_ids, mask_4d = self.prepare_data(packing=True)
+
+        # Single batch
+        single_output = self.model(
+            input_ids[:1],
+            attention_mask=mask_4d[:1]
+        )
+
+        # Multiple batches
+        multi_output = self.model(
+            input_ids,
+            attention_mask=mask_4d
+        )
+
+        # First batch should give same results
+        torch.testing.assert_close(
+            single_output.logits,
+            multi_output.logits[:1],
+            rtol=1e-5,
+            atol=1e-5
+        )
\ No newline at end of file

From 370ff491aa6cf3b62a9908a9be297cbba736d3e9 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:36:04 +0530
Subject: [PATCH 09/13] add test for testing edge cases

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index c99ed44b3e76ed..a144fa251cab9c 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -123,4 +123,29 @@ def test_batch_consistency(self):
             multi_output.logits[:1],
             rtol=1e-5,
             atol=1e-5
-        )
\ No newline at end of file
+        )
+
+    def test_edge_cases(self):
+        # Test edge cases
+
+        # 1. Empty sequence (just padding)
+        empty_ids = torch.zeros((1, 10), dtype=torch.long)
+        empty_mask = torch.full((1, 1, 10, 10), float("-inf"))
+        outputs = self.model(empty_ids, attention_mask=empty_mask)
+        self.assertEqual(outputs.logits.shape, (1, 10, self.model.config.vocab_size))
+
+        # 2. Single token
+        single_token = torch.tensor([[1]], dtype=torch.long)
+        single_mask = torch.zeros((1, 1, 1, 1))
+        outputs = self.model(single_token, attention_mask=single_mask)
+        self.assertEqual(outputs.logits.shape, (1, 1, self.model.config.vocab_size))
+
+        # 3. Maximum context length
+        max_length = self.model.config.max_position_embeddings
+        long_ids = torch.ones((1, max_length), dtype=torch.long)
+        long_mask = torch.zeros((1, 1, max_length, max_length))
+        outputs = self.model(long_ids, attention_mask=long_mask)
+        self.assertEqual(
+            outputs.logits.shape,
+            (1, max_length, self.model.config.vocab_size)
+        )
\ No newline at end of file

From bdfd4f61b65a549b2a96144b669f097457b720c5 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:36:50 +0530
Subject: [PATCH 10/13] add test for testing 4D mask handling

---
 .../models/gpt2/test_modeling_4D_attention_mask.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index a144fa251cab9c..4fcc4f6421cbb1 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -148,4 +148,16 @@ def test_edge_cases(self):
         self.assertEqual(
             outputs.logits.shape,
             (1, max_length, self.model.config.vocab_size)
-        )
\ No newline at end of file
+        )
+
+    def test_4d_mask_handling(self):
+        """Critical test: Verify 4D attention mask is handled correctly"""
+        # Prepare packed sequence with 4D mask
+        input_ids, mask_4d = self.prepare_packed_sequence()
+
+        # Should run without errors and produce valid outputs
+        try:
+            outputs = self.model(input_ids, attention_mask=mask_4d)
+            self.assertIsNotNone(outputs.logits)
+        except Exception as e:
+            self.fail(f"Failed to handle 4D mask: {e}")
\ No newline at end of file

From 08dc3f00200347c3f71ba36631a0c02f823ac76e Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:37:47 +0530
Subject: [PATCH 11/13] add test for testing 2D vs 4D behaviour to ensure consistency

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index 4fcc4f6421cbb1..ec556d71453fe9 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -160,4 +160,46 @@ def test_4d_mask_handling(self):
             outputs = self.model(input_ids, attention_mask=mask_4d)
             self.assertIsNotNone(outputs.logits)
         except Exception as e:
-            self.fail(f"Failed to handle 4D mask: {e}")
\ No newline at end of file
+            self.fail(f"Failed to handle 4D mask: {e}")
+
+    def test_2d_vs_4d_mask_behavior(self):
+        """Test that 2D and 4D masks produce consistent behavior"""
+        model = GPT2LMHeadModel.from_pretrained(
+            self.model_name,
+            attn_implementation="eager"
+        )
+
+        # Create a simple sequence with both 2D and 4D masks
+        input_ids = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)
+
+        # 2D mask: [1, 1, 0, 0] (masking last two tokens)
+        mask_2d = torch.tensor([[1, 1, 0, 0]], dtype=torch.float)
+
+        # Equivalent 4D mask
+        mask_4d = torch.full((1, 1, 4, 4), float('-inf'))
+        mask_4d[0, 0, :2, :2] = 0  # Allow attention for first two tokens
+
+        # Get outputs for both masks
+        outputs_2d = model(
+            input_ids,
+            attention_mask=mask_2d,
+            output_attentions=True
+        )
+
+        outputs_4d = model(
+            input_ids,
+            attention_mask=mask_4d,
+            output_attentions=True
+        )
+
+        print("\n2D mask attention patterns:")
+        print(outputs_2d.attentions[0][0, 0])  # First layer, first batch, first head
+
+        print("\n4D mask attention patterns:")
+        print(outputs_4d.attentions[0][0, 0])
+
+        print("\n2D mask:")
+        print(mask_2d)
+
+        print("\n4D mask:")
+        print(mask_4d[0, 0])
\ No newline at end of file

From 3c4e77840fc23115e3de3e9d7486b82424fdcfa5 Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:38:56 +0530
Subject: [PATCH 12/13] add helper function to prepare packed sequences for use by other tests

---
 .../gpt2/test_modeling_4D_attention_mask.py   | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index ec556d71453fe9..dfaf14ffe6a808 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -202,4 +202,35 @@ def test_2d_vs_4d_mask_behavior(self):
         print(mask_2d)

         print("\n4D mask:")
-        print(mask_4d[0, 0])
\ No newline at end of file
+        print(mask_4d[0, 0])
+
+    def prepare_packed_sequence(self):
+        """Helper to prepare a packed sequence with 4D attention mask"""
+        texts = ["Hello world", "This is a test"]
+        encoded = self.tokenizer(texts)
+
+        total_length = sum(len(x) for x in encoded["input_ids"])
+        input_ids = torch.zeros((1, total_length), dtype=torch.long)
+
+        # Create 4D attention mask initialized to -inf
+        mask_4d = torch.full(
+            (1, 1, total_length, total_length),
+            float('-inf'),
+            dtype=torch.float
+        )
+
+        offset = 0
+        for ids in encoded["input_ids"]:
+            length = len(ids)
+            input_ids[0, offset:offset + length] = torch.tensor(ids)
+            # Set valid attention positions to 0.0
+            mask_4d[0, 0, offset:offset + length, offset:offset + length] = 0.0
+            offset += length
+
+        # Add debugging print
+        print("Mask statistics:")
+        print("- Total positions:", mask_4d.numel())
+        print("- Masked positions:", (mask_4d == float('-inf')).sum().item())
+        print("- Unmasked positions:", (mask_4d == 0).sum().item())
+
+        return input_ids, mask_4d
\ No newline at end of file

From 0dd63f77239f30f6797183051c938e5ab4c420fe Mon Sep 17 00:00:00 2001
From: sambhavnoobcoder
Date: Mon, 6 Jan 2025 03:39:39 +0530
Subject: [PATCH 13/13] add main call

---
 tests/models/gpt2/test_modeling_4D_attention_mask.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/models/gpt2/test_modeling_4D_attention_mask.py b/tests/models/gpt2/test_modeling_4D_attention_mask.py
index dfaf14ffe6a808..2948f8dc8d8c2f 100644
--- a/tests/models/gpt2/test_modeling_4D_attention_mask.py
+++ b/tests/models/gpt2/test_modeling_4D_attention_mask.py
@@ -233,4 +233,7 @@ def prepare_packed_sequence(self):
         print("- Masked positions:", (mask_4d == float('-inf')).sum().item())
         print("- Unmasked positions:", (mask_4d == 0).sum().item())

-        return input_ids, mask_4d
\ No newline at end of file
+        return input_ids, mask_4d
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
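
For reference, the short sketch below is not part of the patch series; it shows how a caller might build a block-diagonal additive 4D mask for packed sequences and pass it to GPT2LMHeadModel, mirroring the 0.0 = attend / -inf = masked convention used in the tests above. The model name, example texts, and variable names are illustrative assumptions, and the call only behaves as intended on a transformers build that includes the modeling change from PATCH 01.

# Illustrative usage sketch (assumes the 4D-mask handling from PATCH 01 is installed).
import torch
from transformers import AutoTokenizer
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

texts = ["Hello world", "This is a test"]  # example sequences to pack into one row
encoded = tokenizer(texts)["input_ids"]

total_length = sum(len(ids) for ids in encoded)
input_ids = torch.zeros((1, total_length), dtype=torch.long)

# Additive 4D mask: -inf everywhere, 0.0 where attention is allowed.
mask_4d = torch.full((1, 1, total_length, total_length), float("-inf"))

offset = 0
for ids in encoded:
    length = len(ids)
    input_ids[0, offset:offset + length] = torch.tensor(ids)
    # Each packed sequence is only allowed to attend within its own block.
    mask_4d[0, 0, offset:offset + length, offset:offset + length] = 0.0
    offset += length

with torch.no_grad():
    outputs = model(input_ids, attention_mask=mask_4d)
print(outputs.logits.shape)  # expected: (1, total_length, vocab_size)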