huggingface · jp1924 · Jan 6, 2025 · Jan 8, 2025
@@ -151,6 +151,13 @@ def __call__(
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
+        num_image_tokens = sum([txt.count(self.image_token) for txt in text])
+        num_images = len(image_inputs["pixel_values"]) if image_inputs else 0
+        if num_image_tokens != num_images:
+            raise ValueError(
+                f"The number of image token ({num_image_tokens}) should be the same as in the number of provided images ({num_images})"
+            )
+
         # try to expand inputs in processing if we have the necessary parts
         prompt_strings = text
         if image_inputs.get("pixel_values") is not None:

@@ -147,6 +147,13 @@ def __call__(
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
+        num_image_tokens = sum([txt.count(self.image_token) for txt in text])
+        num_images = len(image_inputs["pixel_values"]) if image_inputs else 0
+        if num_image_tokens != num_images:
+            raise ValueError(
+                f"The number of image token ({num_image_tokens}) should be the same as in the number of provided images ({num_images})"
+            )
+
         prompt_strings = text
         if image_inputs:
             if self.patch_size is None or self.vision_feature_select_strategy is None:

@@ -173,6 +173,19 @@ def __call__(
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
+        num_image_tokens = sum([txt.count(self.image_token) for txt in text])
+        num_images = len(image_inputs["pixel_values"]) if image_inputs else 0
+        if num_image_tokens != num_images:
+            raise ValueError(
+                f"The number of image token ({num_image_tokens}) should be the same as in the number of provided images ({num_images})"
+            )
+        num_video_tokens = sum([txt.count(self.video_token) for txt in text])
+        num_videos = len(videos_inputs["pixel_values_videos"]) if videos_inputs else 0
+        if num_video_tokens != num_videos:
+            raise ValueError(
+                f"The number of video token ({num_video_tokens}) should be the same as in the number of provided videos ({num_videos})"
+            )
+
         if self.patch_size is None or self.vision_feature_select_strategy is None:
             logger.warning_once(
                 "Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "

@@ -154,6 +154,13 @@ def __call__(
         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
 
+            num_image_tokens = sum([txt.count(self.image_token) for txt in text])
+            num_images = len(image_inputs["pixel_values"])
+            if num_image_tokens != num_images:
+                raise ValueError(
+                    f"The number of image token ({num_image_tokens}) should be the same as in the number of provided images ({num_images})"
+                )
+
             image_sizes = iter(image_inputs["image_sizes"])
             height, width = get_image_size(
                 to_numpy_array(image_inputs["pixel_values"][0][0]),
@@ -164,6 +171,13 @@ def __call__(
         if videos is not None:
             video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
 
+            num_video_tokens = sum([txt.count(self.video_token) for txt in text])
+            num_videos = len(video_inputs["pixel_values_videos"])
+            if num_video_tokens != num_videos:
+                raise ValueError(
+                    f"The number of video token ({num_video_tokens}) should be the same as in the number of provided videos ({num_videos})"
+                )
+
             one_video = to_numpy_array(video_inputs["pixel_values_videos"][0])
             height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
             num_frames = one_video.shape[0]  # frame dim is always after batch dim

@@ -113,6 +113,7 @@ def __call__(
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
+
         if images is not None:
             image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
             image_grid_thw = image_inputs["image_grid_thw"]
@@ -129,6 +130,21 @@ def __call__(
 
         if not isinstance(text, list):
             text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        num_image_tokens = sum([txt.count(self.image_token) for txt in text])
+        num_images = len(image_inputs["pixel_values"]) if image_inputs else 0
+        if num_image_tokens != num_images:
+            raise ValueError(
+                f"The number of image token ({num_image_tokens}) should be the same as in the number of provided images ({num_images})"
+            )
+        num_video_tokens = sum([txt.count(self.video_token) for txt in text])
+        num_videos = len(videos_inputs["pixel_values_videos"]) if videos_inputs else 0
+        if num_video_tokens != num_videos:
+            raise ValueError(
+                f"The number of video token ({num_video_tokens}) should be the same as in the number of provided videos ({num_videos})"
+            )
 
         if image_grid_thw is not None:
             merge_length = self.image_processor.merge_size**2