
IndexError: The shape of the mask [7387] at index 0 does not match the shape of the indexed tensor [1] at index 0 #515

14H034160212 commented Jan 6, 2025

🐛 Describe the bug

The error occurs when I use Qwen2-VL with the Qwen2-VL Liger kernel to generate text.

The code below produces the error shown in the screenshot. If I apply the qwen2 Liger kernel instead of the qwen2-vl one, the same code runs correctly.

[screenshot: IndexError traceback]
I then added cache_position to the lce_forward function, which produced a different error:
[screenshot: second error traceback]
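
For reference, the change I tried was roughly the following (a sketch only; the exact parameter list in liger_kernel/transformers/model/qwen2_vl.py may differ — the names below, other than the added cache_position, mirror transformers' Qwen2VLForConditionalGeneration.forward in 4.47):

def lce_forward(
    self,
    input_ids=None,
    attention_mask=None,
    position_ids=None,
    past_key_values=None,
    inputs_embeds=None,
    labels=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    pixel_values=None,
    pixel_values_videos=None,
    image_grid_thw=None,
    video_grid_thw=None,
    rope_deltas=None,
    cache_position=None,  # added so the kwarg passed in by generate() is accepted
):
    ...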

Here is how I apply the qwen2 Liger kernel:

from liger_kernel.transformers import apply_liger_kernel_to_qwen2
apply_liger_kernel_to_qwen2()

And here is how I apply the qwen2-vl Liger kernel:

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl()

Reproduce

# Reference: https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html

import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import torch
from PIL import Image, ImageDraw, ImageFont
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

from mmdoc.data.xdataset_base import PageRecord
from mmdoc.data.xdataset_entity import normalize_bbox
from mmdoc.data.xdataset_gpt_entity import GPTXDatasetBuilder
from mmdoc.paths import DATA_DIR
from mmdoc.trainers.eval.gpt_evaluation import get_content_dict, get_value_with_grounding, load_prediction
from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from peft import LoraConfig, PeftModel

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
from liger_kernel.transformers import monkey_patch
import transformers


# apply_liger_kernel_to_qwen2()

print("Applying Liger Kernel to Qwen2-VL model")
# monkey_patch.apply_liger_kernel_to_qwen2_vl(
#     # These args can be used to override the default Liger settings
#     # cross_entropy=True,
#     # fused_linear_cross_entropy=False,
# )
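# Patch Qwen2-VL here, before the model is loaded further down, so the Liger
# forward (lce_forward) replaces the stock one on the model class.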
apply_liger_kernel_to_qwen2_vl(
    rope=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,
    rms_norm=True,
    layer_norm=True,
    swiglu=True,
)
def get_data(
    dataset_name,
    doc_id,
    page_id,
    header_only: bool = False,
    line_only: bool = False,
    filter_labels: List[str] = None,
    end_to_end: bool = False,
    num_pages: int = None,
    text_grounding: bool = False,  # Add text grounding
    require_grounding: bool = False,  # Ignore typed texts without bounding box
):
    dataset_dir = f"{DATA_DIR}/{dataset_name}"
    iocr_dir = f"{dataset_dir}/iocr_json"
    field_data_dir = f"{dataset_dir}/field_data"

    page = PageRecord.load_from_disk(doc_id, page_id, iocr_dir, field_data_dir, load_json=True)
    add_page_promt = num_pages is not None
    builder = GPTXDatasetBuilder(
        name=dataset_name,
        data_dir=f"{DATA_DIR}",
        header_only=header_only,
        line_only=line_only,
        filter_labels=filter_labels,
        add_page_promt=add_page_promt,
        text_grounding=text_grounding,
        end_to_end=end_to_end,
        require_grounding=require_grounding,
    )
    doc_id_pg_str = str(doc_id) + "_" + str(page_id)
    data_json_dir = f"{dataset_dir}/dataset.json"
    with Path(data_json_dir).open(encoding="utf-8") as f:
        dataset_info = json.load(f)
        field_configs = builder._get_field_configs(dataset_info)
        annos = builder._get_annotations(
            page=page,
            field_configs=field_configs,
            page_idx=page_id,
            num_pages=num_pages,
        )
        prompt = annos["prompt"]
        json_string = annos["response_json"]
        image_path = f"{dataset_dir}/images/{doc_id_pg_str}.jpg"
        return prompt, json_string, image_path


def draw_box(
    pil_img, draw: ImageDraw, bbox, text: str, color: str, font: ImageFont, text_above: bool = True, y_shift: int = 10
):
    x0, y0, x1, y1 = normalize_bbox(bbox=bbox, from_size=(1000, 1000), to_size=pil_img.size)
    draw.rectangle(((x0, y0), (x1, y1)), outline=color, width=2)
    if text_above:  # Draw text above
        draw.text((x0 + 10, y0 - y_shift), text=text, font=font, fill=color)
    else:  # Draw text below
        draw.text((x0 + 10, y0 + y_shift), text=text, font=font, fill=color)


def _draw_lines(
    lines: List[Dict],
    color: str,
    pil_img,
    draw: ImageDraw,
    font: ImageFont,
    text_above: bool = True,
    y_shift: int = 10,
    delta: int = 0,
):
    for _row in lines:
        cnt = 0
        for field_name, _value in _row.items():
            _txt_value, _coord = get_value_with_grounding(_value)
            if _coord:
                txt_value = field_name + ": " + _txt_value
                y_shift_final = y_shift + cnt * delta  # Shift different field differently
                draw_box(pil_img, draw, _coord, txt_value, color, font, text_above, y_shift_final)
                cnt += 1


peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
# Otherwise, you need to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = "Qwen/Qwen2-VL-2B-Instruct"
# device_map = split_model('InternVL2-1B')
model_path = (  # Header only
    "/home/paperspace/mmdoc/outputs/internvl/Mainclass40GlobalShop_SPpromptTGPr/checkpoint-14800"
)
header_only = True
end_to_end = False
filter_labels = None  # "".split(",")
num_pages = 1
text_grounding = True
doc_id = 366023271
page_id = 1
max_seq_length = 1024
require_grounding = True

prompt, json_string, image_path = get_data(
    "GlobalShopWF976934_USER_50USER_20241002",
    doc_id,
    page_id,
    header_only=header_only,
    filter_labels=filter_labels,
    end_to_end=end_to_end,
    num_pages=num_pages,
    text_grounding=text_grounding,
    require_grounding=require_grounding,
)

# model_cls = InternVLChatModel
# model = model_cls.from_pretrained(
#     # model_path,
#     path,
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True,
#     trust_remote_code=True,
# ).eval()


model = Qwen2VLForConditionalGeneration.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
model = get_peft_model(model, peft_config)
model.to("cuda")
processor = transformers.AutoProcessor.from_pretrained(path)
processor.tokenizer.pad_token = processor.tokenizer.eos_token
# processor = AutoProcessor.from_pretrained(path)
# tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

start_time = time.time()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_path,
            },
            {"type": "text", "text": prompt},
        ],
    },
    {"role": "assistant", "content": [{"type": "text", "text": json_string}]},
]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
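# process_vision_info (from qwen_vl_utils) extracts the image/video entries
# referenced in the chat messages and loads them for the processor.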
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
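# With apply_liger_kernel_to_qwen2_vl applied, this generate() call raises the
# IndexError shown above; with apply_liger_kernel_to_qwen2 it completes normally.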
generated_ids = model.generate(**inputs.data, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Versions

transformers==4.47.1
liger_kernel==0.5.2
