llava-next-video inference result is empty #498

Open

AmazDeng opened this issue Jun 3, 2024 · 4 comments

AmazDeng commented Jun 3, 2024

I tested the code srt_example_llava_v.py and found that it does not produce any inference results (the output is empty). The code is as follows:

srt_example_llava_v.py

"""
Usage: python3 srt_example_llava_v.py
"""

import sglang as sgl
import os
import csv
import time
import argparse

@sgl.function
def video_qa(s, num_frames, video_path, question):
    s += sgl.user(sgl.video(video_path,num_frames) + question)
    s += sgl.assistant(sgl.gen("answer"))


def single(path, num_frames=16):
    print(f"single path={path}")
    state = video_qa.run(
        num_frames=num_frames,
        video_path=path,
        question="Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes",
        # temperature=0.0,
        max_new_tokens=1024,
    )
    print(f"single state={state}\n")


def split_into_chunks(lst, num_chunks):
    """Split a list into a specified number of chunks."""
    # Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible.
    chunk_size = len(lst) // num_chunks

    if chunk_size == 0:
        chunk_size = len(lst)
    # Use list comprehension to generate chunks. The last chunk will take any remainder if the list size isn't evenly divisible.
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    # Ensure we have exactly num_chunks chunks, even if some are empty
    chunks.extend([[] for _ in range(num_chunks - len(chunks))])
    return chunks


def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir):
    csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['video_name', 'answer'])
        for video_path, state in zip(batch_video_files, states):
            video_name = os.path.basename(video_path)
            # writer.writerow([video_name, state["answer"]])
            writer.writerow([video_name, state])

def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
    final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv"
    with open(final_csv_filename, 'w', newline='') as final_csvfile:
        writer = csv.writer(final_csvfile)
        writer.writerow(['video_name', 'answer'])
        for batch_idx in range(num_batches):
            batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
            with open(batch_csv_filename, 'r') as batch_csvfile:
                reader = csv.reader(batch_csvfile)
                next(reader)  # Skip header row
                for row in reader:
                    writer.writerow(row)
            os.remove(batch_csv_filename)

def find_video_files(video_dir):
    # Check if the video_dir is actually a file
    if os.path.isfile(video_dir):
        # If it's a file, return it as a single-element list
        return [video_dir]
    
    # Original logic to find video files in a directory
    video_files = []
    for root, dirs, files in os.walk(video_dir):
        for file in files:
            if file.endswith(('.mp4', '.avi', '.mov')):
                video_files.append(os.path.join(root, file))
    return video_files

def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64):
    print(f"batch,video_dir={video_dir}")
    video_files = find_video_files(video_dir)
    print(f"batch,video_files={video_files}")
    chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk]
    num_batches = 0

    for i in range(0, len(chunked_video_files), batch_size):
        batch_video_files = chunked_video_files[i:i + batch_size]
        print(f"Processing batch of {len(batch_video_files)} video(s)...")

        if not batch_video_files:
            print("No video files found in the specified directory.")
            return
        
        batch_input = [
            {   
                "num_frames": num_frames,
                "video_path": video_path,
                "question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.",
            } for video_path in batch_video_files
        ]

        start_time = time.time()
        states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2)
        total_time = time.time() - start_time
        average_time = total_time / len(batch_video_files)
        print(f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds")
        print(f"batch,states={states}")
        save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir)
        num_batches += 1

    compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir)


if __name__ == "__main__":

    # Create the parser
    parser = argparse.ArgumentParser(description='Run video processing with specified port.')

    # Add an argument for the port
    parser.add_argument('--port', type=int, default=30000, help='The master port for distributed serving.')
    parser.add_argument('--chunk-idx', type=int, default=0, help='The index of the chunk to process.')
    parser.add_argument('--num-chunks', type=int, default=8, help='The number of chunks to process.')
    parser.add_argument('--save-dir', type=str, default="/media/star/8T/tmp/llava_video", help='The directory to save the processed video files.')
    parser.add_argument('--video-dir', type=str, default="/media/star/8T/tmp/gpt4v/video/tmp", help='The directory or path for the processed video files.')
    parser.add_argument('--model-path', type=str, default="/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO", help='The model path for the video processing.')
    parser.add_argument('--num-frames', type=int, default=16, help='The number of frames to process in each video.' )
    parser.add_argument("--mm_spatial_pool_stride", type=int, default=2)

    # Parse the arguments
    args = parser.parse_args()

    cur_port = args.port

    cur_chunk = args.chunk_idx

    num_chunks = args.num_chunks

    num_frames = args.num_frames

    if "34b" in args.model_path.lower():
        tokenizer_path = "liuhaotian/llava-v1.6-34b-tokenizer"
    elif "7b" in args.model_path.lower():
        # tokenizer_path = "llava-hf/llava-1.5-7b-hf"
        tokenizer_path="/media/star/8T/model/gpt/llava/llava-hf/llava-1.5-7b-hf"
    else:
        print("Invalid model path. Please specify a valid model path.")
        exit()

    model_overide_args = {}

    model_overide_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
    model_overide_args["num_frames"] = args.num_frames
    model_overide_args["model_type"] = "llava"
    model_overide_args["mm_vision_tower"] = "/media/star/8T/model/clip/openai_clip/clip-vit-large-patch14-336"

    if "34b" in args.model_path.lower():
        model_overide_args["image_token_index"] = 64002


    if args.num_frames == 32:
        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
        model_overide_args["max_sequence_length"] = 4096 * 2
        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
    elif args.num_frames < 32:
        pass
    else:
        print("The maximum number of frames to process is 32. Please specify a valid number of frames.")
        exit()


    runtime = sgl.Runtime(
        model_path=args.model_path, #"liuhaotian/llava-v1.6-vicuna-7b",
        tokenizer_path=tokenizer_path,
        port=cur_port,
        additional_ports=[cur_port+1,cur_port+2,cur_port+3,cur_port+4],
        model_overide_args=model_overide_args,
        tp_size=1
    )
    sgl.set_default_backend(runtime)
    print(f"chat template: {runtime.endpoint.chat_template.name}")


    # Run a single request
    # try:
    print("\n========== single ==========\n")
    root = args.video_dir
    if os.path.isfile(root):
        video_files = [root]
    else:
        video_files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith(('.mp4', '.avi', '.mov'))]  # Add more extensions if needed
    start_time = time.time()  # Start time for processing a single video
    for cur_video in video_files[:1]:
        print(cur_video)
        single(cur_video, num_frames)
    end_time = time.time()  # End time for processing a single video
    total_time = end_time - start_time
    average_time = total_time / len(video_files)  # Calculate the average processing time
    print(f"Average processing time per video: {average_time:.2f} seconds")
    runtime.shutdown()
    # except Exception as e:
    #     print(e)
    runtime.shutdown()


    # # # Run a batch of requests
    # print("\n========== batch ==========\n")
    # if not os.path.exists(args.save_dir):
    #     os.makedirs(args.save_dir)
    # batch(args.video_dir,args.save_dir,cur_chunk, num_chunks, num_frames, num_chunks)
    # runtime.shutdown()
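
For reference, the generated text can also be read back explicitly from the named gen slot ("answer" is the key passed to sgl.gen("answer"), the same key as the commented-out state["answer"] above), rather than relying on the ProgramState repr. A minimal sketch, assuming the video_qa program above, a runtime already set as the default backend, and a placeholder video path:

# Sketch: inspect the generated answer directly after a run.
# Assumes sgl.set_default_backend(runtime) has already been called; the video path is a placeholder.
state = video_qa.run(
    num_frames=16,
    video_path="/path/to/some_video.mp4",
    question="Please describe the video.",
    max_new_tokens=256,
)
print(repr(state["answer"]))  # what this issue reports as empty
print(state.text())           # full prompt plus completion as a single string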

srt_example_llava_v.sh

#!/bin/bash

##### USAGE #####
#    - First node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K 0 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - Second node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K 1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - The K node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K K-1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```


# Replace `K`, `YOUR_VIDEO_PATH`, `YOUR_MODEL_PATH`, and `FRAMES_PER_VIDEO` with your specific details.
# CURRENT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CURRENT_ROOT=$(dirname "$0")

echo ${CURRENT_ROOT}

cd ${CURRENT_ROOT}

export PYTHONWARNINGS=ignore

START_TIME=$(date +%s)  # Capture start time

NUM_NODES=$1

CUR_NODES_IDX=$2

VIDEO_DIR=$3

MODEL_PATH=$4   

NUM_FRAMES=$5


# FRAME_FORMAT=$6

# FRAME_FORMAT=$(echo $FRAME_FORMAT | tr '[:lower:]' '[:upper:]')

# # Check if FRAME_FORMAT is either JPEG or PNG
# if [[ "$FRAME_FORMAT" != "JPEG" && "$FRAME_FORMAT" != "PNG" ]]; then
#     echo "Error: FRAME_FORMAT must be either JPEG or PNG."
#     exit 1
# fi

# export TARGET_FRAMES=$TARGET_FRAMES

echo "Each video you will sample $NUM_FRAMES frames"

# export FRAME_FORMAT=$FRAME_FORMAT

# echo "The frame format is $FRAME_FORMAT"

# Assuming GPULIST is a bash array containing your GPUs
GPULIST=(0)
LOCAL_CHUNKS=${#GPULIST[@]}

echo "Number of GPUs in GPULIST: $LOCAL_CHUNKS"

ALL_CHUNKS=$((NUM_NODES * LOCAL_CHUNKS))

# Calculate GPUs per chunk
GPUS_PER_CHUNK=1

echo $GPUS_PER_CHUNK

for IDX in $(seq 1 $LOCAL_CHUNKS); do
    (
        START=$(((IDX-1) * GPUS_PER_CHUNK))
        LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
        
        CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
        
        # Convert the chunk GPUs array to a comma-separated string
        CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")

        LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))

        echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
        
        # Pick a pseudo-random port for this chunk.
        PORT=$((10000 + RANDOM % 55536))

        MAX_RETRIES=10
        RETRY_COUNT=0
        COMMAND_STATUS=1  # Initialize as failed

        while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
            echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
            
            CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
            --port $PORT \
            --num-chunks $ALL_CHUNKS \
            --chunk-idx $(($LOCAL_IDX - 1)) \
            --save-dir /media/star/8T/tmp/llava_video \
            --video-dir $VIDEO_DIR \
            --model-path $MODEL_PATH \
            --num-frames $NUM_FRAMES #&
            
            wait $!  # Wait for the process to finish and capture its exit status
            COMMAND_STATUS=$?
            
            if [ $COMMAND_STATUS -ne 0 ]; then
                echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
                RETRY_COUNT=$(($RETRY_COUNT + 1))
                sleep 180  # Wait a bit before retrying
            else
                echo "Execution succeeded for chunk $(($LOCAL_IDX - 1))."
            fi
        done

        if [ $COMMAND_STATUS -ne 0 ]; then
            echo "Execution failed for chunk $(($LOCAL_IDX - 1)) after $MAX_RETRIES attempts."
        fi
    ) #&
    sleep 2  # Slight delay to stagger the start times
done

wait

cat /media/star/8T/tmp/llava_video/final_results_chunk_*.csv > /media/star/8T/tmp/llava_video/final_results_node_${CUR_NODES_IDX}.csv

END_TIME=$(date +%s)  # Capture end time
ELAPSED_TIME=$(($END_TIME - $START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds."

running the bash script

bash examples/usage/llava_video/srt_example_llava_v.sh 1 0 "/media/star/8T/tmp/gpt4v/video/tmp/1.mp4" "/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO" 32

terminal output

(sglang) star@star-SYS-7049GP-TRT:/media/star/8T/PycharmProjects/github/train_inference_accelerate/sglang$ bash examples/usage/llava_video/srt_example_llava_v.sh 1 0 "/media/star/8T/tmp/gpt4v/video/tmp/1.mp4" "/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO" 16
examples/usage/llava_video
Each video you will sample 16 frames
Number of GPUs in GPULIST: 1
1
Chunk 0 will run on GPUs 0
Running chunk 0 on GPUs 0 with port 30256. Attempt 1
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
target_frames: 16
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
chat template: vicuna_v1.1

========== single ==========

/media/star/8T/tmp/gpt4v/video/tmp/1.mp4
single path=/media/star/8T/tmp/gpt4v/video/tmp/1.mp4
single state=ProgramState(A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:)

Average processing time per video: 0.01 seconds
Execution succeeded for chunk 0.
Total execution time: 32 seconds.

Why is the inference result empty?

ZhangYuanhan-AI (Contributor) commented:

[screenshot]

Hi, thanks for your interest!

It works well on my side. What GPU do you use?

AmazDeng (Author) commented Jun 3, 2024

A100 80G, only one GPU card.
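
For completeness, a small sketch of reporting the visible GPU from Python (assuming PyTorch, which sglang depends on, is installed):

# Sketch: report the GPU this process can see (assumes PyTorch is installed).
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"{props.name}, {props.total_memory / 2**30:.0f} GiB, {torch.cuda.device_count()} visible device(s)")
else:
    print("CUDA is not available")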

AmazDeng (Author) commented Jun 3, 2024

I have also tested the llava-next-image demo code, and its output is correct, not empty. Which version of sglang do you use?
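
For reference, a quick way to report the installed sglang version (a sketch, assuming sglang was installed with pip):

# Sketch: print the installed sglang version.
from importlib.metadata import version
print(version("sglang"))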

AmazDeng commented Jun 4, 2024

Could you please take a look at this issue? @merrymercy
