From 8e6e30ef89a46b7cc780c2c5398eef95a8780096 Mon Sep 17 00:00:00 2001
From: yuluoyun <1731396519@qq.com>
Date: Wed, 10 Jan 2024 12:14:44 +0800
Subject: [PATCH] add monkey.py

---
 vlmeval/config.py       |  1 +
 vlmeval/vlm/__init__.py |  1 +
 vlmeval/vlm/monkey.py   | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 vlmeval/vlm/monkey.py

diff --git a/vlmeval/config.py b/vlmeval/config.py
index e55ed056..b1430e5a 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -43,6 +43,7 @@
     'cogvlm-grounding-generalist':partial(CogVlm, name='cogvlm-grounding-generalist',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
     'cogvlm-chat':partial(CogVlm, name='cogvlm-chat',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
     'sharedcaptioner':partial(SharedCaptioner, model_path='Lin-Chen/ShareCaptioner'),
+    'monkey':partial(Monkey, model_path='echo840/Monkey'),
 }
 
 api_models = {
diff --git a/vlmeval/vlm/__init__.py b/vlmeval/vlm/__init__.py
index e5172a83..117743e8 100644
--- a/vlmeval/vlm/__init__.py
+++ b/vlmeval/vlm/__init__.py
@@ -15,3 +15,4 @@
 from .llava_xtuner import LLaVA_XTuner
 from .cogvlm import CogVlm
 from .sharedcaptioner import SharedCaptioner
+from .monkey import Monkey
diff --git a/vlmeval/vlm/monkey.py b/vlmeval/vlm/monkey.py
new file mode 100644
index 00000000..6dc6c71d
--- /dev/null
+++ b/vlmeval/vlm/monkey.py
@@ -0,0 +1,49 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import warnings
+
+
+class Monkey:
+
+    INSTALL_REQ = False
+
+    def __init__(self, model_path='echo840/Monkey', **kwargs):
+        assert model_path is not None
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, device_map='cuda', trust_remote_code=True).eval()
+        self.kwargs = kwargs
+        warnings.warn(f'Received the following kwargs: {self.kwargs}; they will override the default generation config.')
+        torch.cuda.empty_cache()
+
+    def generate(self, image_path, prompt, dataset=None):
+        # Monkey follows the Qwen-VL prompt convention: the image path must be
+        # wrapped in <img>...</img> tags to be treated as visual input.
+        cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
+        inputs = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
+        input_ids = inputs.input_ids
+        attention_mask = inputs.attention_mask
+
+        default_kwargs = dict(
+            do_sample=False,
+            num_beams=1,
+            max_new_tokens=512,
+            min_new_tokens=1,
+            length_penalty=3,
+            num_return_sequences=1,
+            use_cache=True,
+            pad_token_id=self.tokenizer.eod_id,
+            eos_token_id=self.tokenizer.eod_id,
+        )
+        # kwargs passed at construction time override the defaults above.
+        default_kwargs.update(self.kwargs)
+        output_ids = self.model.generate(
+            input_ids=input_ids.cuda(),
+            attention_mask=attention_mask.cuda(),
+            **default_kwargs,
+        )
+        # Decode only the newly generated tokens, i.e. everything after the prompt.
+        response = self.tokenizer.decode(
+            output_ids[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
+        return response
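
A minimal usage sketch of the new registry entry, assuming the dict patched in vlmeval/config.py is the supported_VLM mapping that VLMEvalKit exposes from that module (the image path and question below are hypothetical placeholders, and a CUDA device is required since the model is loaded with device_map='cuda'):

    from vlmeval.config import supported_VLM

    # 'monkey' resolves to functools.partial(Monkey, model_path='echo840/Monkey'),
    # so calling it instantiates the wrapper and fetches the checkpoint.
    model = supported_VLM['monkey']()
    # 'demo.jpg' and the question are placeholders, not files shipped with the repo.
    response = model.generate('demo.jpg', 'What is shown in this image?')
    print(response)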
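
The slicing in Monkey.generate (output_ids[0][input_ids.size(1):]) is the standard Hugging Face pattern for decoding only the newly generated tokens, since generate returns the prompt followed by the continuation. A standalone sketch of the same pattern, using gpt2 purely as a small stand-in so it runs without the Monkey checkpoint:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2')

    inputs = tok('The capital of France is', return_tensors='pt')
    out = model.generate(**inputs, max_new_tokens=8, do_sample=False,
                         pad_token_id=tok.eos_token_id)
    # generate returns prompt + continuation; slice off the prompt tokens
    # so only the continuation is decoded.
    new_tokens = out[0][inputs.input_ids.size(1):]
    print(tok.decode(new_tokens, skip_special_tokens=True))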