Using the Open-Source BLIP Model for Image Content Recognition

2024-3-19 · devcxl

Q: What is BLIP?

A: A pre-trained model that turns images into natural language, enabling unified vision-language understanding and generation.

Put simply, it recognizes what is in an image and describes it in natural language.

Example output:

[screenshot: caption generated by BLIP for a test image]

As you can see, the results are pretty good, with a hint of gpt-4-vision-preview about them.

Here is how to use the model.

Install dependencies

pip install transformers[torch] pillow
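
To sanity-check that the dependencies installed correctly (optional), you can print their versions:

python -c "import transformers, PIL; print(transformers.__version__, PIL.__version__)"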

Download the model

git clone https://huggingface.co/Salesforce/blip-image-captioning-large
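
Note that Hugging Face stores the model weights with Git LFS, so make sure git-lfs is set up before cloning; otherwise the clone only contains small pointer files instead of the actual weights:

# e.g. on Debian/Ubuntu; use your platform's package manager otherwise
sudo apt install git-lfs
git lfs install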

Inference code

CPU inference:

import time
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

start_time = time.time()
# load the processor and model from a local directory
processor = BlipProcessor.from_pretrained(
    "/data/models/blip-image-captioning-large", local_files_only=True)
model = BlipForConditionalGeneration.from_pretrained(
    "/data/models/blip-image-captioning-large", local_files_only=True)

raw_image = Image.open('123.jpg').convert('RGB')

# conditional image captioning
text = "This picture show"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))

# # unconditional image captioning
# inputs = processor(raw_image, return_tensors="pt")

# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True))

spend_time = time.time() - start_time
print(f'spend time: {spend_time}')

GPU inference:

import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
    # for half-precision inference, also pass: torch_dtype=torch.float16
).to("cuda")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# conditional image captioning
text = "a photography of"
# if the model was loaded with torch_dtype=torch.float16, also cast the
# inputs with .to("cuda", torch.float16) so the dtypes match
inputs = processor(raw_image, text, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a woman sitting on the beach with her dog
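
If GPU memory is tight, the half-precision variant hinted at in the comments above needs the dtype applied consistently to both the model and the inputs. A minimal sketch:

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# load the weights directly in float16 to roughly halve GPU memory use
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16
).to("cuda")

raw_image = Image.open('123.jpg').convert('RGB')
# cast the pixel values to the same dtype as the model
inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)

out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))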

A more convenient way to run inference

from typing import Dict, Any
from io import BytesIO

import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EndpointHandler:
    def __init__(self, path=""):
        # load the processor and model once at startup
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict` containing one list, e.g. {"captions": ["A hugging face at the office"]}:
                - "captions": the generated caption strings, one per input image.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

        # "inputs" is expected to be a list of raw image bytes
        raw_images = [Image.open(BytesIO(_img)) for _img in inputs]
                                     
        processed_image = self.processor(images=raw_images, return_tensors="pt")
        processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
        # forward any generation parameters (e.g. max_new_tokens) to generate()
        processed_image = {**processed_image, **parameters}
        
        with torch.no_grad():
            out = self.model.generate(
                **processed_image
            )
        captions = self.processor.batch_decode(out, skip_special_tokens=True)
        # postprocess the prediction
        return {"captions": captions}

Training with custom data
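
The reference documentation below covers this properly. As a rough sketch of the core idea (the data here is a hypothetical placeholder, and a real run would use a DataLoader, batching, and multiple epochs), BLIP can be fine-tuned by passing the caption's own token ids as labels:

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()

# illustrative custom data: a list of (image path, caption) pairs
pairs = [("123.jpg", "a cat sitting on a windowsill")]

for path, caption in pairs:
    image = Image.open(path).convert('RGB')
    inputs = processor(images=image, text=caption, return_tensors="pt").to(device)
    # using the caption's own token ids as labels trains the caption decoder
    outputs = model(input_ids=inputs["input_ids"],
                    pixel_values=inputs["pixel_values"],
                    labels=inputs["input_ids"])
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()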

Reference documentation