Using the open-source BLIP model for image content recognition
Q: What is BLIP?
A: BLIP is a pre-trained image-to-text model for unified vision-language understanding and generation.
Put simply, it can recognize the content of an image and describe it in natural language.
Example results:
As you can see, the results are quite good, with a hint of gpt-4-vision-preview quality.
Here is how to use the model.
Install dependencies
pip install transformers[torch] pillow requests
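Once the dependencies are installed, a quick way to sanity-check the environment is the transformers image-to-text pipeline. The snippet below is a minimal sketch: it pulls the checkpoint straight from the Hub and captions a local file (the 123.jpg filename is just an example, matching the image used later in the CPU script).

from transformers import pipeline

# Build an image-to-text pipeline; the checkpoint is downloaded from the Hub on first use
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Caption a local image; the result is a list of dicts with a "generated_text" key
print(captioner("123.jpg")[0]["generated_text"])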
Download the model
git clone https://huggingface.co/Salesforce/blip-image-captioning-large
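If git or git-lfs is not available, the same checkpoint can be fetched with the huggingface_hub library instead; the sketch below assumes the model is stored under /data/models so that it matches the local path used in the CPU script further down.

from huggingface_hub import snapshot_download

# Download the full checkpoint into a local directory (the target path is just an example)
snapshot_download(
    repo_id="Salesforce/blip-image-captioning-large",
    local_dir="/data/models/blip-image-captioning-large",
)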
Inference code
CPU inference code:
import time
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

start_time = time.time()

# Load processor and model from a local directory (no network access needed)
processor = BlipProcessor.from_pretrained(
    "/data/models/blip-image-captioning-large", local_files_only=True)
model = BlipForConditionalGeneration.from_pretrained(
    "/data/models/blip-image-captioning-large", local_files_only=True)

raw_image = Image.open('123.jpg').convert('RGB')

# Conditional image captioning: the caption continues the given prompt
text = "This picture show"
inputs = processor(raw_image, text, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(out[0], skip_special_tokens=True))

# # Unconditional image captioning: no prompt, the model describes the image freely
# inputs = processor(raw_image, return_tensors="pt")
# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True))

spend_time = time.time() - start_time
print(f'spend time: {spend_time}')
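Building on the script above, a common next step is captioning a whole folder of images with the already-loaded processor and model. The sketch below is a hypothetical helper (the caption_image name and the /data/images path are assumptions, not part of the original example).

import os
from PIL import Image

def caption_image(image_path, processor, model):
    # Unconditional captioning of a single image file
    image = Image.open(image_path).convert('RGB')
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=50)
    return processor.decode(out[0], skip_special_tokens=True)

# Caption every jpg/png in a directory (the path is just an example)
image_dir = '/data/images'
for name in sorted(os.listdir(image_dir)):
    if name.lower().endswith(('.jpg', '.jpeg', '.png')):
        print(name, '->', caption_image(os.path.join(image_dir, name), processor, model))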
GPU inference code:
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
    # Half-precision inference: uncomment the next line (and cast the inputs
    # to torch.float16 below) to roughly halve GPU memory usage
    # , torch_dtype=torch.float16
).to("cuda")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# Conditional image captioning: the caption continues the given prompt
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt").to("cuda")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# Unconditional image captioning
inputs = processor(raw_image, return_tensors="pt").to("cuda")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a woman sitting on the beach with her dog
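Note that half precision has to be applied consistently: if the model is loaded with torch_dtype=torch.float16, the processed inputs must be cast to float16 as well, otherwise generate fails with a dtype mismatch. A minimal sketch of the consistent fp16 setup, reusing the processor and image from above:

# Consistent half-precision setup: model weights and floating-point inputs both in float16
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16
).to("cuda")

inputs = processor(raw_image, "a photography of", return_tensors="pt").to("cuda", torch.float16)
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))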
A more convenient inference wrapper
from typing import Dict, Any
from io import BytesIO

import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class EndpointHandler():
    def __init__(self, path=""):
        # Load the processor and model once at start-up
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:):
                includes the input data (a list of image bytes) and the parameters for the inference.
        Return:
            A :obj:`dict` with a single list, e.g. {"captions": ["A hugging face at the office"]},
            where "captions" holds the generated caption for each input image.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

        # Decode the raw image bytes and batch them through the processor
        raw_images = [Image.open(BytesIO(_img)) for _img in inputs]
        processed_image = self.processor(images=raw_images, return_tensors="pt")
        processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
        processed_image = {**processed_image, **parameters}

        with torch.no_grad():
            out = self.model.generate(**processed_image)

        # Postprocess the predictions into plain strings
        captions = self.processor.batch_decode(out, skip_special_tokens=True)
        return {"captions": captions}
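The handler can be exercised locally before deploying it as an inference endpoint. The snippet below is a hypothetical usage sketch (the test.jpg filename is an assumption): it reads an image from disk as raw bytes and passes it to the handler together with optional generation parameters.

# Hypothetical local test of the handler above (test.jpg is an example filename)
handler = EndpointHandler()

with open("test.jpg", "rb") as f:
    image_bytes = f.read()

result = handler({"inputs": [image_bytes], "parameters": {"max_new_tokens": 50}})
print(result)  # e.g. {"captions": ["a dog playing on the beach"]}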