一、api client--> text infer
这个在线推理的客户端实现代码。服务端就是启动vllm的模型,使用vllm server *** 的方式启动一个模型。
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
For production use, we recommend `vllm serve` and the OpenAI client API.
"""
import argparse
import json
from argparse import Namespace
from collections.abc import Iterable
import requests
def clear_line(n: int = 1) -> None:
LINE_UP = "\033[1A"
LINE_CLEAR = "\x1b[2K"
for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True)
def post_http_request(
prompt: str, api_url: str, n: int = 1, stream: bool = False
) -> requests.Response:
headers = {"User-Agent": "Test Client"}
pload = {
"prompt": prompt,
"n": n,
"temperature": 0.0,
"max_tokens": 16,
"stream": stream,
}
response = requests.post(api_url, headers=headers, json=pload, stream=stream)
return response
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"\n"
):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"]
yield output
def get_response(response: requests.Response) -> list[str]:
data = json.loads(response.content)
output = data["text"]
return output
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--n", type=int, default=1)
parser.add_argument("--prompt", type=str, default="San Francisco is a")
parser.add_argument("--stream", action="store_true")
return parser.parse_args()
def main(args: Namespace):
prompt = args.prompt
api_url = f"http://{args.host}:{args.port}/generate"
n = args.n
stream = args.stream
print(f"Prompt: {prompt!r}\n", flush=True)
response = post_http_request(prompt, api_url, n, stream)
if stream:
num_printed_lines = 0
for h in get_streaming_response(response):
clear_line(num_printed_lines)
num_printed_lines = 0
for i, line in enumerate(h):
num_printed_lines += 1
print(f"Beam candidate {i}: {line!r}", flush=True)
else:
output = get_response(response)
for i, line in enumerate(output):
print(f"Beam candidate {i}: {line!r}", flush=True)
if __name__ == "__main__":
args = parse_args()
main(args)
二、使用openai的接口方式-->text infer
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
import argparse
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020.",
},
{"role": "user", "content": "Where was it played?"},
]
def parse_args():
parser = argparse.ArgumentParser(description="Client for vLLM API server")
parser.add_argument(
"--stream", action="store_true", help="Enable streaming response"
)
return parser.parse_args()
def main(args):
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Chat Completion API
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
stream=args.stream,
)
print("-" * 50)
print("Chat completion results:")
if args.stream:
for c in chat_completion:
print(c)
else:
print(chat_completion)
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
三、使用多模态数据进行online infer
这个的多模态数据可以使用文本,图片,视频,语音输入。
我们的兼容 OpenAI 的服务器通过聊天完成 API 接收多模态数据。媒体输入还支持用户可提供的可选 UUID,用于唯一标识每个媒体,这可用于跨请求缓存媒体结果。
重要提示
使用聊天完成 API 需要聊天模板。对于 HF 格式的模型,默认聊天模板定义在 chat_template.json 或 tokenizer_config.json 中。
如果没有可用的默认聊天模板,我们会首先在 vllm/transformers_utils/chat_templates/registry.py 中查找内置的备用模板。如果没有备用模板,将会报错,并且您必须通过 --chat-template 参数手动提供聊天模板。
对于某些模型,我们在示例中提供了备用聊天模板。例如,VLM2Vec 使用 examples/template_vlm2vec_phi3v.jinja,它与 Phi-3-Vision 的默认模板不同。
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
import base64
import requests
from openai import OpenAI
from utils import get_first_model
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
headers = {"User-Agent": "vLLM Example Client"}
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url, headers=headers) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return result
# Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None:
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": "What's the capital of France?"}],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:\n", result)
# Single-image input inference
def run_single_image(model: str, max_completion_tokens: int) -> None:
## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:\n", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None:
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{
"type": "image_url",
"image_url": {"url": image_url_duck},
},
{
"type": "image_url",
"image_url": {"url": image_url_lion},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:\n", result)
# Video input inference
def run_video(model: str, max_completion_tokens: int) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:\n", result)
## Use base64 encoded video in the payload
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this video?"},
{
"type": "video_url",
"video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded video:\n", result)
# Audio input inference
def run_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "input_audio",
"input_audio": {
# Any format supported by librosa is supported
"data": audio_base64,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
# HTTP URL
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": audio_url
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:\n", result)
# base64 URL
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this audio?"},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:\n", result)
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
from vllm.assets.audio import AudioAsset
# Two different audios to showcase batched inference.
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
audio_url2 = AudioAsset("azacinto_foscolo").url
audio_base64_2 = encode_base64_content_from_url(audio_url2)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Are these two audios the same?"},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav",
},
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64_2,
"format": "wav",
},
},
],
}
],
model=model,
max_completion_tokens=max_completion_tokens,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:\n", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"multi-audio": run_multi_audio,
"video": run_video,
"audio": run_audio,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using OpenAI client for online serving with "
"multimodal language models served with vLLM."
)
parser.add_argument(
"--chat-type",
"-c",
type=str,
default="single-image",
choices=list(example_function_map.keys()),
help="Conversation type with multimodal data.",
)
parser.add_argument(
"--max-completion-tokens",
"-n",
type=int,
default=128,
help="Maximum number of tokens to generate for each completion.",
)
return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
model = get_first_model(client)
example_function_map[chat_type](model, args.max_completion_tokens)
if __name__ == "__main__":
args = parse_args()
main(args)
四、传入embedding的Tensor数据进行推理
直接传入embedding tensor 进行推理。两种方式接口和使用方式的细节差异。
You must enable this feature via the --enable-mm-embeds flag in vllm serve。
client.completions 接口:
# NOTE: The OpenAI client allows passing in extra JSON body via the # `extra_body` argument. extra_body={ "prompt_embeds": encoded_embeds},
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings
This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API
Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--runner generate \
--max-model-len 4096 \
--enable-prompt-embeds
Run the client:
python examples/online_serving/prompt_embed_inference_with_openai_client.py
Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
Dependencies:
- transformers
- torch
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
def main():
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds},
)
print("-" * 30)
print(completion.choices[0].text)
print("-" * 30)
if __name__ == "__main__":
main()
client.chat.completions 接口:
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO()
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf"
embeds = {
"type": "image_embeds",
"image_embeds": f"{base64_image_embedding}",
"uuid": image_url, # Optional
}
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}", # Required
"image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
},
"uuid": image_url, # Optional
}
model = "openbmb/MiniCPM-V-2_6"
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}", # Required
"image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
},
"uuid": image_url, # Optional
}
chat_completion = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this image?",
},
embeds,
],
},
],
model=model,
)
Prompt Embedding Inputs - vLLMhttps://docs.vllm.ai/en/latest/features/prompt_embeds/#transformers-inputs-via-openai-clientMultimodal Inputs - vLLM
https://docs.vllm.ai/en/latest/features/multimodal_inputs/
五、将prompt 编码成tensor 传入
Our OpenAI-compatible server accepts prompt embeddings inputs via the Completions API. Prompt embeddings inputs are added via a new 'prompt_embeds' key in the JSON package and are enabled by the --enable-prompt-embeds flag in vllm serve.
服务端启动:
vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \
--max-model-len 4096 --enable-prompt-embeds
客户端代码:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings
This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API
Run the vLLM server first:
vllm serve meta-llama/Llama-3.2-1B-Instruct \
--runner generate \
--max-model-len 4096 \
--enable-prompt-embeds
Run the client:
python examples/online_serving/prompt_embed_inference_with_openai_client.py
Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
Dependencies:
- transformers
- torch
- openai
"""
import base64
import io
import torch
import transformers
from openai import OpenAI
def main():
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)
# Prompt embeddings
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
completion = client.completions.create(
model=model_name,
# NOTE: The OpenAI client does not allow `None` as an input to
# `prompt`. Use an empty string if you have no text prompts.
prompt="",
max_tokens=5,
temperature=0.0,
# NOTE: The OpenAI client allows passing in extra JSON body via the
# `extra_body` argument.
extra_body={"prompt_embeds": encoded_embeds},
)
print("-" * 30)
print(completion.choices[0].text)
print("-" * 30)
if __name__ == "__main__":
main()