Update README.md
Added vLLM code example

README.md CHANGED
@@ -118,6 +118,186 @@ output_text = tokenizer.batch_decode(
print(f"STT output = {output_text[0].upper()}")
```

### Usage with `vLLM`

First, make sure to install the latest version of vLLM:
```shell
pip install vllm --upgrade
```
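If you want to sanity-check the installation before running the examples below, a minimal check (purely illustrative, not part of the original example) is:
```python
# Illustrative check: confirm vLLM imports cleanly and print its version,
# since the examples below assume a recent release with LoRA and audio support.
import vllm

print(vllm.__version__)
```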
* Code for offline mode:
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest

model_id = "ibm-granite/granite-speech-3.3-8b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def get_prompt(question: str, has_audio: bool):
    """Build the input prompt to send to vLLM."""
    if has_audio:
        question = f"<|audio|>{question}"
    chat = [
        {
            "role": "user",
            "content": question
        }
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False)

# NOTE - you may see warnings about multimodal lora layers being ignored;
# this is okay as the lora in this model is only applied to the LLM.
model = LLM(
    model=model_id,
    enable_lora=True,
    max_lora_rank=64,
    max_model_len=2048,  # This may be needed for lower resource devices.
    limit_mm_per_prompt={"audio": 1},
)

### 1. Example with Audio [make sure to use the lora]
question = "can you transcribe the speech into a written format?"
prompt_with_audio = get_prompt(
    question=question,
    has_audio=True,
)
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

inputs = {
    "prompt": prompt_with_audio,
    "multi_modal_data": {
        "audio": audio,
    }
}

outputs = model.generate(
    inputs,
    sampling_params=SamplingParams(
        temperature=0.2,
        max_tokens=64,
    ),
    lora_request=[LoRARequest("speech", 1, model_id)]
)
print(f"Audio Example - Question: {question}")
print(f"Generated text: {outputs[0].outputs[0].text}")


### 2. Example without Audio [do NOT use the lora]
question = "What is the capital of Brazil?"
prompt = get_prompt(
    question=question,
    has_audio=False,
)

outputs = model.generate(
    {"prompt": prompt},
    sampling_params=SamplingParams(
        temperature=0.2,
        max_tokens=12,
    ),
)
print(f"Text Only Example - Question: {question}")
print(f"Generated text: {outputs[0].outputs[0].text}")
```
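The offline example above uses the bundled `AudioAsset`, which is just a convenient sample clip. If you want to transcribe your own recording instead, the sketch below shows one possible way to do it; it is an illustration rather than part of the model card, the path `my_audio.wav` is hypothetical, and it assumes `librosa` is installed. It reuses `model`, `get_prompt`, `SamplingParams`, and `LoRARequest` from the example above and passes the same `(audio, sample_rate)` tuple shape that `AudioAsset(...).audio_and_sample_rate` returns.
```python
# Hypothetical sketch: run the same offline transcription on a local file.
import librosa

# "my_audio.wav" is a placeholder path; load it as mono 16 kHz audio.
audio_array, sample_rate = librosa.load("my_audio.wav", sr=16000, mono=True)

outputs = model.generate(
    {
        "prompt": get_prompt(
            question="can you transcribe the speech into a written format?",
            has_audio=True,
        ),
        # Same (audio, sample_rate) tuple shape as the AudioAsset example above.
        "multi_modal_data": {"audio": (audio_array, sample_rate)},
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
    lora_request=[LoRARequest("speech", 1, model_id)],
)
print(outputs[0].outputs[0].text)
```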
* Code for online mode:
```python
"""
Launch the vLLM server with the following command:

vllm serve ibm-granite/granite-speech-3.3-8b \
    --api-key token-abc123 \
    --max-model-len 2048 \
    --enable-lora \
    --lora-modules speech=ibm-granite/granite-speech-3.3-8b \
    --max-lora-rank 64
"""

import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "token-abc123"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

base_model_name = "ibm-granite/granite-speech-3.3-8b"
lora_model_name = "speech"
# Any format supported by librosa is supported
audio_url = AudioAsset("mary_had_lamb").url

# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
    """Encode an audio retrieved from a remote url to base64 format."""
    with requests.get(audio_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')
    return result

audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)

### 1. Example with Audio
# NOTE: we pass the name of the lora model (`speech`) here because we have audio.
question = "can you transcribe the speech into a written format?"
chat_completion_with_audio = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question
            },
            {
                "type": "audio_url",
                "audio_url": {
                    # Any format supported by librosa is supported
                    "url": f"data:audio/ogg;base64,{audio_base64}"
                },
            },
        ],
    }],
    temperature=0.2,
    max_tokens=64,
    model=lora_model_name,
)

print(f"Audio Example - Question: {question}")
print(f"Generated text: {chat_completion_with_audio.choices[0].message.content}")


### 2. Example without Audio
# NOTE: we pass the name of the base model here because we do not have audio.
question = "What is the capital of Brazil?"
chat_completion_text_only = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question
            },
        ],
    }],
    temperature=0.2,
    max_tokens=12,
    model=base_model_name,
)

print(f"Text Only Example - Question: {question}")
print(f"Generated text: {chat_completion_text_only.choices[0].message.content}")
```
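As a quick way to verify the server started with the expected configuration, you can list the models it exposes through the same OpenAI client. This is a small illustrative addition rather than part of the original example; with the `vllm serve ... --lora-modules speech=...` command above, both the base model id and the `speech` LoRA name should typically appear in the output.
```python
# Illustrative check: list the model names served by the vLLM OpenAI-compatible API.
for served_model in client.models.list():
    print(served_model.id)
```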

**Model Architecture:**

The architecture of granite-speech-3.3-8b consists of the following components: