gsaon committed
Commit 2cc0aac · verified · 1 Parent(s): df28cad

Update README.md


Added vLLM code example

Files changed (1)
  1. README.md +180 -0
README.md CHANGED
@@ -118,6 +118,186 @@ output_text = tokenizer.batch_decode(
  print(f"STT output = {output_text[0].upper()}")
  ```
 
+ ### Usage with `vLLM`
+
+ First, make sure to install the latest version of vLLM:
+ ```shell
+ pip install vllm --upgrade
+ ```
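+
+ As a quick sanity check that the upgrade took effect, you can print the installed version; granite-speech audio support assumes a reasonably recent vLLM build:
+ ```python
+ import vllm
+
+ # Confirm the installed build; granite-speech audio support assumes
+ # a reasonably recent vLLM release.
+ print(vllm.__version__)
+ ```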
+ * Code for offline mode:
+ ```python
+ from transformers import AutoTokenizer
+ from vllm import LLM, SamplingParams
+ from vllm.assets.audio import AudioAsset
+ from vllm.lora.request import LoRARequest
+
+ model_id = "ibm-granite/granite-speech-3.3-8b"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ def get_prompt(question: str, has_audio: bool):
+     """Build the input prompt to send to vLLM."""
+     if has_audio:
+         question = f"<|audio|>{question}"
+     chat = [
+         {
+             "role": "user",
+             "content": question
+         }
+     ]
+     return tokenizer.apply_chat_template(chat, tokenize=False)
+
+ # NOTE: you may see warnings about multimodal LoRA layers being ignored;
+ # this is okay because the LoRA in this model is only applied to the LLM.
+ model = LLM(
+     model=model_id,
+     enable_lora=True,
+     max_lora_rank=64,
+     max_model_len=2048,  # This may be needed for lower-resource devices.
+     limit_mm_per_prompt={"audio": 1},
+ )
+
+ ### 1. Example with Audio [make sure to use the LoRA]
+ question = "can you transcribe the speech into a written format?"
+ prompt_with_audio = get_prompt(
+     question=question,
+     has_audio=True,
+ )
+ audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
+
+ inputs = {
+     "prompt": prompt_with_audio,
+     "multi_modal_data": {
+         "audio": audio,
+     }
+ }
+
+ outputs = model.generate(
+     inputs,
+     sampling_params=SamplingParams(
+         temperature=0.2,
+         max_tokens=64,
+     ),
+     lora_request=[LoRARequest("speech", 1, model_id)]
+ )
+ print(f"Audio Example - Question: {question}")
+ print(f"Generated text: {outputs[0].outputs[0].text}")
+
+ ### 2. Example without Audio [do NOT use the LoRA]
+ question = "What is the capital of Brazil?"
+ prompt = get_prompt(
+     question=question,
+     has_audio=False,
+ )
+
+ outputs = model.generate(
+     {"prompt": prompt},
+     sampling_params=SamplingParams(
+         temperature=0.2,
+         max_tokens=12,
+     ),
+ )
+ print(f"Text Only Example - Question: {question}")
+ print(f"Generated text: {outputs[0].outputs[0].text}")
+ ```
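+
+ The `AudioAsset` above is just a bundled demo clip; `multi_modal_data` takes any `(waveform, sample_rate)` tuple. Below is a minimal sketch, continuing from the offline example, for transcribing your own recording. The file name `sample.wav`, the use of `librosa`, and the 16 kHz mono target are illustrative assumptions, not part of the model card:
+ ```python
+ import librosa
+
+ # Load a local file as a (waveform, sample_rate) tuple; 16 kHz mono is
+ # assumed here as the speech encoder's expected input.
+ wav, sr = librosa.load("sample.wav", sr=16000, mono=True)
+
+ outputs = model.generate(
+     {
+         "prompt": get_prompt(
+             question="can you transcribe the speech into a written format?",
+             has_audio=True,
+         ),
+         "multi_modal_data": {"audio": (wav, sr)},
+     },
+     sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
+     lora_request=[LoRARequest("speech", 1, model_id)],
+ )
+ print(outputs[0].outputs[0].text)
+ ```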
+
+ * Code for online mode:
+ ```python
+ """
+ Launch the vLLM server with the following command:
+
+ vllm serve ibm-granite/granite-speech-3.3-8b \
+     --api-key token-abc123 \
+     --max-model-len 2048 \
+     --enable-lora \
+     --lora-modules speech=ibm-granite/granite-speech-3.3-8b \
+     --max-lora-rank 64
+ """
+
+ import base64
+
+ import requests
+ from openai import OpenAI
+
+ from vllm.assets.audio import AudioAsset
+
+ # Modify OpenAI's API key and API base to use vLLM's API server.
+ openai_api_key = "token-abc123"
+ openai_api_base = "http://localhost:8000/v1"
+
+ client = OpenAI(
+     # defaults to os.environ.get("OPENAI_API_KEY")
+     api_key=openai_api_key,
+     base_url=openai_api_base,
+ )
+
+ base_model_name = "ibm-granite/granite-speech-3.3-8b"
+ lora_model_name = "speech"
+ # Any audio format supported by librosa is supported.
+ audio_url = AudioAsset("mary_had_lamb").url
+
+ # Use base64-encoded audio in the payload.
+ def encode_audio_base64_from_url(audio_url: str) -> str:
+     """Encode audio retrieved from a remote URL to base64 format."""
+     with requests.get(audio_url) as response:
+         response.raise_for_status()
+         result = base64.b64encode(response.content).decode('utf-8')
+     return result
+
+ audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
+
+ ### 1. Example with Audio
+ # NOTE: we pass the name of the LoRA model (`speech`) here because we have audio.
+ question = "can you transcribe the speech into a written format?"
+ chat_completion_with_audio = client.chat.completions.create(
+     messages=[{
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": question
+             },
+             {
+                 "type": "audio_url",
+                 "audio_url": {
+                     # Any audio format supported by librosa is supported.
+                     "url": f"data:audio/ogg;base64,{audio_base64}"
+                 },
+             },
+         ],
+     }],
+     temperature=0.2,
+     max_tokens=64,
+     model=lora_model_name,
+ )
+
+ print(f"Audio Example - Question: {question}")
+ print(f"Generated text: {chat_completion_with_audio.choices[0].message.content}")
+
+ ### 2. Example without Audio
+ # NOTE: we pass the name of the base model here because we do not have audio.
+ question = "What is the capital of Brazil?"
+ chat_completion_text_only = client.chat.completions.create(
+     messages=[{
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": question
+             },
+         ],
+     }],
+     temperature=0.2,
+     max_tokens=12,
+     model=base_model_name,
+ )
+
+ print(f"Text Only Example - Question: {question}")
+ print(f"Generated text: {chat_completion_text_only.choices[0].message.content}")
+ ```
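+
+ A local recording can be sent the same way; only the source of the base64 payload changes. A short sketch, continuing from the example above, with a hypothetical `sample.wav` (adjust the `data:` MIME type to match your file format):
+ ```python
+ def encode_audio_base64_from_file(path: str) -> str:
+     """Encode a local audio file to base64 format."""
+     with open(path, "rb") as f:
+         return base64.b64encode(f.read()).decode('utf-8')
+
+ # Swap this data URL into the "audio_url" entry of the request above,
+ # keeping model=lora_model_name since the prompt includes audio.
+ local_audio_base64 = encode_audio_base64_from_file("sample.wav")
+ audio_data_url = f"data:audio/wav;base64,{local_audio_base64}"
+ ```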
+
  **Model Architecture:**
 
  The architecture of granite-speech-3.3-8b consists of the following components: