Update README.md
Added vLLM code example

README.md CHANGED
@@ -118,6 +118,186 @@ output_text = tokenizer.batch_decode(
print(f"STT output = {output_text[0].upper()}")
```

### Usage with `vLLM`

First, make sure to install the latest version of vLLM:
```shell
pip install vllm --upgrade
```
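If you want to sanity-check the installation before running the examples below, a minimal check (purely illustrative, not part of the original example) is:
```python
# Illustrative check: confirm vLLM imports cleanly and print its version,
# since the examples below assume a recent release with LoRA and audio support.
import vllm

print(vllm.__version__)
```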
* Code for offline mode:
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest

model_id = "ibm-granite/granite-speech-3.3-8b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def get_prompt(question: str, has_audio: bool):
    """Build the input prompt to send to vLLM."""
    if has_audio:
        question = f"<|audio|>{question}"
    chat = [
        {
            "role": "user",
            "content": question
        }
    ]
    return tokenizer.apply_chat_template(chat, tokenize=False)

# NOTE - you may see warnings about multimodal lora layers being ignored;
# this is okay as the lora in this model is only applied to the LLM.
model = LLM(
    model=model_id,
    enable_lora=True,
    max_lora_rank=64,
    max_model_len=2048,  # This may be needed for lower resource devices.
    limit_mm_per_prompt={"audio": 1},
)

### 1. Example with Audio [make sure to use the lora]
question = "can you transcribe the speech into a written format?"
prompt_with_audio = get_prompt(
    question=question,
    has_audio=True,
)
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

inputs = {
    "prompt": prompt_with_audio,
    "multi_modal_data": {
        "audio": audio,
    }
}

outputs = model.generate(
    inputs,
    sampling_params=SamplingParams(
        temperature=0.2,
        max_tokens=64,
    ),
    lora_request=[LoRARequest("speech", 1, model_id)]
)
print(f"Audio Example - Question: {question}")
print(f"Generated text: {outputs[0].outputs[0].text}")


### 2. Example without Audio [do NOT use the lora]
question = "What is the capital of Brazil?"
prompt = get_prompt(
    question=question,
    has_audio=False,
)

outputs = model.generate(
    {"prompt": prompt},
    sampling_params=SamplingParams(
        temperature=0.2,
        max_tokens=12,
    ),
)
print(f"Text Only Example - Question: {question}")
print(f"Generated text: {outputs[0].outputs[0].text}")
```
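The offline example above uses the bundled `AudioAsset`, which is just a convenient sample clip. If you want to transcribe your own recording instead, the sketch below shows one possible way to do it; it is an illustration rather than part of the model card, the path `my_audio.wav` is hypothetical, and it assumes `librosa` is installed. It reuses `model`, `get_prompt`, `SamplingParams`, and `LoRARequest` from the example above and passes the same `(audio, sample_rate)` tuple shape that `AudioAsset(...).audio_and_sample_rate` returns.
```python
# Hypothetical sketch: run the same offline transcription on a local file.
import librosa

# "my_audio.wav" is a placeholder path; load it as mono 16 kHz audio.
audio_array, sample_rate = librosa.load("my_audio.wav", sr=16000, mono=True)

outputs = model.generate(
    {
        "prompt": get_prompt(
            question="can you transcribe the speech into a written format?",
            has_audio=True,
        ),
        # Same (audio, sample_rate) tuple shape as the AudioAsset example above.
        "multi_modal_data": {"audio": (audio_array, sample_rate)},
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
    lora_request=[LoRARequest("speech", 1, model_id)],
)
print(outputs[0].outputs[0].text)
```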
* Code for online mode:
```python
"""
Launch the vLLM server with the following command:

vllm serve ibm-granite/granite-speech-3.3-8b \
    --api-key token-abc123 \
    --max-model-len 2048 \
    --enable-lora \
    --lora-modules speech=ibm-granite/granite-speech-3.3-8b \
    --max-lora-rank 64
"""

import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "token-abc123"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

base_model_name = "ibm-granite/granite-speech-3.3-8b"
lora_model_name = "speech"
# Any format supported by librosa is supported
audio_url = AudioAsset("mary_had_lamb").url

# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
    """Encode an audio retrieved from a remote url to base64 format."""
    with requests.get(audio_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')
    return result

audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)

### 1. Example with Audio
# NOTE: we pass the name of the lora model (`speech`) here because we have audio.
question = "can you transcribe the speech into a written format?"
chat_completion_with_audio = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question
            },
            {
                "type": "audio_url",
                "audio_url": {
                    # Any format supported by librosa is supported
                    "url": f"data:audio/ogg;base64,{audio_base64}"
                },
            },
        ],
    }],
    temperature=0.2,
    max_tokens=64,
    model=lora_model_name,
)

print(f"Audio Example - Question: {question}")
print(f"Generated text: {chat_completion_with_audio.choices[0].message.content}")


### 2. Example without Audio
# NOTE: we pass the name of the base model here because we do not have audio.
question = "What is the capital of Brazil?"
chat_completion_text_only = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question
            },
        ],
    }],
    temperature=0.2,
    max_tokens=12,
    model=base_model_name,
)

print(f"Text Only Example - Question: {question}")
print(f"Generated text: {chat_completion_text_only.choices[0].message.content}")
```
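As a quick way to verify the server started with the expected configuration, you can list the models it exposes through the same OpenAI client. This is a small illustrative addition rather than part of the original example; with the `vllm serve ... --lora-modules speech=...` command above, both the base model id and the `speech` LoRA name should typically appear in the output.
```python
# Illustrative check: list the model names served by the vLLM OpenAI-compatible API.
for served_model in client.models.list():
    print(served_model.id)
```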

**Model Architecture:**

The architecture of granite-speech-3.3-8b consists of the following components: