import base64 import json import os import time from copy import deepcopy from io import BytesIO from typing import Any, List, Tuple import librosa import numpy as np import soundfile as sf from accelerate import Accelerator, DistributedType from openai import AzureOpenAI, OpenAI from tqdm import tqdm from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model try: from scipy import signal scipy_available = True except ImportError: scipy_available = False from loguru import logger as eval_logger # File: lmms_eval/models/simple/gpt4o_audio.py API_TYPE = os.getenv("API_TYPE", "openai") NUM_SECONDS_TO_SLEEP = 10 if API_TYPE == "openai": API_URL = os.getenv( "OPENAI_API_URL", "https://api.openai.com/v1/chat/completions", ) API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") elif API_TYPE == "azure": API_URL = os.getenv( "AZURE_ENDPOINT", "https://your-resource-name.openai.azure.com", ) API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") API_VERSION = os.getenv("AZURE_API_VERSION", "2024-08-01-preview") else: raise ValueError(f"Unsupported API_TYPE '{API_TYPE}'. Expected 'openai' or 'azure'.") @register_model("gpt4o_audio") class GPT4OAudio(lmms): def __init__( self, model_version: str = "gpt-4o-audio-preview", modality: str = "audio", timeout: int = 120, continual_mode: bool = False, response_persistent_folder: str = None, audio_voice: str = "alloy", audio_format: str = "wav", **kwargs, ) -> None: super().__init__() if librosa is None or sf is None: raise ImportError("librosa and soundfile are required for GPT-4o audio. Please install with: pip install librosa soundfile") self.model_version = model_version self.modality = modality self.audio_token = "