|
--- |
|
library_name: transformers |
|
pipeline_tag: any-to-any |
|
inference: true |
|
widget: |
|
- text: Hello! |
|
example_title: Hello world |
|
group: Python |
|
--- |
|
|
|
This tiny model is for debugging only. It is randomly initialized, with its config adapted from [Qwen/Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B).
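
Since the weights are random, any text or audio it generates is meaningless; the model is only useful for quickly smoke-testing code paths. As a sanity check of the shrunken dimensions, you can inspect the config directly (a minimal sketch, assuming your installed `transformers` includes Qwen2.5-Omni support; the expected values come from the creation script further below):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("tiny-random/qwen2.5-omni")
print(config.thinker_config.text_config.hidden_size)        # expected: 16
print(config.thinker_config.text_config.num_hidden_layers)  # expected: 1
print(config.talker_config.num_hidden_layers)               # expected: 1
```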
|
|
|
### Example usage: |
|
|
|
```python |
|
import soundfile as sf |
|
from qwen_omni_utils import process_mm_info |
|
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor |
|
|
|
model_id = "tiny-random/qwen2.5-omni" |
|
# model = Qwen2_5OmniModel.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval() |
|
# We recommend enabling flash_attention_2 for better acceleration and memory savings (requires the flash-attn package).
|
model = Qwen2_5OmniModel.from_pretrained( |
|
model_id, |
|
torch_dtype="auto", |
|
device_map="auto", |
|
attn_implementation="flash_attention_2", |
|
).eval() |
|
processor = Qwen2_5OmniProcessor.from_pretrained(model_id) |
|
|
|
conversation = [ |
|
{ |
|
"role": "system", |
|
"content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", |
|
}, |
|
{ |
|
"role": "user", |
|
"content": [ |
|
{"type": "text", "text": "Hi, can you tell me a joke?"}, |
|
{"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"}, |
|
{"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"}, |
|
{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, |
|
], |
|
}, |
|
] |
|
|
|
# Preparation for inference |
|
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) |
|
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True) |
|
print('Audios:', audios) |
|
print('Images:', images) |
|
print('Videos:', videos) |
|
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True) |
|
inputs = inputs.to(model.device).to(model.dtype) |
|
|
|
# Inference: Generation of the output text and audio |
|
text_ids, audio = model.generate( |
|
**inputs, use_audio_in_video=True, |
|
thinker_max_new_tokens=16, talker_max_new_tokens=16, |
|
) |
|
|
|
text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) |
|
print(text, '\n' * 3) |
|
sf.write( |
|
"/tmp/output.wav", |
|
audio.reshape(-1).detach().cpu().numpy(), |
|
samplerate=24000, |
|
) |
|
``` |
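
The example above exercises the full thinker → talker → token2wav path. To smoke-test just the text path, the upstream Qwen2.5-Omni generation API also exposes a `return_audio` switch; a minimal sketch continuing from the code above, assuming that flag is available in your installed `transformers` version:

```python
# Text-only smoke test: skip talker/token2wav audio generation.
# (`return_audio=False` is assumed to be supported by the installed version.)
text_ids = model.generate(
    **inputs,
    use_audio_in_video=True,
    return_audio=False,
    thinker_max_new_tokens=16,
)
print(processor.batch_decode(text_ids, skip_special_tokens=True))
```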
|
|
|
### Code to create this repo:
|
|
|
```python |
|
from pathlib import Path |
|
|
|
import torch |
|
|
|
from huggingface_hub import hf_hub_download |
|
from transformers import (
    AutoConfig,
    GenerationConfig,
    Qwen2_5OmniModel,
    Qwen2_5OmniProcessor,
    set_seed,
)
|
|
|
source_model_id = "Qwen/Qwen2.5-Omni-7B" |
|
save_folder = "/tmp/tiny-random/qwen2.5-omni" |
|
|
|
# Reuse the original processor (tokenizer and feature extractors) unchanged.
processor = Qwen2_5OmniProcessor.from_pretrained(
|
source_model_id, trust_remote_code=True, |
|
) |
|
processor.save_pretrained(save_folder) |
|
|
|
# Load the original config, then shrink every sub-model below.
config = AutoConfig.from_pretrained(
|
source_model_id, trust_remote_code=True, |
|
) |
|
# Shared width for the embeddings exchanged between sub-modules.
OUTPUT_DIM = 16

# Shrink the talker (speech-token decoder) to a single tiny layer.
config.talker_config.num_hidden_layers = 1
|
config.talker_config.hidden_size = 16 |
|
config.talker_config.embedding_size = OUTPUT_DIM |
|
config.talker_config.head_dim = 16 |
|
config.talker_config.num_attention_heads = 1 |
|
config.talker_config.num_key_value_heads = 1 |
|
config.talker_config.intermediate_size = 32 |
|
config.talker_config.rope_scaling['mrope_section'] = [2, 2, 4] |
|
assert 2 * sum(config.talker_config.rope_scaling['mrope_section']) == (
    config.talker_config.hidden_size / config.talker_config.num_attention_heads
)
|
|
|
# Shrink the thinker's audio encoder.
config.thinker_config.audio_config.num_hidden_layers = 1
|
config.thinker_config.audio_config.encoder_layers = 1 |
|
config.thinker_config.audio_config.d_model = 16 |
|
config.thinker_config.audio_config.encoder_attention_heads = 1 |
|
config.thinker_config.audio_config.encoder_ffn_dim = 32 |
|
config.thinker_config.audio_config.output_dim = OUTPUT_DIM |
|
|
|
# Shrink the thinker's text backbone.
config.thinker_config.text_config.num_hidden_layers = 1
|
config.thinker_config.text_config.hidden_size = OUTPUT_DIM |
|
config.thinker_config.text_config.intermediate_size = 32 |
|
config.thinker_config.text_config.num_attention_heads = 1 |
|
config.thinker_config.text_config.num_key_value_heads = 1 |
|
config.thinker_config.text_config.rope_scaling['mrope_section'] = [2, 2, 4] |
|
assert 2 * sum(config.thinker_config.text_config.rope_scaling['mrope_section']) == (
    config.thinker_config.text_config.hidden_size / config.thinker_config.text_config.num_attention_heads
)
|
|
|
# Shrink the thinker's vision encoder.
config.thinker_config.vision_config.depth = 2
|
config.thinker_config.vision_config.embed_dim = 16 |
|
config.thinker_config.vision_config.hidden_size = 16 |
|
config.thinker_config.vision_config.intermediate_size = 32 |
|
config.thinker_config.vision_config.out_hidden_size = OUTPUT_DIM |
|
config.thinker_config.vision_config.num_heads = 1 |
|
config.thinker_config.vision_config.fullatt_block_indexes = [1] |
|
|
|
# Shrink the token2wav BigVGAN vocoder.
config.token2wav_config.bigvgan_config.resblock_dilation_sizes = [[1, 3, 5]]
|
config.token2wav_config.bigvgan_config.resblock_kernel_sizes = [7] |
|
config.token2wav_config.bigvgan_config.upsample_initial_channel = 32 |
|
config.token2wav_config.bigvgan_config.upsample_kernel_sizes = [11, 4] |
|
config.token2wav_config.bigvgan_config.upsample_rates = [5, 2] |
|
|
|
# Shrink the token2wav DiT.
config.token2wav_config.dit_config.depth = 1
|
config.token2wav_config.dit_config.num_hidden_layers = 1 |
|
config.token2wav_config.dit_config.hidden_size = 16 |
|
config.token2wav_config.dit_config.dim = 16 |
|
config.token2wav_config.dit_config.emb_dim = 16 |
|
config.token2wav_config.dit_config.enc_attention_channels = 16 |
|
config.token2wav_config.dit_config.enc_channels = [32, 32, 32] |
|
config.token2wav_config.dit_config.enc_dilations = [1, 3, 4] |
|
config.token2wav_config.dit_config.enc_kernel_sizes = [5, 3, 1] |
|
config.token2wav_config.dit_config.enc_dim = 16 |
|
config.token2wav_config.dit_config.enc_emb_dim = 16 |
|
config.token2wav_config.dit_config.enc_lin_neurons = 16 |
|
config.token2wav_config.dit_config.head_dim = 16 |
|
config.token2wav_config.dit_config.num_attention_heads = 1 |
|
config.token2wav_config.dit_config.heads = 1 |
|
# Match the DiT vocab size to the talker's, so any token id the randomly initialized talker emits stays in range.
|
config.token2wav_config.dit_config.num_embeds = config.talker_config.vocab_size |
|
print(config) |
|
|
|
# Download the original speaker dictionary and trim each conditioning embedding to the tiny DiT width.
spk_dict = torch.load(hf_hub_download(source_model_id, 'spk_dict.pt', repo_type='model'))
for info in spk_dict.values():
    info['cond'] = info['cond'][:, :config.token2wav_config.dit_config.enc_emb_dim].clone()
torch.save(spk_dict, Path(save_folder, "spk_dict.pt"))
|
|
|
# Instantiate the randomly initialized model in bfloat16, then restore the default dtype.
torch.set_default_dtype(torch.bfloat16)
model = Qwen2_5OmniModel(config)
torch.set_default_dtype(torch.float32)
|
# Reuse the original model's generation config.
model.generation_config = GenerationConfig.from_pretrained(
|
source_model_id, trust_remote_code=True, |
|
) |
|
set_seed(42) |
|
# Re-initialize every parameter with small random values (seeded above for reproducibility).
with torch.no_grad():
|
for name, p in sorted(model.named_parameters()): |
|
torch.nn.init.normal_(p, 0, 0.5) |
|
print(name, p.shape, p.dtype) |
|
model.save_pretrained(save_folder) |
|
``` |
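
As a final sanity check, the saved folder can be reloaded and its size reported; a minimal sketch (the exact parameter count depends on the shrunken config above):

```python
from transformers import Qwen2_5OmniModel

save_folder = "/tmp/tiny-random/qwen2.5-omni"  # same path as in the script above
reloaded = Qwen2_5OmniModel.from_pretrained(save_folder, torch_dtype="auto")
n_params = sum(p.numel() for p in reloaded.parameters())
print(f"Reloaded tiny model with {n_params / 1e6:.2f}M parameters")
```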