Enable AutoProcessor and support latest transformers release (#3)
- Upload folder using huggingface_hub (7eaa54c4c23cd6586f1946b77a2d5276baec58e7)
Co-authored-by: Philipp Guevorguian <[email protected]>
- chat_template.jinja +89 -0
- config.json +35 -6
- generation_config.json +1 -1
- model.safetensors.index.json +1 -0
- modular_isaac.py +169 -8
- processor_config.json +209 -0
- tokenizer.json +2 -2
- tokenizer_config.json +4 -2
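
The practical effect of this commit is that the Isaac processor now resolves through the standard auto classes. A minimal loading sketch under that assumption (the repository id below is a placeholder, not part of this commit; `trust_remote_code=True` is required because `auto_map` points at `modular_isaac`):

from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "<namespace>/<isaac-model>"  # placeholder: substitute the actual Hub repository id

# processor_config.json / tokenizer_config.json now map AutoProcessor to modular_isaac.IsaacProcessor
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
# config.json maps AutoModelForCausalLM to modular_isaac.IsaacForConditionalGeneration
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)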
chat_template.jinja
ADDED
@@ -0,0 +1,89 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
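
The template above is the Qwen3-style ChatML layout with optional tool definitions and a `<think>` reasoning block. A hedged sketch of rendering it through the tokenizer (messages and the `enable_thinking` flag are illustrative; the placeholder repo id is not part of this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<namespace>/<isaac-model>")  # placeholder repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the attached image."},
]

# chat_template.jinja is picked up automatically; extra kwargs such as
# enable_thinking are forwarded into the template context.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends '<|im_start|>assistant\n'
    enable_thinking=False,       # template then emits an empty <think>...</think> block
)
print(prompt)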
config.json
CHANGED
@@ -2,20 +2,50 @@
   "architectures": [
     "IsaacForConditionalGeneration"
   ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
   "auto_map": {
-    "AutoProcessor": "modular_isaac.IsaacProcessor",
     "AutoConfig": "modular_isaac.IsaacConfig",
     "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
   },
-  "attention_bias": false,
-  "attention_dropout": 0.0,
   "bos_token_id": 151643,
+  "dtype": "float32",
   "eos_token_id": 151645,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
   "max_position_embeddings": 40960,
   "max_sequence_length": 16384,
   "max_window_layers": 28,
@@ -33,8 +63,7 @@
   "rope_theta": 1000000.0,
   "sliding_window": null,
   "tie_word_embeddings": false,
-  "
-  "transformers_version": "4.51.1",
+  "transformers_version": "4.56.1",
   "use_cache": true,
   "use_sliding_window": false,
   "video_patch_size": 16,
@@ -57,4 +86,4 @@
   "vision_min_num_patches": 256,
   "vision_token": "<image>",
   "vocab_size": 151936
-}
+}
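
Besides dropping `AutoProcessor` from config.json's `auto_map`, the config now records `"dtype": "float32"`, adds a per-layer `layer_types` list, and bumps `transformers_version` to 4.56.1. A quick check of the loaded config, as a sketch (placeholder repo id; it assumes `IsaacConfig` exposes these fields the same way its Qwen3 base class does):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("<namespace>/<isaac-model>", trust_remote_code=True)  # placeholder
# One attention-type entry per decoder layer; all 28 are "full_attention" here.
assert len(config.layer_types) == config.num_hidden_layers
print(config.layer_types[0], config.max_position_embeddings)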
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
-  "transformers_version": "4.
+  "transformers_version": "4.56.1"
 }
model.safetensors.index.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "metadata": {
+    "total_parameters": 2567073008,
     "total_size": 10268292032
   },
   "weight_map": {
modular_isaac.py
CHANGED
@@ -14,15 +14,19 @@ import PIL.Image
 from transformers import (
     AutoTokenizer,
     BatchFeature,
+    Cache,
     Qwen3Config,
     Qwen3ForCausalLM,
     Qwen3PreTrainedModel,
 )
+from transformers.cache_utils import SlidingWindowCache, StaticCache
 from transformers.generation.utils import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3Model
+from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils import TensorType
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 import re
 
 from transformers.models.siglip2.modeling_siglip2 import (
@@ -62,7 +66,6 @@ class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
         num_patches: int = 256,
         **kwargs,
     ):
-        # Call parent with all vision config parameters
         super().__init__(**kwargs)
 
         # Add our custom fields
@@ -874,16 +877,20 @@ def create_text_event(tokenizer: AutoTokenizer, text: str, time: float = 0.0) ->
 
 
 class IsaacProcessor(ProcessorMixin):
-    attributes = []
-    tokenizer_class = ("
+    attributes = ["tokenizer"]
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+
 
     def __init__(
         self,
-        tokenizer:
-        config: IsaacConfig,
+        tokenizer: Qwen2Tokenizer,
+        config: IsaacConfig | dict,
     ):
-        super().__init__()
+        super().__init__(tokenizer)
         self.tokenizer = tokenizer
+
+        if isinstance(config, dict):
+            config = IsaacConfig(**config)
         self.config = config
 
         # Use vision token from config
@@ -1121,8 +1128,9 @@ class IsaacRotaryEmbedding(nn.Module):
 class IsaacModel(Qwen3Model):
     def __init__(self, config: IsaacConfig):
         super().__init__(config)
+        text_cfg = getattr(config, "get_text_config", lambda: config)()
         self.layers = torch.nn.ModuleList(
-            [Qwen3DecoderLayer(
+            [Qwen3DecoderLayer(text_cfg, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.rotary_emb = IsaacRotaryEmbedding(config, device=self.device)
 
@@ -1276,7 +1284,7 @@
                 **kwargs,
             )
 
-            hidden_states = layer_outputs[0]
+            hidden_states = layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
 
         # Final layer norm
         hidden_states = self.norm(hidden_states)
@@ -1286,6 +1294,159 @@
             past_key_values=past_key_values,
         )
 
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and past_key_values is not None:
+                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
+                if is_padding_right:
+                    raise ValueError(
+                        "You are attempting to perform batched generation with padding_side='right'"
+                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3. Make sure to "
+                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                    )
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.config._attn_implementation == "sdpa"
+            and not (using_static_cache or using_sliding_window_cache)
+            and not output_attentions
+        ):
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                sliding_window=self.config.sliding_window,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        # SlidingWindowCache or StaticCache
+        if using_sliding_window_cache or using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        # DynamicCache or no cache
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+            config=self.config,
+            past_key_values=past_key_values,
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        config: Qwen3Config,
+        past_key_values: Cache,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+            config (`Qwen3Config`):
+                The model's configuration class
+            past_key_values (`Cache`):
+                The cache class that is being used currently to generate
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            if config.sliding_window is not None:
+                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
+                # the check is needed to verify is current checkpoint was trained with sliding window or not
+                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
+                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
+                        cache_position.reshape(-1, 1) - config.sliding_window
+                    )
+                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+
 
 class IsaacForConditionalGeneration(Qwen3ForCausalLM, GenerationMixin):
     """Isaac multimodal model for conditional generation."""
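
Most of the added Python is the Qwen3-style causal-mask machinery (`_update_causal_mask` plus the static `_prepare_4d_causal_attention_mask_with_cache_position`) ported onto `IsaacModel` so masks are built correctly under the newer transformers release, alongside the `IsaacProcessor` changes that register the tokenizer as a proper `ProcessorMixin` attribute. The helper returns an additive 4D mask; a tiny standalone illustration of that convention (not the method itself, just the same masking rule) is:

import torch

# 0.0 where attention is allowed, dtype-min where it is blocked.
seq_len, target_len = 3, 5
min_val = torch.finfo(torch.float32).min
cache_position = torch.arange(2, 2 + seq_len)      # query tokens sit at cache slots 2..4

mask = torch.full((seq_len, target_len), min_val)
allowed = torch.arange(target_len) <= cache_position.reshape(-1, 1)
mask = mask.masked_fill(allowed, 0.0)              # causal with respect to cache positions
mask = mask[None, None, :, :]                      # (batch, 1, query_length, key_value_length)
print(mask.shape)                                  # torch.Size([1, 1, 3, 5])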
processor_config.json
ADDED
@@ -0,0 +1,209 @@
+{
+  "auto_map": {
+    "AutoProcessor": "modular_isaac.IsaacProcessor"
+  },
+  "config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": [
+      "IsaacForConditionalGeneration"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dtype": "float32",
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 40960,
+    "max_sequence_length": 16384,
+    "max_window_layers": 28,
+    "min_length": 0,
+    "model_type": "isaac",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "pixel_shuffle_scale": 2,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_interleaved": true,
+      "mrope_section": null,
+      "rope_type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.56.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "video_patch_size": 16,
+    "vision_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_dropout": 0.0,
+      "bad_words_ids": null,
+      "begin_suppress_tokens": null,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "diversity_penalty": 0.0,
+      "do_sample": false,
+      "dtype": null,
+      "early_stopping": false,
+      "encoder_no_repeat_ngram_size": 0,
+      "eos_token_id": null,
+      "exponential_decay_length_penalty": null,
+      "finetuning_task": null,
+      "forced_bos_token_id": null,
+      "forced_eos_token_id": null,
+      "hidden_act": "gelu_pytorch_tanh",
+      "hidden_size": 1152,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_size": 256,
+      "intermediate_size": 4304,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_norm_eps": 1e-06,
+      "length_penalty": 1.0,
+      "max_length": 20,
+      "min_length": 0,
+      "model_type": "pixel_shuffle_siglip2",
+      "no_repeat_ngram_size": 0,
+      "num_attention_heads": 16,
+      "num_beam_groups": 1,
+      "num_beams": 1,
+      "num_channels": 3,
+      "num_hidden_layers": 27,
+      "num_patches": 256,
+      "num_return_sequences": 1,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "output_scores": false,
+      "pad_token_id": null,
+      "patch_size": 16,
+      "pixel_shuffle_scale_factor": 2,
+      "prefix": null,
+      "problem_type": null,
+      "pruned_heads": {},
+      "remove_invalid_values": false,
+      "repetition_penalty": 1.0,
+      "return_dict": true,
+      "return_dict_in_generate": false,
+      "sep_token_id": null,
+      "suppress_tokens": null,
+      "task_specific_params": null,
+      "temperature": 1.0,
+      "tf_legacy_loss": false,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "top_k": 50,
+      "top_p": 1.0,
+      "torchscript": false,
+      "typical_p": 1.0,
+      "use_bfloat16": false
+    },
+    "vision_max_num_patches": 6144,
+    "vision_min_num_patches": 256,
+    "vision_token": "<image>",
+    "vocab_size": 151936
+  },
+  "processor_class": "IsaacProcessor"
+}
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7ceaf87113caa06d8b2e2f6966ab11d12ac590cb887b64c591cae70ea89245f4
+size 11422655
tokenizer_config.json
CHANGED
@@ -226,15 +226,17 @@
     "<|image_pad|>",
     "<|video_pad|>"
   ],
+  "auto_map": {
+    "AutoProcessor": "modular_isaac.IsaacProcessor"
+  },
   "bos_token": null,
-  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
-  "processor_class": "
+  "processor_class": "IsaacProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null