feat: Add CPU support
#18 opened by gabegoodhart

modeling_nemotron_h.py  (+38, -12)  CHANGED
@@ -16,6 +16,7 @@
 """PyTorch NemotronH model."""
 
 import math
+from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import Any, Dict, Optional, Tuple, Union
 
@@ -61,8 +62,9 @@ else:
 try:
     #from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
     from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn
+    FAST_RMSNORM = True
 except ImportError:
-
+    FAST_RMSNORM = False
 
 if is_causal_conv1d_available():
     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
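Note on the hunk above: FAST_RMSNORM is plain import-based feature detection, so the same probe can be run standalone to see which normalization path a given environment will take. A minimal sketch (the variable name `fast` is illustrative, not part of the PR):

try:
    # Fused Triton kernel shipped with mamba_ssm; typically unavailable on CPU-only installs
    from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn
    fast = True
except ImportError:
    fast = False

print("fused gated RMSNorm available:", fast)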
@@ -269,14 +271,30 @@ class MambaRMSNormGated(torch.nn.Module):
 
     # jan28b version
     def forward(self, hidden_states, gate=None):
-
-
-
-
-
-
-
-
+        if FAST_RMSNORM:
+            return rmsnorm_fn(x=hidden_states,
+                              weight=self.weight,
+                              bias=None,  # No bias
+                              z=gate,
+                              eps=self.variance_epsilon,
+                              group_size=self.group_size,
+                              norm_before_gate=False
+                              )
+
+        # standard version
+        input_dtype = hidden_states.dtype
+        batch_size, seq_len, hidden_size = hidden_states.shape
+        num_groups = self.weight.shape[0] // self.group_size
+        hidden_states = hidden_states.to(torch.float32)
+
+        if gate is not None:
+            hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32))
+        hidden_states = hidden_states.view(batch_size, seq_len, num_groups, hidden_size // num_groups)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        hidden_states = hidden_states.view(batch_size, seq_len, hidden_size)
+        return self.weight * hidden_states.to(input_dtype)
 
 class NemotronHMamba2Mixer(nn.Module):
     """
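Note on the hunk above: the new fallback is a plain-PyTorch gated, grouped RMSNorm, which is what makes the layer usable without the Triton kernel. A minimal sketch of the same computation pulled out of the class (the helper name and the toy shapes are assumptions for illustration): apply the SiLU gate first, RMS-normalize each group in float32, then rescale by the learned weight.

import torch
import torch.nn.functional as F

def gated_group_rmsnorm(x, weight, gate=None, group_size=None, eps=1e-5):
    # Mirrors the "standard version" branch above: gate, then per-group RMS norm.
    batch_size, seq_len, hidden_size = x.shape
    group_size = group_size or hidden_size
    num_groups = hidden_size // group_size
    y = x.to(torch.float32)
    if gate is not None:
        y = y * F.silu(gate.to(torch.float32))                    # gate before normalizing
    y = y.view(batch_size, seq_len, num_groups, group_size)
    y = y * torch.rsqrt(y.pow(2).mean(-1, keepdim=True) + eps)    # per-group RMS norm
    return weight * y.view(batch_size, seq_len, hidden_size).to(x.dtype)

x, gate = torch.randn(2, 4, 8), torch.randn(2, 4, 8)
print(gated_group_rmsnorm(x, torch.ones(8), gate=gate, group_size=4).shape)  # torch.Size([2, 4, 8])

The gate-then-normalize order matches the fused call, which is invoked with norm_before_gate=False.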
@@ -623,8 +641,8 @@ class NemotronHMamba2Mixer(nn.Module):
             hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
             B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
             C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
-            B = B.
-            C = C.
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
             pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
 
             D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
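Note on the hunk above: B and C are projected once per SSM group, while the torch path updates state per head, so the two repeat_interleave calls copy each group's tensor to every head in that group along dim=2. A toy shape check (the sizes are made up for illustration, not the model's real config):

import torch

num_heads, n_groups, state_size = 8, 2, 16
B = torch.randn(1, 6, n_groups, state_size)   # (batch, seq_len, n_groups, ssm_state_size)
B = B.repeat_interleave(num_heads // n_groups, dim=2, output_size=num_heads)
print(B.shape)                                # torch.Size([1, 6, 8, 16]): one copy per head

Passing output_size simply tells PyTorch the resulting length along dim=2 up front rather than deriving it from the repeat count.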
@@ -757,6 +775,14 @@ class NemotronHBlock(nn.Module):
         else:
             raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}")
 
+    @contextmanager
+    def _maybe_cuda_stream(self, device):
+        if torch.cuda.is_available():
+            with torch.cuda.stream(torch.cuda.default_stream(device)):
+                yield
+        else:
+            yield
+
     def forward(
         self,
         hidden_states,
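Note on the hunk above: the new helper is what makes the block device-agnostic. It enters the default CUDA stream when CUDA is present (preserving the multi-GPU NaN workaround mentioned in the comment in the next hunk) and otherwise just yields. A standalone sketch plus a usage line (the name maybe_cuda_stream and the toy tensor are illustrative, not part of the PR):

import torch
from contextlib import contextmanager

@contextmanager
def maybe_cuda_stream(device):
    if torch.cuda.is_available():
        # Same guard as the method above: pin work to the default stream on GPU
        with torch.cuda.stream(torch.cuda.default_stream(device)):
            yield
    else:
        yield  # no-op on CPU-only machines

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(2, 3, device=device)
with maybe_cuda_stream(x.device):
    y = x * 2  # runs unchanged in either case

Note that the guard keys off torch.cuda.is_available() rather than the device of the incoming tensor, so it assumes the hidden states live on the GPU whenever one is present.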
@@ -764,7 +790,7 @@ class NemotronHBlock(nn.Module):
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
     ):
-        with
+        with self._maybe_cuda_stream(hidden_states.device):
             # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs
             residual = hidden_states
             hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
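With these changes the model can be exercised end to end on a CPU-only machine. A hedged usage sketch, assuming the checkpoint is loaded with trust_remote_code so that this modeling file is picked up; the model_id below is a placeholder, not a real path:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "path/to/nemotron-h-checkpoint"  # placeholder: substitute the actual repo id or local path

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # plain float32 on CPU; the fused Triton/causal-conv1d paths are skipped when those packages are absent
).to("cpu")

inputs = tokenizer("Hello", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(out[0], skip_special_tokens=True))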