Safetensors
custom_code
gheinrich committed on
Commit
a1fc2a7
·
1 Parent(s): 983e84b

Sync with nvidia/C-RADIOv2-VLM-H-RC3

Browse files
Files changed (4) hide show
  1. README.md +3 -1
  2. config.json +12 -2
  3. model.safetensors +1 -1
  4. vit_patch_generator.py +19 -0
README.md CHANGED
@@ -6,6 +6,8 @@ license_link: https://developer.download.nvidia.com/licenses/nvidia-open-model-l
6
 
7
  # Model Overview
8
 
 
 
9
  ## Description
10
 
11
  This model performs visual feature extraction.
@@ -78,7 +80,7 @@ import torch
78
  from PIL import Image
79
  from transformers import AutoModel, CLIPImageProcessor
80
 
81
- hf_repo = "nvidia/C-RADIOv2-H"
82
 
83
  image_processor = CLIPImageProcessor.from_pretrained(hf_repo)
84
  model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True)
 
6
 
7
  # Model Overview
8
 
9
+ [[**Github**](https://github.com/NVlabs/RADIO)] [[**CVPR 2025**](https://arxiv.org/abs/2412.07679)] [[**CVPR 2024**](https://arxiv.org/abs/2312.06709)]
10
+
11
  ## Description
12
 
13
  This model performs visual feature extraction.
 
80
  from PIL import Image
81
  from transformers import AutoModel, CLIPImageProcessor
82
 
83
+ hf_repo = "nvidia/C-RADIOv2-VLM-H-RC3"
84
 
85
  image_processor = CLIPImageProcessor.from_pretrained(hf_repo)
86
  model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True)
config.json CHANGED
@@ -16,7 +16,7 @@
16
  "cache_dir": null,
17
  "channels_last": false,
18
  "checkpoint_hist": 10,
19
- "chk_keep_forever": 50,
20
  "class_map": "",
21
  "clip_grad": null,
22
  "clip_mode": "norm",
@@ -31,6 +31,7 @@
31
  "crop_pct": null,
32
  "cutmix": 0.0,
33
  "cutmix_minmax": null,
 
34
  "dataset_download": false,
35
  "debug_full_knn": false,
36
  "decay_epochs": 90,
@@ -64,7 +65,7 @@
64
  "force_new_wandb_id": false,
65
  "force_spectral_reparam": true,
66
  "freeze_bn": false,
67
- "fsdp": true,
68
  "full_equivariance": false,
69
  "fuser": "",
70
  "gp": null,
@@ -169,6 +170,15 @@
169
  "name": "siglip2-g",
170
  "type": "siglip2",
171
  "use_summary": true
 
 
 
 
 
 
 
 
 
172
  }
173
  ],
174
  "torchcompile": null,
 
16
  "cache_dir": null,
17
  "channels_last": false,
18
  "checkpoint_hist": 10,
19
+ "chk_keep_forever": 100,
20
  "class_map": "",
21
  "clip_grad": null,
22
  "clip_mode": "norm",
 
31
  "crop_pct": null,
32
  "cutmix": 0.0,
33
  "cutmix_minmax": null,
34
+ "damp": null,
35
  "dataset_download": false,
36
  "debug_full_knn": false,
37
  "decay_epochs": 90,
 
65
  "force_new_wandb_id": false,
66
  "force_spectral_reparam": true,
67
  "freeze_bn": false,
68
+ "fsdp": false,
69
  "full_equivariance": false,
70
  "fuser": "",
71
  "gp": null,
 
170
  "name": "siglip2-g",
171
  "type": "siglip2",
172
  "use_summary": true
173
+ },
174
+ {
175
+ "fd_normalize": false,
176
+ "feature_distillation": true,
177
+ "input_size": 384,
178
+ "model": "siglip2-g-384",
179
+ "name": "siglip2-g-dirty",
180
+ "type": "siglip2",
181
+ "use_summary": false
182
  }
183
  ],
184
  "torchcompile": null,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e00385571b42a060742e3dd14ea7ac17f099968482115dc49a66aac34d1aa0a2
3
  size 2606616120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96ff3bfec4f732d68a0c38c41a49de043abd2503df24481526ea87d26dd6a4f5
3
  size 2606616120
vit_patch_generator.py CHANGED
@@ -119,6 +119,10 @@ class ViTPatchGenerator(nn.Module):
119
  'pos_embed',
120
  ]
121
 
 
 
 
 
122
  def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
123
  if src_embed.shape != targ_embed.shape:
124
  src_size = int(math.sqrt(src_embed.shape[1]))
@@ -281,3 +285,18 @@ class ViTPatchLinear(nn.Linear):
281
  **factory
282
  )
283
  self.patch_size = patch_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  'pos_embed',
120
  ]
121
 
122
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
123
+ if self.abs_pos:
124
+ self._load_embed(state_dict[f'{prefix}pos_embed'], self.pos_embed)
125
+
126
  def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
127
  if src_embed.shape != targ_embed.shape:
128
  src_size = int(math.sqrt(src_embed.shape[1]))
 
285
  **factory
286
  )
287
  self.patch_size = patch_size
288
+
289
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
290
+ if self.bias is not None:
291
+ self.bias.data.copy_(state_dict[f'{prefix}bias'])
292
+
293
+ chk_weight = state_dict[f'{prefix}weight']
294
+ if chk_weight.shape != self.weight.shape:
295
+ src_patch_size = int(math.sqrt(chk_weight.shape[1] // 3))
296
+
297
+ assert (src_patch_size ** 2) * 3 == chk_weight.shape[1], 'Unable to interpolate non-square patch size'
298
+
299
+ chk_weight = rearrange(chk_weight, 'b (c h w) -> b c h w', c=3, h=src_patch_size, w=src_patch_size)
300
+ chk_weight = F.interpolate(chk_weight, size=(self.patch_size, self.patch_size), mode='bicubic', align_corners=True, antialias=False)
301
+ chk_weight = rearrange(chk_weight, 'b c h w -> b (c h w)')
302
+ self.weight.data.copy_(chk_weight)