Upload folder using huggingface_hub

Files changed:
- config.json (+8, -6)
- modeling_interfuser.py (+368, -0)
- pytorch_model.bin (+2, -2)
config.json
CHANGED
@@ -1,16 +1,18 @@
 {
-  "model_type": "interfuser",
   "architectures": [
-    "
+    "InterfuserForHuggingFace"
   ],
+  "dec_depth": 6,
+  "dim_feedforward": 2048,
   "embed_dim": 256,
   "enc_depth": 6,
-  "
+  "in_chans": 12,
+  "lidar_backbone_name": "r18",
+  "model_type": "interfuser",
   "num_heads": 8,
-  "dim_feedforward": 2048,
-  "dropout": 0.1,
   "rgb_backbone_name": "r50",
-  "
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
   "use_different_backbone": true,
   "waypoints_pred_head": "gru"
 }
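Note: the new keys mirror the defaults of the InterfuserConfig class added in modeling_interfuser.py below. As a minimal sketch (not part of the commit), the same config object can be rebuilt in Python; this assumes modeling_interfuser.py and the InterFuser/timm packages it imports are available on the path, and the values are copied from the JSON above:

# Minimal sketch, not part of the commit: rebuild the committed config in Python.
# Assumes modeling_interfuser.py (added below) and its InterFuser/timm dependencies are importable.
from modeling_interfuser import InterfuserConfig

config = InterfuserConfig(
    embed_dim=256,
    enc_depth=6,
    dec_depth=6,
    num_heads=8,
    dim_feedforward=2048,
    rgb_backbone_name="r50",
    lidar_backbone_name="r18",
    waypoints_pred_head="gru",
    use_different_backbone=True,
)
print(config.to_json_string())  # core keys match the new config.json above (metadata keys may differ)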
modeling_interfuser.py
ADDED
@@ -0,0 +1,368 @@

# -*- coding: utf-8 -*-
# This file contains all custom class definitions required to run the Interfuser model.

import torch
from torch import nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PretrainedConfig
from functools import partial
import math
from collections import OrderedDict
import copy
from typing import Optional, List
from torch import Tensor

# ...
# It's better to import from the original source if possible
# For full portability, we define them here.
from InterFuser.interfuser.timm.models.layers import to_2tuple
from InterFuser.interfuser.timm.models.resnet import resnet50d, resnet26d, resnet18d

# ==============================================================================
# SECTION 1: ALL DEPENDENCY CLASSES FROM THE ORIGINAL CODE
# ==============================================================================

class HybridEmbed(nn.Module):
    def __init__(self, backbone, img_size=224, patch_size=1, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
                if isinstance(o, (list, tuple)):
                    o = o[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            if hasattr(self.backbone, "feature_info"):
                feature_dim = self.backbone.feature_info.channels()[-1]
            else:
                feature_dim = self.backbone.num_features
        self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=1, stride=1)

    def forward(self, x):
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]
        x = self.proj(x)
        global_x = torch.mean(x, [2, 3], keepdim=False)[:, :, None]
        return x, global_x

class PositionEmbeddingSine(nn.Module):
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor):
        x = tensor
        bs, _, h, w = x.shape
        not_mask = torch.ones((bs, h, w), device=x.device)
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=nn.ReLU(), normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = activation
        self.normalize_before = normalize_before
    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos
    def forward(self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class TransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
    def forward(self, src, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos)
        if self.norm is not None:
            output = self.norm(output)
        return output

class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=nn.ReLU(), normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.activation = activation
        self.normalize_before = normalize_before
    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos
    def forward(self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

class TransformerDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate
    def forward(self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None):
        output = tgt
        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos)
        if self.norm is not None:
            output = self.norm(output)
        return output.unsqueeze(0)

class GRUWaypointsPredictor(nn.Module):
    def __init__(self, input_dim, waypoints=10):
        super().__init__()
        self.gru = torch.nn.GRU(input_size=input_dim, hidden_size=64, batch_first=True)
        self.encoder = nn.Linear(2, 64)
        self.decoder = nn.Linear(64, 2)
        self.waypoints = waypoints
    def forward(self, x, target_point):
        bs = x.shape[0]
        z = self.encoder(target_point).unsqueeze(0)
        output, _ = self.gru(x, z)
        output = output.reshape(bs * self.waypoints, -1)
        output = self.decoder(output).reshape(bs, self.waypoints, 2)
        output = torch.cumsum(output, 1)
        return output

# ... (Add other dependency classes like SpatialSoftmax, MultiPath_Generator, etc. if needed by other configs)

# --- The ORIGINAL Interfuser Model Class ---
class Interfuser(nn.Module):
    def __init__(
        self,
        img_size=224,
        multi_view_img_size=112,
        patch_size=8,
        in_chans=3,
        embed_dim=768,
        enc_depth=6,
        dec_depth=6,
        dim_feedforward=2048,
        normalize_before=False,
        rgb_backbone_name="r26",
        lidar_backbone_name="r26",
        num_heads=8,
        norm_layer=None,
        dropout=0.1,
        end2end=False,
        direct_concat=True,
        separate_view_attention=False,
        separate_all_attention=False,
        act_layer=None,
        weight_init="",
        freeze_num=-1,
        with_lidar=False,
        with_right_left_sensors=True,
        with_center_sensor=False,
        traffic_pred_head_type="det",
        waypoints_pred_head="heatmap",
        reverse_pos=True,
        use_different_backbone=False,
        use_view_embed=True,
        use_mmad_pretrain=None,
    ):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.waypoints_pred_head = waypoints_pred_head
        self.with_lidar = with_lidar
        self.with_right_left_sensors = with_right_left_sensors
        self.attn_mask = None  # Simplified

        if use_different_backbone:
            if rgb_backbone_name == "r50": self.rgb_backbone = resnet50d(pretrained=False, in_chans=3, features_only=True, out_indices=[4])
            if rgb_backbone_name == "r26": self.rgb_backbone = resnet26d(pretrained=False, in_chans=3, features_only=True, out_indices=[4])
            if lidar_backbone_name == "r18": self.lidar_backbone = resnet18d(pretrained=False, in_chans=3, features_only=True, out_indices=[4])

            rgb_embed_layer = partial(HybridEmbed, backbone=self.rgb_backbone)
            lidar_embed_layer = partial(HybridEmbed, backbone=self.lidar_backbone)
            self.rgb_patch_embed = rgb_embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
            self.lidar_patch_embed = lidar_embed_layer(img_size=img_size, patch_size=patch_size, in_chans=3, embed_dim=embed_dim)
        else: raise NotImplementedError("Only use_different_backbone=True supported in this wrapper")

        self.global_embed = nn.Parameter(torch.zeros(1, embed_dim, 5))
        self.view_embed = nn.Parameter(torch.zeros(1, embed_dim, 5, 1))
        self.query_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, 11))
        self.query_embed = nn.Parameter(torch.zeros(400 + 11, 1, embed_dim))

        if self.waypoints_pred_head == "gru": self.waypoints_generator = GRUWaypointsPredictor(embed_dim)
        else: raise NotImplementedError("Only GRU waypoints head supported in this wrapper")

        self.junction_pred_head = nn.Linear(embed_dim, 2)
        self.traffic_light_pred_head = nn.Linear(embed_dim, 2)
        self.stop_sign_head = nn.Linear(embed_dim, 2)
        self.traffic_pred_head = nn.Sequential(*[nn.Linear(embed_dim + 32, 64), nn.ReLU(), nn.Linear(64, 7), nn.Sigmoid()])
        self.position_encoding = PositionEmbeddingSine(embed_dim // 2, normalize=True)

        encoder_layer = TransformerEncoderLayer(embed_dim, num_heads, dim_feedforward, dropout, act_layer, normalize_before)
        self.encoder = TransformerEncoder(encoder_layer, enc_depth, None)
        decoder_layer = TransformerDecoderLayer(embed_dim, num_heads, dim_feedforward, dropout, act_layer, normalize_before)
        decoder_norm = nn.LayerNorm(embed_dim)
        self.decoder = TransformerDecoder(decoder_layer, dec_depth, decoder_norm, return_intermediate=False)

    def forward_features(self, front_image, left_image, right_image, front_center_image, lidar, measurements):
        features = []
        front_image_token, front_image_token_global = self.rgb_patch_embed(front_image)
        front_image_token = (front_image_token + self.position_encoding(front_image_token))
        front_image_token = front_image_token.flatten(2).permute(2, 0, 1)
        front_image_token_global = (front_image_token_global + self.global_embed[:, :, 0:1])
        front_image_token_global = front_image_token_global.permute(2, 0, 1)
        features.extend([front_image_token, front_image_token_global])
        left_image_token, left_image_token_global = self.rgb_patch_embed(left_image)
        left_image_token = (left_image_token + self.position_encoding(left_image_token)).flatten(2).permute(2, 0, 1)
        left_image_token_global = (left_image_token_global + self.global_embed[:, :, 1:2]).permute(2, 0, 1)
        right_image_token, right_image_token_global = self.rgb_patch_embed(right_image)
        right_image_token = (right_image_token + self.position_encoding(right_image_token)).flatten(2).permute(2, 0, 1)
        right_image_token_global = (right_image_token_global + self.global_embed[:, :, 2:3]).permute(2, 0, 1)
        features.extend([left_image_token, left_image_token_global, right_image_token, right_image_token_global])
        return torch.cat(features, 0)

    def forward(self, x):
        front_image, left_image, right_image = x["rgb"], x["rgb_left"], x["rgb_right"]
        measurements, target_point = x["measurements"], x["target_point"]
        features = self.forward_features(front_image, left_image, right_image, x["rgb_center"], x["lidar"], measurements)
        bs = front_image.shape[0]
        tgt = self.position_encoding(torch.ones((bs, 1, 20, 20), device=x["rgb"].device)).flatten(2)
        tgt = torch.cat([tgt, self.query_pos_embed.repeat(bs, 1, 1)], 2).permute(2, 0, 1)
        memory = self.encoder(features, mask=self.attn_mask)
        hs = self.decoder(self.query_embed.repeat(1, bs, 1), memory, query_pos=tgt)[0].permute(1, 0, 2)
        traffic_feature = hs[:, :400]
        waypoints_feature = hs[:, 401:411]
        is_junction_feature = hs[:, 400]
        traffic_light_state_feature, stop_sign_feature = hs[:, 400], hs[:, 400]
        waypoints = self.waypoints_generator(waypoints_feature, target_point)
        is_junction = self.junction_pred_head(is_junction_feature)
        traffic_light_state = self.traffic_light_pred_head(traffic_light_state_feature)
        stop_sign = self.stop_sign_head(stop_sign_feature)
        velocity = measurements[:, 6:7].unsqueeze(-1).repeat(1, 400, 32)
        traffic_feature_with_vel = torch.cat([traffic_feature, velocity], dim=2)
        traffic = self.traffic_pred_head(traffic_feature_with_vel)
        return traffic, waypoints, is_junction, traffic_light_state, stop_sign, traffic_feature

+
# ==============================================================================
|
283 |
+
# SECTION 2: HUGGING FACE WRAPPER CLASSES
|
284 |
+
# ==============================================================================
|
285 |
+
# ==============================================================================
|
286 |
+
# أضف هذا الكود في نهاية خلية تعريف النموذج الأصلي
|
287 |
+
# ==============================================================================
|
288 |
+
|
289 |
+
print("
|
290 |
+
--- Defining Hugging Face compatible wrapper classes ---")
|
291 |
+
|
292 |
+
|
293 |
+
# --- 2. فئة النموذج المتوافقة (HF-Compatible Model Class) ---
|
294 |
+
class InterfuserConfig(PretrainedConfig):
|
295 |
+
|
296 |
+
model_type = "interfuser"
|
297 |
+
|
298 |
+
def __init__(
|
299 |
+
self,
|
300 |
+
embed_dim=256,
|
301 |
+
enc_depth=6,
|
302 |
+
dec_depth=6,
|
303 |
+
num_heads=8,
|
304 |
+
dim_feedforward=2048,
|
305 |
+
rgb_backbone_name="r50",
|
306 |
+
lidar_backbone_name="r18",
|
307 |
+
waypoints_pred_head="gru",
|
308 |
+
use_different_backbone=True,
|
309 |
+
**kwargs
|
310 |
+
):
|
311 |
+
super().__init__(**kwargs)
|
312 |
+
self.embed_dim = embed_dim
|
313 |
+
self.enc_depth = enc_depth
|
314 |
+
self.dec_depth = dec_depth
|
315 |
+
self.num_heads = num_heads
|
316 |
+
self.dim_feedforward = dim_feedforward
|
317 |
+
self.rgb_backbone_name = rgb_backbone_name
|
318 |
+
self.lidar_backbone_name = lidar_backbone_name
|
319 |
+
self.waypoints_pred_head = waypoints_pred_head
|
320 |
+
self.use_different_backbone = use_different_backbone
|
321 |
+
# Add the architectures key for auto-mapping
|
322 |
+
self.architectures = ["InterfuserForHuggingFace"]
|
323 |
+
|
324 |
+
|
325 |
+
# --- 2. فئة النموذج المتوافقة (HF-Compatible Model Class) ---
|
326 |
+
# هذه هي النسخة الجديدة من نموذجك التي ترث من PreTrainedModel
|
327 |
+
class InterfuserForHuggingFace(PreTrainedModel):
|
328 |
+
|
329 |
+
config_class = InterfuserConfig # Link to the config class
|
330 |
+
|
331 |
+
def __init__(self, config: InterfuserConfig):
|
332 |
+
super().__init__(config)
|
333 |
+
self.config = config
|
334 |
+
|
335 |
+
# We instantiate the original Interfuser model inside our wrapper
|
336 |
+
# The parameters are taken from our config object.
|
337 |
+
# This requires the original 'Interfuser' class to be defined in the notebook.
|
338 |
+
self.interfuser_model = Interfuser(
|
339 |
+
embed_dim=self.config.embed_dim,
|
340 |
+
enc_depth=self.config.enc_depth,
|
341 |
+
dec_depth=self.config.dec_depth,
|
342 |
+
num_heads=self.config.num_heads,
|
343 |
+
dim_feedforward=self.config.dim_feedforward,
|
344 |
+
rgb_backbone_name=self.config.rgb_backbone_name,
|
345 |
+
lidar_backbone_name=self.config.lidar_backbone_name,
|
346 |
+
waypoints_pred_head=self.config.waypoints_pred_head,
|
347 |
+
use_different_backbone=self.config.use_different_backbone
|
348 |
+
)
|
349 |
+
|
350 |
+
def forward(self, rgb, rgb_left, rgb_right, rgb_center, lidar, measurements, target_point, **kwargs):
|
351 |
+
|
352 |
+
# The original model expects a dictionary, so we create one.
|
353 |
+
inputs_dict = {
|
354 |
+
'rgb': rgb,
|
355 |
+
'rgb_left': rgb_left,
|
356 |
+
'rgb_right': rgb_right,
|
357 |
+
'rgb_center': rgb_center,
|
358 |
+
'lidar': lidar,
|
359 |
+
'measurements': measurements,
|
360 |
+
'target_point': target_point
|
361 |
+
}
|
362 |
+
|
363 |
+
# Call the forward method of the original model
|
364 |
+
# The output is already a tuple, which is what HF expects.
|
365 |
+
return self.interfuser_model.forward(inputs_dict)
|
366 |
+
|
367 |
+
# --- رسالة تأكيد ---
|
368 |
+
print("✅ Hugging Face wrapper classes (InterfuserConfig, InterfuserForHuggingFace) are now defined.")
|
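As a usage note (not part of the committed file), the wrapper can be built directly from the config and exported in Hugging Face format. A minimal sketch, assuming the InterFuser repository (which provides the vendored timm backbones imported at the top of the file) is on PYTHONPATH; "./interfuser-hf-export" is a placeholder output directory:

# Minimal sketch, not part of the commit.
# Assumes modeling_interfuser.py and its InterFuser/timm imports are available.
from modeling_interfuser import InterfuserConfig, InterfuserForHuggingFace

config = InterfuserConfig()               # defaults mirror the committed config.json
model = InterfuserForHuggingFace(config)  # wraps the original Interfuser module

print(f"parameters: {model.num_parameters():,}")
model.save_pretrained("./interfuser-hf-export")  # placeholder path; writes config.json and the weights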
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7900840868237b4916efd78f996eb338480b79985dc09a29e574d1e9b130da29
+size 212282626
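To load the uploaded weights back into the wrapper, a minimal sketch; "user/interfuser-hf" is a placeholder repo id, and it assumes the checkpoint was saved from InterfuserForHuggingFace so the state-dict keys line up:

# Minimal sketch, not part of the commit. "user/interfuser-hf" is a placeholder repo id.
from modeling_interfuser import InterfuserConfig, InterfuserForHuggingFace

config = InterfuserConfig.from_pretrained("user/interfuser-hf")
model = InterfuserForHuggingFace.from_pretrained("user/interfuser-hf", config=config)
model.eval()  # weights come from the pytorch_model.bin tracked by Git LFS above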