In [None]:
#!git clone https://huggingface.co/spaces/depth-anything/Depth-Anything-V2
#!pip install -r Depth-Anything-V2/requirements.txt
#!pip install -q --upgrade coremltools
#!cp ./patch_dinov2.diff Depth-Anything-V2/
#!cd Depth-Anything-V2 && git apply patch_dinov2.diff
#!cd ..

In [2]:
import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

In [3]:
import torch
import coremltools as ct
import numpy as np
from PIL import Image
import tempfile
from huggingface_hub import hf_hub_download
import sys
sys.path.append('./Depth-Anything-V2')



scikit-learn version 1.6.0 is not supported. Minimum required version: 0.17. Maximum required version: 1.5.1. Disabling scikit-learn conversion API.


In [4]:
from depth_anything_v2.dpt import DepthAnythingV2
from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet

import torch.nn.functional as F

xFormers not available
xFormers not available


# 1. Load Depth-Anything-V2's vitl checkpoint

In [5]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}
encoder2name = {
    'vits': 'Small',
    'vitb': 'Base',
    'vitl': 'Large',
    'vitg': 'Giant', # we are undergoing company review procedures to release our giant model checkpoint
}
encoder = 'vits'
model_name = encoder2name[encoder]
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(repo_id=f"depth-anything/Depth-Anything-V2-{model_name}", filename=f"depth_anything_v2_{encoder}.pth", repo_type="model")
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()

In [6]:
image = Image.open("./sample_images/IMG_4061.jpeg")
img = np.array(image)
print(img.shape)
h, w = img.shape[:2]
depth = model.infer_image(img)
depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
depth = depth.astype(np.uint8)
depth_image = Image.fromarray(depth)
depth_image.save(f"depth_image_{model_name}_1.jpg")

(3024, 4032, 3)


In [7]:
original_image = Image.open("./sample_images/IMG_4061.jpeg")
origina_img = np.array(original_image)
print(origina_img.shape)
original_h, original_w = origina_img.shape[:2]
# Resize the image to the input size, width must be 518 and height must be divisible by 14
input_size_w = 518
#input_size_h = 392 #To have this work, you need to patch dinov2.py 
input_size_h = 518
image = original_image.resize((input_size_w,input_size_h), Image.Resampling.BILINEAR)
img = np.array(image)
input_image, (h, w) = model.image2tensor(img, input_size_h)
input_image = input_image.to(DEVICE)
with torch.no_grad():
    depth = model(input_image)
    depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.cpu().numpy().astype(np.uint8)
depth_image = Image.fromarray(depth).resize((original_w,original_h), Image.Resampling.BILINEAR)
depth_image.save(f"depth_image_{model_name}_2.jpg")

traced_model = torch.jit.trace(model, input_image)


(3024, 4032, 3)


  assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
  assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
  if npatch == N and w == h:
  out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)


In [8]:
example_output = traced_model(input_image)
print("Traced PyTorch ImageEncoder ckpt out for jpg:\n>>>", example_output[0, :10])

Traced PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([[0.0157, 0.0149, 0.0080,  ..., 0.0410, 0.0407, 0.0510],
        [0.0043, 0.0084, 0.0000,  ..., 0.0359, 0.0472, 0.0514],
        [0.0027, 0.0058, 0.0000,  ..., 0.0333, 0.0354, 0.0526],
        ...,
        [0.0135, 0.0170, 0.0090,  ..., 0.0534, 0.0506, 0.0532],
        [0.0157, 0.0203, 0.0122,  ..., 0.0559, 0.0546, 0.0420],
        [0.0191, 0.0238, 0.0168,  ..., 0.0588, 0.0576, 0.0648]],
       device='mps:0', grad_fn=<SliceBackward0>)


You can see that there is some loss in precision, but it is still acceptable.

# 2. Export ImageEncoder

In [9]:
image_means = [0.485, 0.456, 0.406]
image_stds = [0.229, 0.224, 0.225]

In [10]:
import torchvision.transforms as transforms

class Wrapper(torch.nn.Module):    
    def __init__(self, model):
        super().__init__()
        _means = image_means
        _stds = image_stds
        self.model = model   
        self.stds = torch.tensor(_stds).half()[:,None,None]
        self.means = torch.tensor(_means).half()[:,None,None]

    transform_model = torch.nn.Sequential(
        transforms.Normalize(mean=image_means, std=image_stds)
                             )

    def forward(self, input):        
        input = input/255.0
        intput = self.transform_model(input)
        output = self.model(input)
        output = (output - output.min()) / (output.max() - output.min())        
        # Fix "Image output, 'depthOutput', must have rank 4. Instead it has rank 3"
        output = output.unsqueeze(0)
        # Fix "Shape of the RGB/BGR image output, 'depthOutput', must be of kind (1, 3, H, W), i.e., first two dimensions must be (1, 3), instead they are: (1, 1)"ArithmeticError
        output = output.repeat(1, 3, 1, 1)
        output = output * 255.0
        return output

# Instantiate the Wrapper model passing the original PyTorch FCN model
wrapped_model = Wrapper(traced_model)

In [11]:
i = np.asarray(original_image.resize((input_size_w, input_size_h)))
i = i.astype("float32")
i = np.transpose(i, (2, 0, 1))
i = np.expand_dims(i, 0)
i = torch.from_numpy(i).to(DEVICE)

with torch.no_grad():
    out = wrapped_model(i)

print("wrapped PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0, :10])

traced_model_w = torch.jit.trace(wrapped_model, i)

with torch.no_grad():
    out = traced_model_w(i)

print("Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n>>>", out[0, :10])

wrapped PyTorch ImageEncoder ckpt out for jpg:
>>> tensor([[[  1.0442,   1.0795,   1.0259,  ...,   2.5866,   2.6540,   2.5864],
         [  0.9688,   1.2331,   1.0579,  ...,   2.8632,   2.9795,   2.7485],
         [  0.9795,   1.2034,   0.9449,  ...,   2.9342,   2.9196,   2.8207],
         ...,
         [100.1750, 100.6220, 100.7177,  ...,  97.1819,  96.7440,  97.0862],
         [100.6218, 100.7040, 100.8275,  ...,  97.2966,  97.6106,  97.7243],
         [ 99.4266, 100.6614, 100.1300,  ...,  97.4383,  98.1441,  98.3714]],

        [[  1.0442,   1.0795,   1.0259,  ...,   2.5866,   2.6540,   2.5864],
         [  0.9688,   1.2331,   1.0579,  ...,   2.8632,   2.9795,   2.7485],
         [  0.9795,   1.2034,   0.9449,  ...,   2.9342,   2.9196,   2.8207],
         ...,
         [100.1750, 100.6220, 100.7177,  ...,  97.1819,  96.7440,  97.0862],
         [100.6218, 100.7040, 100.8275,  ...,  97.2966,  97.6106,  97.7243],
         [ 99.4266, 100.6614, 100.1300,  ...,  97.4383,  98.1441,  98.37

In [12]:
i.shape, out.shape

(torch.Size([1, 3, 518, 518]), torch.Size([1, 3, 518, 518]))

In [13]:
tmp = out.cpu().numpy()

print(tmp.shape, tmp.max(), tmp.min(), tmp.mean())
# Convert to 3, 256, 256
tmp = np.transpose(tmp, (0, 2, 3, 1)).astype(np.uint8)
tmp = tmp.squeeze()
print(tmp.shape, tmp.max(), tmp.min(), tmp.mean())
Image.fromarray(tmp)
tmp_image = Image.fromarray(tmp).resize((original_w,original_h))
tmp_image.save(f"depth_image_{model_name}_3.png")

(1, 3, 518, 518) 255.0 0.0 101.90155
(518, 518, 3) 255 0 101.40160403094767


In [14]:
i.shape

torch.Size([1, 3, 518, 518])

In [15]:
traced_model_w.eval()
image_input = ct.ImageType(name="colorImage", shape=i.shape)
image_encoder_model = ct.converters.convert(
    traced_model_w,
    convert_to="mlprogram",
    inputs=[image_input],
    outputs=[ct.ImageType(name="depthOutput")],
    minimum_deployment_target=ct.target.iOS16,
)
image_encoder_model.save(f"DepthAnything_v2_{model_name}_{input_size_w}x{input_size_h}_Box.mlpackage")

Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 779/780 [00:00<00:00, 7178.40 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 150.72 passes/s]
Running MIL default pipeline: 100%|██████████| 89/89 [00:01<00:00, 64.35 passes/s] 
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 165.76 passes/s]
