base_model:
  - Qwen/Qwen2.5-3B-Instruct
  - microsoft/Florence-2-large
pipeline_tag: image-to-text
---

# Vision-Language-Vision Auto-Encoder
**Scalable Knowledge Distillation from Diffusion Models**

## Official Checkpoint · VLV Captioner (Qwen 2.5 3B)

This repository hosts the 3-billion-parameter **Vision-Language-Vision (VLV) Captioner**, built on top of Qwen 2.5 3B and supervised by knowledge distilled from diffusion models.
Checkpoint URL: **<https://huggingface.co/lambertxiao/Vision-Language-Vision-Captioner-Qwen2.5-3B>**
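
The weights are downloaded automatically the first time `from_pretrained` runs (see section 2). If you prefer to cache them ahead of time, here is a minimal sketch using `huggingface_hub`; only the repo id above comes from this card, the rest is illustrative:

```python
# optional: pre-download the checkpoint via huggingface_hub (pip install huggingface_hub)
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="lambertxiao/Vision-Language-Vision-Captioner-Qwen2.5-3B")
print(local_dir)  # local cache path; can also be passed to AutoModel.from_pretrained()
```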

---

## 1 · Install Dependencies

```bash
# inside your virtualenv / conda env
pip install -r requirements.txt
```

## 2 · Example Usage

```python
from transformers import AutoModel
from PIL import Image
import torch, numpy as np

MODEL_NAME = "lambertxiao/Vision-Language-Vision-Captioner-Qwen2.5-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"

# ────── load model ──────
model = (
    AutoModel.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        low_cpu_mem_usage=False,
    )
    .to(device)
    .eval()
)

# ────── helpers ──────
def _trim_tail(text: str) -> str:
    """Remove an incomplete trailing sentence fragment, if any."""
    sentences = [s.strip() for s in text.split(".") if s.strip()]
    if not text.rstrip().endswith("."):
        sentences = sentences[:-1]  # drop dangling fragment
    return ". ".join(sentences) + ("." if sentences else "")

def caption_image(img: Image.Image, max_len: int = 77) -> str:
    """Generate a caption for one PIL image."""
    with torch.no_grad():
        raw = model([img], max_len).generated_text[0]
    return _trim_tail(raw)

def caption_from_numpy(arr: np.ndarray, max_len: int = 77) -> str:
    """
    Wrapper for NumPy arrays.
    Accepts uint8 [0, 255] or float [0, 1] ranges.
    """
    if arr.dtype != np.uint8:
        arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)
    return caption_image(Image.fromarray(arr, mode="RGB"), max_len)
```
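
The helpers above caption one image at a time. Below is a small sketch of looping over a local folder with `caption_image`; the directory path, glob pattern, and output file are placeholders, not part of the released code:

```python
# hypothetical batch loop over a folder of .jpg files, reusing caption_image() from above
from pathlib import Path

image_dir = Path("./images")       # placeholder input directory
out_path = Path("captions.tsv")    # placeholder output file

with out_path.open("w", encoding="utf-8") as f:
    for path in sorted(image_dir.glob("*.jpg")):
        img = Image.open(path).convert("RGB")  # PIL.Image already imported above
        caption = caption_image(img)
        f.write(f"{path.name}\t{caption}\n")
        print(path.name, "->", caption)
```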

## 3 · Quick Test

```python
# caption a remote sample image (cat photo) in one cell

import io, requests
from PIL import Image
from IPython.display import display  # Jupyter/Colab only

IMG_URL = "https://huggingface.co/datasets/huggingface/cats-image/resolve/main/cats_image.jpeg"

# download & open
img = Image.open(io.BytesIO(requests.get(IMG_URL, timeout=10).content)).convert("RGB")

display(img)               # show the image
print(caption_image(img))  # generate and print the caption
```
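
The `caption_from_numpy` helper from section 2 is not exercised above; as a quick sanity check, here is a sketch that reuses the downloaded cat image as a NumPy array (both uint8 and float inputs are accepted):

```python
# feed the same image to caption_from_numpy() as uint8 and as float in [0, 1]
import numpy as np

arr_uint8 = np.asarray(img)                       # uint8, values in [0, 255]
arr_float = arr_uint8.astype(np.float32) / 255.0  # float32, values in [0, 1]

print(caption_from_numpy(arr_uint8))
print(caption_from_numpy(arr_float))
```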

## 4 · Citation

```bibtex
@article{zhang2025vision,
  title   = {Vision-Language-Vision Auto-Encoder: Scalable Knowledge Distillation from Diffusion Models},
  author  = {Zhang, Tiezheng and Li, Yitong and Chou, Yu-Cheng and Chen, Jieneng and Yuille, Alan and Wei, Chen and Xiao, Junfei},
  journal = {arXiv preprint arXiv:2507.07104},
  year    = {2025}
}
```