openbmb
/

MiniCPM-V-4_5

Image-Text-to-Text

feature-extraction

Model card Files Files and versions

tc-mb committed 2 days ago

Commit

d7bbb09

·

verified ·

1 Parent(s): a18bbd9

Update: README

Files changed (1) hide show

README.md +3 -6

README.md CHANGED Viewed

@@ -7,14 +7,11 @@ language:
 - multilingual
 tags:
 - minicpm-v
-- VLM
 - vision
 - ocr
-- document parsing
 - multi-image
 - video
 - custom_code
 ---
 <h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
@@ -289,7 +286,7 @@ for new_text in answer:
     print(new_text, flush=True, end='')
 # Second round chat, pass history context of multi-turn conversation
-msgs.append({"role": "assistant", "content": [answer]})
 msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
 answer = model.chat(
@@ -406,7 +403,7 @@ def encode_video(video_path, choose_fps=3, force_packing=None):
 video_path="video_test.mp4"
 fps = 5 # fps for video
-force_packing = None # You can set force_packing to ensure that 3D-Resampler packing is forcibly enabled; otherwise, encode_video will dynamically set the packing quantity based on the duration.
 frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
 question = "Describe the video"
@@ -418,7 +415,7 @@ msgs = [
 answer = model.chat(
     msgs=msgs,
     tokenizer=tokenizer,
-    use_image_id=False, # ensure use_image_id=False when video inference
     max_slice_nums=1,
     temporal_ids=frame_ts_id_group
 )

 - multilingual
 tags:
 - minicpm-v
 - vision
 - ocr
 - multi-image
 - video
 - custom_code
 ---
 <h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
     print(new_text, flush=True, end='')
 # Second round chat, pass history context of multi-turn conversation
+msgs.append({"role": "assistant", "content": [generated_text]})
 msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
 answer = model.chat(
 video_path="video_test.mp4"
 fps = 5 # fps for video
+force_packing = None # You can set force_packing to ensure that 3D packing is forcibly enabled; otherwise, encode_video will dynamically set the packing quantity based on the duration.
 frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
 question = "Describe the video"
 answer = model.chat(
     msgs=msgs,
     tokenizer=tokenizer,
+    use_image_id=False,
     max_slice_nums=1,
     temporal_ids=frame_ts_id_group
 )