tc-mb committed on
Commit
d7bbb09
·
verified ·
1 Parent(s): a18bbd9

Update: README

Browse files
Files changed (1) hide show
  1. README.md +3 -6
README.md CHANGED
@@ -7,14 +7,11 @@ language:
7
  - multilingual
8
  tags:
9
  - minicpm-v
10
- - VLM
11
  - vision
12
  - ocr
13
- - document parsing
14
  - multi-image
15
  - video
16
  - custom_code
17
-
18
  ---
19
 
20
  <h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
@@ -289,7 +286,7 @@ for new_text in answer:
289
  print(new_text, flush=True, end='')
290
 
291
  # Second round chat, pass history context of multi-turn conversation
292
- msgs.append({"role": "assistant", "content": [answer]})
293
  msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
294
 
295
  answer = model.chat(
@@ -406,7 +403,7 @@ def encode_video(video_path, choose_fps=3, force_packing=None):
406
 
407
  video_path="video_test.mp4"
408
  fps = 5 # fps for video
409
- force_packing = None # You can set force_packing to ensure that 3D-Resampler packing is forcibly enabled; otherwise, encode_video will dynamically set the packing quantity based on the duration.
410
  frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
411
 
412
  question = "Describe the video"
@@ -418,7 +415,7 @@ msgs = [
418
  answer = model.chat(
419
  msgs=msgs,
420
  tokenizer=tokenizer,
421
- use_image_id=False, # ensure use_image_id=False when video inference
422
  max_slice_nums=1,
423
  temporal_ids=frame_ts_id_group
424
  )
 
7
  - multilingual
8
  tags:
9
  - minicpm-v
 
10
  - vision
11
  - ocr
 
12
  - multi-image
13
  - video
14
  - custom_code
 
15
  ---
16
 
17
  <h1>A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone</h1>
 
286
  print(new_text, flush=True, end='')
287
 
288
  # Second round chat, pass history context of multi-turn conversation
289
+ msgs.append({"role": "assistant", "content": [generated_text]})
290
  msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
291
 
292
  answer = model.chat(
 
403
 
404
  video_path="video_test.mp4"
405
  fps = 5 # fps for video
406
+ force_packing = None # You can set force_packing to ensure that 3D packing is forcibly enabled; otherwise, encode_video will dynamically set the packing quantity based on the duration.
407
  frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)
408
 
409
  question = "Describe the video"
 
415
  answer = model.chat(
416
  msgs=msgs,
417
  tokenizer=tokenizer,
418
+ use_image_id=False,
419
  max_slice_nums=1,
420
  temporal_ids=frame_ts_id_group
421
  )