release 2024-03-06
vikhyat committed Mar 6, 2024
1 parent 53fbcbf commit f9092f9
Showing 5 changed files with 31 additions and 16 deletions.
28 changes: 24 additions & 4 deletions README.md
@@ -11,14 +11,14 @@ moondream2 is a 1.86B parameter model initialized with weights from [SigLIP](htt
| Model | VQAv2 | GQA | TextVQA | POPE | TallyQA |
| --- | --- | --- | --- | --- | --- |
| moondream1 | 74.7 | 57.9 | 35.6 | - | - |
- | **moondream2** (latest) | 74.2 | 58.5 | 36.4 | (coming soon) | (coming soon) |
+ | **moondream2** (latest) | 75.4 | 59.8 | 43.1 | (coming soon) | (coming soon) |

## Examples

| Image | Example |
| --- | --- |
- | ![](assets/demo-1.jpg) | **What is the girl doing?**<br>The girl is sitting at a table, eating a burger.<br><br>**What color is the girl's hair?**<br>White |
- | ![](assets/demo-2.jpg) | **What is this?**<br>A metal stand is positioned in the center of the image, with CPUs and wires visible. The background features a wall, and a black object is situated in the top left corner.<br><br>**What is behind the stand?**<br>A wall made of red bricks is visible behind the stand, which holds several electronic devices and wires. |
+ | ![](assets/demo-1.jpg) | **What is the girl doing?**<br>The girl is eating a hamburger.<br><br>**What color is the girl's hair?**<br>White |
+ | ![](assets/demo-2.jpg) | **What is this?**<br>A rack is present in the image, containing various electronic devices. A chair is situated on the left side, and a brick wall is visible in the background.<br><br>**What is behind the stand?**<br>A brick wall is visible behind the stand. |

## Usage

Expand All @@ -33,7 +33,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

model_id = "vikhyatk/moondream2"
- revision = "2024-03-05"
+ revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
)
Expand All @@ -47,6 +47,26 @@ print(model.answer_question(enc_image, "Describe this image.", tokenizer))
The model is updated regularly, so we recommend pinning the model version to a
specific release as shown above.

To enable Flash Attention on the text model, pass in `attn_implementation="flash_attention_2"`
when instantiating the model.

```python
import torch  # needed for torch.float16 below

model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision,
    torch_dtype=torch.float16, attn_implementation="flash_attention_2"
).to("cuda")
```

Batch inference is also supported.

```python
answers = model.batch_answer(
    images=[Image.open('<IMAGE_PATH_1>'), Image.open('<IMAGE_PATH_2>')],
    prompts=["Describe this image.", "Are there people in this image?"],
    tokenizer=tokenizer,
)
```

**Using this repository**

Clone this repository and install dependencies.
7 changes: 3 additions & 4 deletions gradio_demo.py
@@ -2,7 +2,7 @@
import torch
import re
import gradio as gr
- from moondream import detect_device
+ from moondream import detect_device, LATEST_REVISION
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

Expand All @@ -21,10 +21,9 @@
print()

model_id = "vikhyatk/moondream2"
- revision = "2024-03-05"
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained(model_id, revision=LATEST_REVISION)
moondream = AutoModelForCausalLM.from_pretrained(
- model_id, trust_remote_code=True, revision=revision
+ model_id, trust_remote_code=True, revision=LATEST_REVISION
).to(device=device, dtype=dtype)
moondream.eval()

3 changes: 0 additions & 3 deletions moondream/configuration_moondream.py
@@ -1,8 +1,5 @@
from transformers import PretrainedConfig

- from typing import Optional
- import math


class PhiConfig(PretrainedConfig):
model_type = "phi"
2 changes: 1 addition & 1 deletion moondream/util.py
@@ -1,6 +1,6 @@
import torch

- LATEST_REVISION = "2024-03-05"
+ LATEST_REVISION = "2024-03-06"

def detect_device():
"""
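The diff truncates `detect_device` before its body. For context, a minimal sketch of what such a helper typically does — pick the best available backend and a matching dtype, as the demos' `.to(device=device, dtype=dtype)` calls suggest. This is an assumption for illustration, not the repository's actual implementation:

```python
import torch

def detect_device():
    """Return a (device, dtype) pair for the best available backend (a sketch)."""
    if torch.cuda.is_available():
        # Half precision is well supported on CUDA GPUs.
        return torch.device("cuda"), torch.float16
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        # Apple Silicon via the MPS backend.
        return torch.device("mps"), torch.float16
    # CPU fallback; float32 avoids slow or unsupported half-precision ops.
    return torch.device("cpu"), torch.float32
```

A helper shaped like this would let the demos write `device, dtype = detect_device()` before moving the model with `.to(device=device, dtype=dtype)`.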
7 changes: 3 additions & 4 deletions webcam_gradio_demo.py
@@ -3,7 +3,7 @@
import re
import time
import gradio as gr
- from moondream import detect_device
+ from moondream import detect_device, LATEST_REVISION
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

Expand All @@ -22,10 +22,9 @@
print()

model_id = "vikhyatk/moondream2"
- revision = "2024-03-05"
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained(model_id, revision=LATEST_REVISION)
moondream = AutoModelForCausalLM.from_pretrained(
- model_id, trust_remote_code=True, revision=revision
+ model_id, trust_remote_code=True, revision=LATEST_REVISION
).to(device=device, dtype=dtype)
moondream.eval()

