Skip to content

Commit

Permalink
initial version of script
Browse files Browse the repository at this point in the history
  • Loading branch information
abi committed Mar 7, 2024
1 parent f2cb1a4 commit c2f230a
Show file tree
Hide file tree
Showing 4 changed files with 357 additions and 1 deletion.
4 changes: 4 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,7 @@ cython_debug/

# Temporary eval output
evals_data


# Temporary video evals (Remove before merge)
video_evals
47 changes: 46 additions & 1 deletion backend/llm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Awaitable, Callable, List, cast
from typing import Any, Awaitable, Callable, List, cast
from anthropic import AsyncAnthropic
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
Expand Down Expand Up @@ -46,6 +46,7 @@ async def stream_openai_response(
return full_response


# TODO: Have a separate function that translates OpenAI messages to Claude messages
async def stream_claude_response(
messages: List[ChatCompletionMessageParam],
api_key: str,
Expand Down Expand Up @@ -99,3 +100,47 @@ async def stream_claude_response(
# Return final message
response = await stream.get_final_message()
return response.content[0].text


async def stream_claude_response_native(
    system_prompt: str,
    messages: list[Any],
    api_key: str,
    callback: Callable[[str], Awaitable[None]],
    include_thinking: bool = False,
    model: str = MODEL_CLAUDE_OPUS,
) -> str:
    """Stream a response from the Anthropic API using native message format.

    Each streamed text chunk is forwarded to ``callback`` as it arrives.
    When ``include_thinking`` is set, an assistant turn containing the
    "<thinking>" prefix is appended so the model continues from it (the
    returned text therefore lacks that prefix — callers re-add it).
    Returns the text of the first content block of the final message.
    """
    client = AsyncAnthropic(api_key=api_key)

    # Optionally seed the assistant turn with a <thinking> prefix; copy the
    # list so the caller's message list is never mutated.
    request_messages = list(messages)
    if include_thinking:
        request_messages.append({"role": "assistant", "content": "<thinking>"})

    async with client.messages.stream(
        model=model,
        max_tokens=4096,
        temperature=0.0,
        system=system_prompt,
        messages=request_messages,  # type: ignore
    ) as stream:
        async for chunk in stream.text_stream:
            await callback(chunk)

        # Collect the fully-assembled final message once streaming ends.
        final = await stream.get_final_message()

    print(
        f"Token usage: Input Tokens: {final.usage.input_tokens}, Output Tokens: {final.usage.output_tokens}"
    )

    return final.content[0].text
81 changes: 81 additions & 0 deletions backend/prompts/claude_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,53 @@
# https://docs.anthropic.com/claude/docs/prompt-engineering
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb

# Prompt for rebuilding an app from video frames (HTML + jQuery + Tailwind stack).
VIDEO_PROMPT = """
You are an expert at building single page, functional apps using HTML, Jquery and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""

# Alpine.js variant of VIDEO_PROMPT (same instructions, different JS library).
VIDEO_PROMPT_ALPINE_JS = """
You are an expert at building single page, functional apps using HTML, Alpine.js and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/cdn.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""


HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
You take screenshots of a reference web page from the user, and then build single page apps
Expand Down Expand Up @@ -31,3 +78,37 @@
Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""

#

# Claude system prompt for the React/Tailwind stack (screenshot-to-code).
REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
You take screenshots of a reference web page from the user, and then build single page apps
using React and Tailwind CSS.
You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
update it to look more like the reference image(The first image).
- Make sure the app looks exactly like the screenshot.
- Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- In particular, pay attention to background color and overall color scheme.
- Use the exact text from the screenshot.
- Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
- Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
- CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items, and use loops to instantiate these components as needed.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
In terms of libraries,
- Use these scripts to include React so that it can run on a standalone page:
<script src="https://unpkg.com/react/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.js"></script>
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""
226 changes: 226 additions & 0 deletions backend/video_to_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# Load environment variables first
import base64
import shutil
from dotenv import load_dotenv

load_dotenv()

import time
import subprocess
import os
from typing import Union
import asyncio
from datetime import datetime
from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS
from utils import pprint_prompt
from config import ANTHROPIC_API_KEY
from llm import (
MODEL_CLAUDE_OPUS,
# MODEL_CLAUDE_SONNET,
stream_claude_response_native,
)

STACK = "html_tailwind"

VIDEO_DIR = "./video_evals/videos"
SCREENSHOTS_DIR = "./video_evals/screenshots"
OUTPUTS_DIR = "./video_evals/outputs"


async def main():
    """One-shot eval driver: video -> screenshots -> Claude -> HTML file.

    Splits a local video into frames, sends them to Claude with the video
    prompt, extracts the generated <html> block from the completion, and
    writes it to a timestamped file under OUTPUTS_DIR.
    """

    # Hard-coded run configuration for this eval script.
    video_filename = "mortgage-calculator.mov"
    screenshot_interval = 850  # ms between extracted frames
    is_followup = False  # True: refine the most recently generated output

    # Get previous HTML
    # For a follow-up pass, reload the newest generated .html so it can be
    # replayed as an assistant turn below.
    previous_html = ""
    if is_followup:
        previous_html_file = max(
            [
                os.path.join(OUTPUTS_DIR, f)
                for f in os.listdir(OUTPUTS_DIR)
                if f.endswith(".html")
            ],
            key=os.path.getctime,  # newest by creation time
        )
        print(previous_html_file)
        with open(previous_html_file, "r") as file:
            previous_html = file.read()

    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")

    # Create the SCREENSHOTS_DIR if it doesn't exist
    if not os.path.exists(SCREENSHOTS_DIR):
        os.makedirs(SCREENSHOTS_DIR)

    # Clear out the SCREENSHOTS_DIR before generating new screenshots
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best-effort cleanup: report and keep going.
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Split the video into screenshots
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )

    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]

    # Bail out rather than send an oversized image batch to the model.
    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return

    # Encode frames as data URLs in frame order (ffmpeg names them 1.jpg, 2.jpg, ...).
    input_image_urls: list[str] = []
    sorted_screenshots = sorted(screenshots, key=lambda x: int(x.split(".")[0]))
    for filename in sorted_screenshots:
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        input_image_urls.append(data_url)

    # Convert images to the message format for Claude
    # (split each data URL back into its media type + raw base64 payload).
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for url in input_image_urls:
        media_type = url.split(";")[0].split(":")[1]
        base64_data = url.split(",")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )

    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
        # {"role": "assistant", "content": SECOND_MESSAGE},
        # {"role": "user", "content": "continue"},
    ]

    # For follow-ups, replay the previous draft as an assistant turn and ask
    # the model to improve it.
    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore

    async def process_chunk(content: str):
        # Echo streamed tokens to the console as they arrive.
        print(content, end="", flush=True)

    # stream_claude_response_native(include_thinking=True) seeds the assistant
    # turn with "<thinking>", so the returned completion lacks that prefix.
    response_prefix = "<thinking>"

    pprint_prompt(prompt_messages)  # type: ignore

    start_time = time.time()

    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=lambda x: process_chunk(x),
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )

    end_time = time.time()

    # Prepend the response prefix to the completion
    completion = response_prefix + completion

    # Extract the outputs
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)

    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")

    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)

    # Write the HTML content to the file
    with open(output_path, "w") as file:
        file.write(html_content)

    # Show a notification
    # NOTE(review): macOS-only (osascript); errors on other platforms — confirm intended.
    subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])


# Pull a tagged section (e.g. <html>...</html>) out of the completion string.
def extract_tag_content(tag: str, text: str) -> str:
    """Return the first <tag>...</tag> span found in *text*, tags included.

    :param tag: Tag name to search for (without angle brackets).
    :param text: Text to search within.
    :return: The span including both tags, or "" if the pair is absent.
    """
    opening = f"<{tag}>"
    closing = f"</{tag}>"
    begin = text.find(opening)
    finish = text.find(closing, begin)
    if begin == -1 or finish == -1:
        return ""
    return text[begin : finish + len(closing)]


def split_video_into_screenshots(video_path: str, output_dir: str, interval: int):
    """Extract one frame every *interval* milliseconds from a video via ffmpeg.

    Frames are written as sequentially numbered JPEGs (1.jpg, 2.jpg, ...)
    into *output_dir*, which is created if missing. Requires ffmpeg on PATH.

    :param video_path: Path to the input video file.
    :param output_dir: Directory to receive the extracted frames.
    :param interval: Interval between frames, in milliseconds.
    :raises subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # fps=1/<seconds> -> one output frame per `interval` milliseconds.
    # check=True: fail loudly instead of silently producing no frames
    # (subprocess.call discarded the exit code).
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"fps=1/{interval/1000}",
            f"{output_dir}/%d.jpg",
        ],
        check=True,
    )


async def image_to_data_url(filepath: str) -> str:
    """Read an image file and return it as a base64 data URL.

    The media type is guessed from the file extension, falling back to
    image/jpeg (the format split_video_into_screenshots produces) when the
    extension is unrecognized — so existing .jpg callers are unchanged.
    """
    import mimetypes  # local import keeps the module's import block untouched

    media_type = mimetypes.guess_type(filepath)[0] or "image/jpeg"
    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"


# Guard the entry point so importing this module doesn't launch the eval run.
if __name__ == "__main__":
    asyncio.run(main())

0 comments on commit c2f230a

Please sign in to comment.