forked from abi/screenshot-to-code
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
357 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -154,3 +154,7 @@ cython_debug/ | |
|
||
# Temporary eval output | ||
evals_data | ||
|
||
|
||
# Temporary video evals (Remove before merge) | ||
video_evals |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,53 @@ | |
# https://docs.anthropic.com/claude/docs/prompt-engineering | ||
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb | ||
|
||
# System prompt for re-creating a web app from ordered video screenshots,
# targeting HTML + jQuery + Tailwind. The model is asked to reason inside
# <thinking></thinking> tags and to return the code inside <html></html> tags.
VIDEO_PROMPT = """
You are an expert at building single page, functional apps using HTML, Jquery and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||
# Alpine.js variant of the video prompt: same instructions, but the app is
# built with Alpine.js instead of jQuery.
VIDEO_PROMPT_ALPINE_JS = """
You are an expert at building single page, functional apps using HTML, Alpine.js and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""
|
||
|
||
HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """ | ||
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS. | ||
You take screenshots of a reference web page from the user, and then build single page apps | ||
|
@@ -31,3 +78,37 @@ | |
Return only the full code in <html></html> tags. | ||
Do not include markdown "```" or "```html" at the start or end. | ||
""" | ||
|
||
# | ||
|
||
# System prompt for building a React + Tailwind single page app from a
# reference screenshot (optionally with a second screenshot of a prior attempt).
# The model is told to return only the full code inside <html></html> tags.
REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
You take screenshots of a reference web page from the user, and then build single page apps
using React and Tailwind CSS.
You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
update it to look more like the reference image(The first image).
- Make sure the app looks exactly like the screenshot.
- Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- In particular, pay attention to background color and overall color scheme.
- Use the exact text from the screenshot.
- Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
- Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
- CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items, and use loops to instantiate these components as needed.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
In terms of libraries,
- Use these scripts to include React so that it can run on a standalone page:
<script src="https://unpkg.com/react/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.js"></script>
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
# Load environment variables first | ||
import base64 | ||
import shutil | ||
from dotenv import load_dotenv | ||
|
||
load_dotenv() | ||
|
||
import time | ||
import subprocess | ||
import os | ||
from typing import Union | ||
import asyncio | ||
from datetime import datetime | ||
from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS | ||
from utils import pprint_prompt | ||
from config import ANTHROPIC_API_KEY | ||
from llm import ( | ||
MODEL_CLAUDE_OPUS, | ||
# MODEL_CLAUDE_SONNET, | ||
stream_claude_response_native, | ||
) | ||
|
||
# Label of the target stack for this eval.
# NOTE(review): not referenced in the visible part of this script - confirm it is still needed.
STACK = "html_tailwind"

# Working directories, relative to the current working directory.
VIDEO_DIR = "./video_evals/videos"  # input recordings
SCREENSHOTS_DIR = "./video_evals/screenshots"  # extracted frames; emptied at the start of every run
OUTPUTS_DIR = "./video_evals/outputs"  # generated HTML files
|
||
|
||
async def main():
    """
    Runs one video-to-code evaluation end to end.

    In order: optionally loads the most recently created HTML output
    (follow-up mode), empties SCREENSHOTS_DIR and refills it with frames
    extracted from the configured video, sends the frames to Claude with
    VIDEO_PROMPT as the system prompt, and writes the extracted <html> section
    to a timestamped file in OUTPUTS_DIR.

    :raises ValueError: If ANTHROPIC_API_KEY is not set, or if follow-up mode
        is enabled but OUTPUTS_DIR contains no previous .html output.
    """
    # Per-run settings; edit these to evaluate a different recording.
    video_filename = "mortgage-calculator.mov"
    screenshot_interval = 850  # milliseconds between extracted frames
    is_followup = False

    # Get previous HTML
    previous_html = ""
    if is_followup:
        previous_outputs = [
            os.path.join(OUTPUTS_DIR, f)
            for f in os.listdir(OUTPUTS_DIR)
            if f.endswith(".html")
        ]
        # max() on an empty list raises an opaque ValueError; say what is missing.
        if not previous_outputs:
            raise ValueError(f"No previous .html output found in {OUTPUTS_DIR}")
        previous_html_file = max(previous_outputs, key=os.path.getctime)
        print(previous_html_file)
        with open(previous_html_file, "r", encoding="utf-8") as file:
            previous_html = file.read()

    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")

    # Create the SCREENSHOTS_DIR if it doesn't exist
    os.makedirs(SCREENSHOTS_DIR, exist_ok=True)

    # Clear out the SCREENSHOTS_DIR before generating new screenshots, so
    # frames from an earlier video are never sent along with the new ones.
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report and carry on with the remaining entries.
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Split the video into screenshots
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )

    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]

    # Nothing to send: avoid an API call with an empty user message.
    if not screenshots:
        print("No screenshots were extracted from the video")
        return

    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return

    # Convert images to the message format for Claude. Frames are named
    # "<number>.jpg", so sort numerically to keep them in video order
    # (a plain string sort would put "10.jpg" before "2.jpg").
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for filename in sorted(screenshots, key=lambda x: int(x.split(".")[0])):
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        # Data URL layout: "data:<media type>;base64,<payload>"
        header, _, base64_data = data_url.partition(",")
        media_type = header.split(";")[0].split(":")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )

    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
    ]

    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore

    async def process_chunk(content: str) -> None:
        # Echo streamed text as it arrives.
        print(content, end="", flush=True)

    response_prefix = "<thinking>"

    pprint_prompt(prompt_messages)  # type: ignore

    start_time = time.time()

    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=process_chunk,
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )

    end_time = time.time()

    # Prepend the response prefix to the completion.
    # NOTE(review): presumably the returned completion lacks the opening
    # <thinking> tag - confirm against stream_claude_response_native.
    completion = response_prefix + completion

    # Extract the outputs
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)

    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")

    if not html_content:
        print("Warning: no complete <html>...</html> section found in the response")

    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)

    # Write the HTML content to the file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(html_content)

    # Show a notification. osascript only exists on macOS; skip it elsewhere
    # instead of crashing after the output has already been written.
    if shutil.which("osascript"):
        subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])
|
||
|
||
# Extract a tagged section (e.g. <html>...</html>) from the completion string
def extract_tag_content(tag: str, text: str) -> str:
    """
    Extracts the first ``<tag>...</tag>`` section from the provided text.

    :param tag: The tag name to search for, without angle brackets.
    :param text: The text to search within.
    :return: The matched section, INCLUDING its opening and closing tags, or
        an empty string if the opening tag or a closing tag after it is missing.
    """
    tag_start = f"<{tag}>"
    tag_end = f"</{tag}>"

    start_idx = text.find(tag_start)
    if start_idx == -1:
        return ""

    # Only a closing tag that comes after the opening tag counts.
    end_idx = text.find(tag_end, start_idx + len(tag_start))
    if end_idx == -1:
        return ""

    return text[start_idx : end_idx + len(tag_end)]
|
||
|
||
def split_video_into_screenshots(video_path: str, output_dir: str, interval: int):
    """
    Extracts one JPEG frame every ``interval`` milliseconds from a video by
    running ffmpeg with an ``fps`` filter.

    :param video_path: Path of the video file passed to ffmpeg as input.
    :param output_dir: Directory that receives the frames (created if missing).
        Files are named by ffmpeg's ``%d.jpg`` pattern.
    :param interval: Time between frames, in milliseconds. Must be positive.
    :raises ValueError: If ``interval`` is not positive.
    :raises subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Reject bad intervals up front; otherwise the fps expression below
    # divides by zero or asks ffmpeg for a negative frame rate.
    if interval <= 0:
        raise ValueError(
            f"interval must be a positive number of milliseconds, got {interval}"
        )

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Run the ffmpeg command to extract screenshots. fps=1/<seconds> yields
    # one frame per <seconds>; check=True surfaces ffmpeg failures instead of
    # silently continuing with no frames.
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"fps=1/{interval / 1000}",
            os.path.join(output_dir, "%d.jpg"),
        ],
        check=True,
    )
|
||
|
||
# The media type is guessed from the file extension; JPEG is the fallback.
async def image_to_data_url(filepath: str) -> str:
    """
    Reads an image file and returns it as a base64 ``data:`` URL.

    :param filepath: Path of the image file to encode.
    :return: A string of the form ``data:<media type>;base64,<payload>``. The
        media type is guessed from the file extension and falls back to
        ``image/jpeg`` when the guess fails or is not an image type.
    """
    import mimetypes  # local import: only this helper needs it

    media_type, _ = mimetypes.guess_type(filepath)
    if media_type is None or not media_type.startswith("image/"):
        media_type = "image/jpeg"

    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"
|
||
|
||
# Only start an eval run when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())