Skip to content

Commit

Permalink
initial version of script
Browse files Browse the repository at this point in the history
  • Loading branch information
abi committed Mar 7, 2024
1 parent f2cb1a4 commit c2f230a
Show file tree
Hide file tree
Showing 4 changed files with 357 additions and 1 deletion.
4 changes: 4 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,7 @@ cython_debug/

# Temporary eval output
evals_data


# Temporary video evals (Remove before merge)
video_evals
47 changes: 46 additions & 1 deletion backend/llm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Awaitable, Callable, List, cast
from typing import Any, Awaitable, Callable, List, cast
from anthropic import AsyncAnthropic
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionChunk
Expand Down Expand Up @@ -46,6 +46,7 @@ async def stream_openai_response(
return full_response


# TODO: Have a separate function that translates OpenAI messages to Claude messages
async def stream_claude_response(
messages: List[ChatCompletionMessageParam],
api_key: str,
Expand Down Expand Up @@ -99,3 +100,47 @@ async def stream_claude_response(
# Return final message
response = await stream.get_final_message()
return response.content[0].text


async def stream_claude_response_native(
    system_prompt: str,
    messages: list[Any],
    api_key: str,
    callback: Callable[[str], Awaitable[None]],
    include_thinking: bool = False,
    model: str = MODEL_CLAUDE_OPUS,
) -> str:
    """Stream a response from the Anthropic API using native message format.

    Each streamed text chunk is forwarded to ``callback`` as it arrives.
    When ``include_thinking`` is set, an assistant turn containing the
    "<thinking>" prefix is appended so the model continues from it (the
    returned text therefore lacks that prefix — callers re-add it).
    Returns the text of the first content block of the final message.
    """
    client = AsyncAnthropic(api_key=api_key)

    # Optionally seed the assistant turn with a <thinking> prefix; copy the
    # list so the caller's message list is never mutated.
    request_messages = list(messages)
    if include_thinking:
        request_messages.append({"role": "assistant", "content": "<thinking>"})

    async with client.messages.stream(
        model=model,
        max_tokens=4096,
        temperature=0.0,
        system=system_prompt,
        messages=request_messages,  # type: ignore
    ) as stream:
        async for chunk in stream.text_stream:
            await callback(chunk)

        # Collect the fully-assembled final message once streaming ends.
        final = await stream.get_final_message()

    print(
        f"Token usage: Input Tokens: {final.usage.input_tokens}, Output Tokens: {final.usage.output_tokens}"
    )

    return final.content[0].text
81 changes: 81 additions & 0 deletions backend/prompts/claude_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,53 @@
# https://docs.anthropic.com/claude/docs/prompt-engineering
# https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb

# Prompt for rebuilding an app from video frames (HTML + jQuery + Tailwind stack).
VIDEO_PROMPT = """
You are an expert at building single page, functional apps using HTML, Jquery and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use jQuery: <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""

# Alpine.js variant of VIDEO_PROMPT (same instructions, different JS library).
VIDEO_PROMPT_ALPINE_JS = """
You are an expert at building single page, functional apps using HTML, Alpine.js and Tailwind CSS.
You also have perfect vision and pay great attention to detail.
You will be given screenshots in order at consistent intervals from a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build.
- Make sure the app looks exactly like the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
- If some functionality requires a backend call, just mock the data instead.
- MAKE THE APP FUNCTIONAL using Javascript. Allow the user to interact with the app and get the same behavior as the video.
In terms of libraries,
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
- Use Alpine.js: <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/cdn.min.js"></script>
Before generating the code for the app, think step-by-step: first, about the user flow depicted in the video and then about how you would build it and how you would structure the code. Do the thinking within <thinking></thinking> tags. Then, provide your code within <html></html> tags.
"""


HTML_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using Tailwind, HTML and JS.
You take screenshots of a reference web page from the user, and then build single page apps
Expand Down Expand Up @@ -31,3 +78,37 @@
Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""

#

# Claude system prompt for the React/Tailwind stack (screenshot-to-code).
REACT_TAILWIND_CLAUDE_SYSTEM_PROMPT = """
You have perfect vision and pay great attention to detail which makes you an expert at building single page apps using React/Tailwind.
You take screenshots of a reference web page from the user, and then build single page apps
using React and Tailwind CSS.
You might also be given a screenshot (The second image) of a web page that you have already built, and asked to
update it to look more like the reference image(The first image).
- Make sure the app looks exactly like the screenshot.
- Do not leave out smaller UI elements. Make sure to include every single thing in the screenshot.
- Pay close attention to background color, text color, font size, font family,
padding, margin, border, etc. Match the colors and sizes exactly.
- In particular, pay attention to background color and overall color scheme.
- Use the exact text from the screenshot.
- Do not add comments in the code such as "<!-- Add other navigation links as needed -->" and "<!-- ... other news items ... -->" in place of writing the full code. WRITE THE FULL CODE.
- Make sure to always get the layout right (if things are arranged in a row in the screenshot, they should be in a row in the app as well)
- CREATE REUSABLE COMPONENTS FOR REPEATING ELEMENTS. For example, if there are 15 similar items in the screenshot, your code should include a reusable component that generates these items, and use loops to instantiate these components as needed.
- For images, use placeholder images from https://placehold.co and include a detailed description of the image in the alt text so that an image generation AI can generate the image later.
In terms of libraries,
- Use these scripts to include React so that it can run on a standalone page:
<script src="https://unpkg.com/react/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.js"></script>
- Use this script to include Tailwind: <script src="https://cdn.tailwindcss.com"></script>
- You can use Google Fonts
- Font Awesome for icons: <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"></link>
Return only the full code in <html></html> tags.
Do not include markdown "```" or "```html" at the start or end.
"""
226 changes: 226 additions & 0 deletions backend/video_to_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# Load environment variables first
import base64
import shutil
from dotenv import load_dotenv

load_dotenv()

import time
import subprocess
import os
from typing import Union
import asyncio
from datetime import datetime
from prompts.claude_prompts import VIDEO_PROMPT, VIDEO_PROMPT_ALPINE_JS
from utils import pprint_prompt
from config import ANTHROPIC_API_KEY
from llm import (
MODEL_CLAUDE_OPUS,
# MODEL_CLAUDE_SONNET,
stream_claude_response_native,
)

STACK = "html_tailwind"

VIDEO_DIR = "./video_evals/videos"
SCREENSHOTS_DIR = "./video_evals/screenshots"
OUTPUTS_DIR = "./video_evals/outputs"


async def main():
    """One-shot eval driver: video -> screenshots -> Claude -> HTML file.

    Splits a local video into frames, sends them to Claude with the video
    prompt, extracts the generated <html> block from the completion, and
    writes it to a timestamped file under OUTPUTS_DIR.
    """

    # Hard-coded run configuration for this eval script.
    video_filename = "mortgage-calculator.mov"
    screenshot_interval = 850  # ms between extracted frames
    is_followup = False  # True: refine the most recently generated output

    # Get previous HTML
    # For a follow-up pass, reload the newest generated .html so it can be
    # replayed as an assistant turn below.
    previous_html = ""
    if is_followup:
        previous_html_file = max(
            [
                os.path.join(OUTPUTS_DIR, f)
                for f in os.listdir(OUTPUTS_DIR)
                if f.endswith(".html")
            ],
            key=os.path.getctime,  # newest by creation time
        )
        print(previous_html_file)
        with open(previous_html_file, "r") as file:
            previous_html = file.read()

    if not ANTHROPIC_API_KEY:
        raise ValueError("ANTHROPIC_API_KEY is not set")

    # Create the SCREENSHOTS_DIR if it doesn't exist
    if not os.path.exists(SCREENSHOTS_DIR):
        os.makedirs(SCREENSHOTS_DIR)

    # Clear out the SCREENSHOTS_DIR before generating new screenshots
    for filename in os.listdir(SCREENSHOTS_DIR):
        file_path = os.path.join(SCREENSHOTS_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best-effort cleanup: report and keep going.
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Split the video into screenshots
    split_video_into_screenshots(
        os.path.join(VIDEO_DIR, video_filename), SCREENSHOTS_DIR, screenshot_interval
    )

    # Get all the screenshots in the directory
    screenshots = [f for f in os.listdir(SCREENSHOTS_DIR) if f.endswith(".jpg")]

    # Bail out rather than send an oversized image batch to the model.
    if len(screenshots) > 20:
        print(f"Too many screenshots: {len(screenshots)}")
        return

    # Encode frames as data URLs in frame order (ffmpeg names them 1.jpg, 2.jpg, ...).
    input_image_urls: list[str] = []
    sorted_screenshots = sorted(screenshots, key=lambda x: int(x.split(".")[0]))
    for filename in sorted_screenshots:
        filepath = os.path.join(SCREENSHOTS_DIR, filename)
        data_url = await image_to_data_url(filepath)
        print(filename)
        input_image_urls.append(data_url)

    # Convert images to the message format for Claude
    # (split each data URL back into its media type + raw base64 payload).
    content_messages: list[dict[str, Union[dict[str, str], str]]] = []
    for url in input_image_urls:
        media_type = url.split(";")[0].split(":")[1]
        base64_data = url.split(",")[1]
        content_messages.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_data,
                },
            }
        )

    prompt_messages = [
        {
            "role": "user",
            "content": content_messages,
        },
        # {"role": "assistant", "content": SECOND_MESSAGE},
        # {"role": "user", "content": "continue"},
    ]

    # For follow-ups, replay the previous draft as an assistant turn and ask
    # the model to improve it.
    if is_followup:
        prompt_messages += [
            {"role": "assistant", "content": previous_html},
            {
                "role": "user",
                "content": "You've done a good job with a first draft. Improve this further based on the original instructions so that the app is fully functional like in the original video.",
            },
        ]  # type: ignore

    async def process_chunk(content: str):
        # Echo streamed tokens to the console as they arrive.
        print(content, end="", flush=True)

    # stream_claude_response_native(include_thinking=True) seeds the assistant
    # turn with "<thinking>", so the returned completion lacks that prefix.
    response_prefix = "<thinking>"

    pprint_prompt(prompt_messages)  # type: ignore

    start_time = time.time()

    completion = await stream_claude_response_native(
        system_prompt=VIDEO_PROMPT,
        messages=prompt_messages,
        api_key=ANTHROPIC_API_KEY,
        callback=lambda x: process_chunk(x),
        model=MODEL_CLAUDE_OPUS,
        include_thinking=True,
    )

    end_time = time.time()

    # Prepend the response prefix to the completion
    completion = response_prefix + completion

    # Extract the outputs
    html_content = extract_tag_content("html", completion)
    thinking = extract_tag_content("thinking", completion)

    print(thinking)
    print(f"Operation took {end_time - start_time} seconds")

    os.makedirs(OUTPUTS_DIR, exist_ok=True)

    # Generate a unique filename based on the current time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"video_test_output_{timestamp}.html"
    output_path = os.path.join(OUTPUTS_DIR, filename)

    # Write the HTML content to the file
    with open(output_path, "w") as file:
        file.write(html_content)

    # Show a notification
    # NOTE(review): macOS-only (osascript); errors on other platforms — confirm intended.
    subprocess.run(["osascript", "-e", 'display notification "Coding Complete"'])


# Pull a tagged section (e.g. <html>...</html>) out of the completion string.
def extract_tag_content(tag: str, text: str) -> str:
    """Return the first <tag>...</tag> span found in *text*, tags included.

    :param tag: Tag name to search for (without angle brackets).
    :param text: Text to search within.
    :return: The span including both tags, or "" if the pair is absent.
    """
    opening = f"<{tag}>"
    closing = f"</{tag}>"
    begin = text.find(opening)
    finish = text.find(closing, begin)
    if begin == -1 or finish == -1:
        return ""
    return text[begin : finish + len(closing)]


def split_video_into_screenshots(video_path: str, output_dir: str, interval: int):
    """Extract one frame every *interval* milliseconds from a video via ffmpeg.

    Frames are written as sequentially numbered JPEGs (1.jpg, 2.jpg, ...)
    into *output_dir*, which is created if missing. Requires ffmpeg on PATH.

    :param video_path: Path to the input video file.
    :param output_dir: Directory to receive the extracted frames.
    :param interval: Interval between frames, in milliseconds.
    :raises subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # fps=1/<seconds> -> one output frame per `interval` milliseconds.
    # check=True: fail loudly instead of silently producing no frames
    # (subprocess.call discarded the exit code).
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"fps=1/{interval/1000}",
            f"{output_dir}/%d.jpg",
        ],
        check=True,
    )


async def image_to_data_url(filepath: str) -> str:
    """Read an image file and return it as a base64 data URL.

    The media type is guessed from the file extension, falling back to
    image/jpeg (the format split_video_into_screenshots produces) when the
    extension is unrecognized — so existing .jpg callers are unchanged.
    """
    import mimetypes  # local import keeps the module's import block untouched

    media_type = mimetypes.guess_type(filepath)[0] or "image/jpeg"
    with open(filepath, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return f"data:{media_type};base64,{encoded_string}"


# Guard the entry point so importing this module doesn't launch the eval run.
if __name__ == "__main__":
    asyncio.run(main())

0 comments on commit c2f230a

Please sign in to comment.