Merge pull request #10 from VikParuchuri/dev
Better scalability
VikParuchuri authored Oct 12, 2023
2 parents e806b3b + 7ceae0d commit 055a92a
Showing 24 changed files with 571 additions and 197 deletions.
30 changes: 28 additions & 2 deletions README.md
@@ -43,7 +43,7 @@ By default, this will use `gpt-3.5`. You can use `gpt-4` by setting the env var
- Set the model name and max tokens in the `LLM_TYPES` setting.
- Follow the instructions above for the retrieval setup.

The generator ideally needs a context length of up to `16k`, but you can get away with `12k` if you need to.
The generator ideally needs a context length of up to `16k`, but you can get away with `12k` if you need to. If you've finetuned your own model for textbook gen (based on the prompts cached in this repo), you can use the `FINETUNED` and `INCLUDE_EXAMPLES` settings to reduce token usage.
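
Assuming these settings can be overridden from the environment like the other settings and take boolean values, a run against a finetuned model might look like:

`FINETUNED=true INCLUDE_EXAMPLES=false python book_generator.py topics.json books.jsonl --workers 5`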

### Without retrieval

@@ -73,6 +73,8 @@ Usage example:

## Generate textbooks

### From titles

This will take a file with a flat json list of topics, and generate one textbook per topic. The workers flag controls the number of parallel generations. Lower it if you hit rate limits.
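
For example, a minimal topics file (the topic names here are only placeholders) might look like:

```json
[
  "Linear Algebra",
  "Introduction to Python Programming",
  "Organic Chemistry"
]
```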

Usage example:
@@ -83,7 +85,31 @@ You can also override settings with environment variables (instead of using `loc

`LLM_TYPE=llama LLM_INSTRUCT_TYPE=llama LLM_EXTENDED_TYPE=llama OPENAI_KEY="llama" OPENAI_BASE_URL="https://vllm-api.com/v1" python book_generator.py topics.json books.jsonl --workers 10`

Note that courses are cached by default, so regenerating a course with the same name twice will not hit the API again. The cache is specific to each model and each topic.
You can see all options by running `python book_generator.py --help`.

Note that courses are cached by default, so regenerating a course with the same name twice will not hit the API again. The cache is specific to each model and each topic. You can skip the cache by using the `--revision` option to specify a revision number for the courses.
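
For example, to regenerate the same topics without hitting the cache, you could pass a revision number you haven't used before for those courses:

`python book_generator.py topics.json books.jsonl --revision 2`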

### From outlines

You can also generate a book from an existing outline by creating a jsonl file with the following fields (an example line is shown after the list):

- `topic` - The topic/title of the book
- `outline` - The outline of the book, as a flat json list. This needs to be in a specific format; see "Clean tables of contents" below.
- `queries` - Up to 2 search queries to use for retrieval. If you don't want to use retrieval, set this to an empty list.
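
A single line of that jsonl file might look like the following (the values are illustrative, borrowed from the example table of contents in `app/llm/examples/toc.json`):

```json
{"topic": "Python Programming for Beginners", "outline": ["1. Python Programming for Beginners", "1.1. What is Programming?", "1.2. Why Python?", "2. Setting Up the Environment", "2.1. Installing Python"], "queries": ["Python programming beginner guide", "Python programming introduction book"]}
```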

## Clean tables of contents

This will take in a jsonl file with an existing table of contents and title, and process it into the correct format for book generation.

Usage example:

`python toc_cleaner.py toc.jsonl clean_toc.jsonl`

`toc.jsonl` should have the following fields in each line:

- `title` - The title of the book
- `toc` - A string containing the table of contents. This can be poorly formatted, as in the example line below.
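
A line of `toc.jsonl` might look like this (the title and contents are illustrative; the messy spacing is intentional, since cleaning it up is what this step is for):

```json
{"title": "Python Programming for Beginners", "toc": "Introduction\n*What is Programming? *Why Python? *Applications of Python Setting Up the Environment\n *Installing Python *Interactive Shell vs. Script Mode"}
```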


# Extending

6 changes: 3 additions & 3 deletions app/course/models.py
@@ -41,9 +41,9 @@ async def load_cached_course(model: str, topic: str, revision: int):
select(Course).where(Course.topic == topic, Course.model == model, Course.version == revision)
)
course = query.all()
if len(course) == 0:
return None
course = course[0]
if len(course) == 0:
return None
course = course[0]

if course.context is not None:
course.context = [ResearchNote(**json.loads(v)) for v in course.context]
12 changes: 7 additions & 5 deletions app/course/tasks.py
@@ -1,25 +1,27 @@
from typing import List

from tenacity import RetryError

from app.course.embeddings import EmbeddingContext
from app.course.schemas import ResearchNote
from app.llm.exceptions import GenerationError, InvalidRequestError, RateLimitError
from app.llm.generators.concepts import generate_concepts
from app.llm.generators.outline import generate_outline
from app.services.generators.pdf import download_and_parse_pdfs, search_pdfs
from app.settings import settings
from app.util import debug_print_trace


async def create_course_concepts(course_name: str, revision: int):
"""
Set the topic and concepts for a course async.
"""
topic = None
generated_concepts = None
try:
concepts = await generate_concepts(course_name, revision)
concepts = await generate_concepts(course_name, revision, include_examples=settings.INCLUDE_EXAMPLES)
if concepts.feasible:
generated_concepts = concepts.concepts
except (GenerationError, RateLimitError, InvalidRequestError) as e:
except (GenerationError, RateLimitError, InvalidRequestError, RetryError) as e:
debug_print_trace()
print(f"Error generating concepts for {course_name}: {e}")

@@ -32,13 +34,13 @@ async def create_course_outline(
outline_list = None
queries = None
try:
response = generate_outline(course_name, concepts, revision, item_count=outline_items)
response = generate_outline(course_name, concepts, revision, item_count=outline_items, include_examples=settings.INCLUDE_EXAMPLES)

# Stream outline as it generates
async for outline_data in response:
outline_list = outline_data.outline
queries = outline_data.queries
except (GenerationError, RateLimitError, InvalidRequestError) as e:
except (GenerationError, RateLimitError, InvalidRequestError, RetryError) as e:
debug_print_trace()
print(f"Error generating outline for {course_name}")

14 changes: 9 additions & 5 deletions app/lesson/output.py
@@ -24,12 +24,14 @@ def render_components_to_output_markdown(
)
)
case ComponentNames.section:
if not component.markdown.startswith("#"):
component.markdown = f"# {component.markdown}"
tuples.append((component.type, component.markdown))
markdown_data = component.markdown.strip()
if not markdown_data.startswith("#"):
markdown_data = f"# {markdown_data}"
tuples.append((component.type, markdown_data))
case ComponentNames.text:
markdown_data = component.markdown.strip()
tuples.append(
(component.type, remove_section_paragraphs(component.markdown))
(component.type, remove_section_paragraphs(markdown_data))
)
case _:
tuples.append((component.type, component.markdown))
@@ -50,4 +52,6 @@ def remove_section_paragraphs(text):
paragraphs.pop()

# Reconstruct the text from the remaining paragraphs
return "\n".join(paragraphs).strip()
replaced = "\n".join(paragraphs).strip()
replaced = re.sub(r"\n\n+", "\n\n", replaced)
return replaced
17 changes: 16 additions & 1 deletion app/lesson/tasks.py
@@ -22,6 +22,7 @@ async def generate_lesson(
outline: List[str],
revision: int,
research_notes: List[ResearchNote] | None = None,
sections_per_generation: int = settings.SECTIONS_PER_GENERATION,
) -> List[AllLessonComponentData] | None:
# Add numbers to the outline - needed for generating the lesson
numbered_outline = outline
@@ -62,9 +63,14 @@
current_section = f"{last_section.strip()}\n\n{current_section_header.strip()}"
current_section = f"{current_section}\n"

# When to stop generation
stop_section = None
if generated_sections + sections_per_generation < len(numbered_outline):
stop_section = numbered_outline[generated_sections + sections_per_generation]

# Filter research notes to save tokens, only keep notes relevant to the next 5 sections
# Find the indices of the next sections
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:5])
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:sections_per_generation])
selected_research_notes = None
if research_notes is not None:
selected_research_notes = []
@@ -84,6 +90,7 @@
research_notes=selected_research_notes,
include_examples=settings.INCLUDE_EXAMPLES,
cache=use_cache,
stop_section=stop_section,
)
new_components = []
new_component_keys = []
@@ -137,6 +144,7 @@ async def generate_single_lesson_chunk(
research_notes: List[ResearchNote] | None,
include_examples: bool,
cache: bool,
stop_section: str | None = None,
) -> AsyncGenerator[List[AllLessonComponentData], None]:
response = generate_lessons(
numbered_outline,
@@ -148,8 +156,15 @@
research_notes=research_notes,
include_examples=include_examples,
cache=cache,
stop_section=stop_section,
)

section_start = f"---{ComponentNames.section}"

async for chunk in response:
# Remove the final section header from the chunk
# This happens when we hit the stop token
if chunk.strip().endswith(section_start):
chunk = chunk.strip()[:-len(section_start)]
new_components = parse_lesson_markdown(chunk)
yield new_components
22 changes: 14 additions & 8 deletions app/llm/adaptors/oai.py
@@ -30,7 +30,8 @@ async def oai_chat_wrapped(
history: List,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
inner_timeout: int = settings.LLM_TIMEOUT,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.ChatCompletion.acreate(
@@ -39,8 +40,9 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
request_timeout=inner_timeout,
)
async for chunk in response:
stream = chunk
@@ -54,7 +56,8 @@ async def oai_prompt_wrapped(
prompt: str,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
inner_timeout: int = settings.LLM_TIMEOUT,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.Completion.acreate(
@@ -63,8 +66,9 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
request_timeout=inner_timeout,
)
async for chunk in response:
stream = chunk
@@ -78,7 +82,7 @@ async def oai_prompt_response(
temperature: float = settings.LLM_TEMPERATURE,
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
response_tokens = 0
@@ -88,7 +92,8 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
inner_timeout=timeout,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
@@ -113,7 +118,7 @@ async def oai_chat_response(
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
history=None,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
current_message = {"role": "user", "content": prompt}
@@ -130,7 +135,8 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
inner_timeout=timeout,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
45 changes: 45 additions & 0 deletions app/llm/examples/toc.json
@@ -0,0 +1,45 @@
[
{
"topic": "Python Programming for Beginners",
"draft_outline": "Introduction\n*What is Programming? *Why Python? *Historical Background of Python *Applications of Python Setting Up the Environment\n *Installing Python *Interactive Shell vs. Script Mode *Setting Up an IDE (e.g., PyCharm, VSCode)",
"json": {
"outline": [
"1. Python Programming for Beginners",
"1.1. What is Programming?",
"1.2. Why Python?",
"1.3. Historical Background of Python",
"1.4. Applications of Python",
"2. Setting Up the Environment",
"2.1. Installing Python",
"2.2. Interactive Shell vs. Script Mode",
"2.3. Setting Up an IDE (e.g., PyCharm, VSCode)"
],
"queries": [
"Python programming beginner guide",
"Python programming introduction book"
]
}
},
{
"topic": "PLZ/SYS Programming Language Manual",
"draft_outline": "1. Introduction.- 1.1 PLZ/SYS objectives.- 2. Summary Of The Language.- 2.1 Data and Statements.- 2.2 The Construction of a Program.- 3. Notation, Terminology, And Vocabulary.- 3.1 Vocabulary.- 3.2 Lexical Structure.- 4. Identifiers And Literal Constants.- PLZ/SYS Grammar. Conclusion. References.",
"json": {
"outline": [
"1. Introduction",
"1.1 PLZ/SYS objectives",
"2. Summary Of The Language",
"2.1 Data and Statements",
"2.2 The Construction of a Program",
"3. Notation, Terminology, And Vocabulary",
"3.1 Vocabulary",
"3.2 Lexical Structure",
"4. Identifiers And Literal Constants",
"5. PLZ/SYS Grammar"
],
"queries": [
"PLZ/SYS programming language overview",
"Best practices for Structured Statements in programming"
]
}
}
]
11 changes: 6 additions & 5 deletions app/llm/generators/concepts.py
@@ -4,6 +4,7 @@
from json import JSONDecodeError
from typing import List

import ftfy
from pydantic import BaseModel
from tenacity import stop_after_attempt, wait_fixed, before, after, retry, retry_if_exception_type
import threading
@@ -24,8 +25,7 @@ class CourseGeneratedConcepts(BaseModel):
concept_settings = GenerationSettings(
temperature=0.7,
max_tokens=256,
timeout=40,
stop_tokens=None,
timeout=1200,
prompt_type="concept",
model=settings.LLM_INSTRUCT_TYPE,
)
@@ -52,14 +52,14 @@ def after_retry_callback(retry_state):

@retry(
retry=retry_if_exception_type(GenerationError),
stop=stop_after_attempt(2),
stop=stop_after_attempt(5),
wait=wait_fixed(2),
before_sleep=before_retry_callback,
after=after_retry_callback,
reraise=True,
)
async def generate_concepts(topic: str, revision: int) -> CourseGeneratedConcepts:
prompt = concept_prompt(topic)
async def generate_concepts(topic: str, revision: int, include_examples: bool = True) -> CourseGeneratedConcepts:
prompt = concept_prompt(topic, include_examples=include_examples)
text = ""
# If we should cache the prompt - skip cache if we're retrying
should_cache = not getattr(local_data, "is_retry", False)
@@ -68,6 +68,7 @@ async def generate_concepts(topic: str, revision: int) -> CourseGeneratedConcept
text += chunk
try:
text = extract_only_json_dict(text)
text = str(ftfy.fix_text(text))
data = json.loads(text.strip())
concepts = data["concepts"]
feasible = data["feasible"]