Configure stopping criteria
VikParuchuri committed Oct 9, 2023
1 parent 9934ee9 commit bb3c043
Showing 10 changed files with 60 additions and 24 deletions.
17 changes: 16 additions & 1 deletion app/lesson/tasks.py
@@ -22,6 +22,7 @@ async def generate_lesson(
outline: List[str],
revision: int,
research_notes: List[ResearchNote] | None = None,
sections_per_generation: int = settings.SECTIONS_PER_GENERATION,
) -> List[AllLessonComponentData] | None:
# Add numbers to the outline - needed for generating the lesson
numbered_outline = outline
@@ -62,9 +63,14 @@ async def generate_lesson(
current_section = f"{last_section.strip()}\n\n{current_section_header.strip()}"
current_section = f"{current_section}\n"

# When to stop generation
stop_section = None
if generated_sections + sections_per_generation < len(numbered_outline):
stop_section = numbered_outline[generated_sections + sections_per_generation]

# Filter research notes to save tokens, only keep notes relevant to the next 5 sections
# Find the indices of the next sections
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:5])
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:sections_per_generation])
selected_research_notes = None
if research_notes is not None:
selected_research_notes = []
@@ -84,6 +90,7 @@
research_notes=selected_research_notes,
include_examples=settings.INCLUDE_EXAMPLES,
cache=use_cache,
stop_section=stop_section,
)
new_components = []
new_component_keys = []
@@ -137,6 +144,7 @@ async def generate_single_lesson_chunk(
research_notes: List[ResearchNote] | None,
include_examples: bool,
cache: bool,
stop_section: str | None = None,
) -> AsyncGenerator[List[AllLessonComponentData], None]:
response = generate_lessons(
numbered_outline,
@@ -148,8 +156,15 @@
research_notes=research_notes,
include_examples=include_examples,
cache=cache,
stop_section=stop_section,
)

section_start = f"---{ComponentNames.section}"

async for chunk in response:
# Remove the final section header from the chunk
# This happens when we hit the stop token
if chunk.strip().endswith(section_start):
chunk = chunk.strip()[:-len(section_start)]
new_components = parse_lesson_markdown(chunk)
yield new_components
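
Taken together, the tasks.py changes choose the outline heading that lies sections_per_generation entries past the last generated section, hand it to the LLM as a stop sequence, and trim any partial section marker the stop leaves at the end of the final chunk. A minimal, runnable sketch of that selection and trimming follows; the outline contents and the literal "---section" marker (standing in for f"---{ComponentNames.section}") are illustrative assumptions, not values from the repository.

# Sketch of the stop-section selection and trailing-marker trimming added above.
numbered_outline = [f"{i}. Heading {i}" for i in range(1, 11)]  # illustrative outline
generated_sections = 2
sections_per_generation = 5

stop_section = None
if generated_sections + sections_per_generation < len(numbered_outline):
    stop_section = numbered_outline[generated_sections + sections_per_generation]
# stop_section == "8. Heading 8": generation should halt before this section.

section_start = "---section"  # assumed value of f"---{ComponentNames.section}"
chunk = "Lesson text for the current section...\n\n---section"
if chunk.strip().endswith(section_start):
    chunk = chunk.strip()[:-len(section_start)]  # drop the dangling header fragment

print(stop_section)
print(repr(chunk))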
16 changes: 8 additions & 8 deletions app/llm/adaptors/oai.py
@@ -30,7 +30,7 @@ async def oai_chat_wrapped(
history: List,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.ChatCompletion.acreate(
@@ -39,7 +39,7 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
)
async for chunk in response:
@@ -54,7 +54,7 @@ async def oai_prompt_wrapped(
prompt: str,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.Completion.acreate(
@@ -63,7 +63,7 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
)
async for chunk in response:
@@ -78,7 +78,7 @@ async def oai_prompt_response(
temperature: float = settings.LLM_TEMPERATURE,
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
response_tokens = 0
@@ -88,7 +88,7 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
@@ -113,7 +113,7 @@ async def oai_chat_response(
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
history=None,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
current_message = {"role": "user", "content": prompt}
@@ -130,7 +130,7 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
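
In the OpenAI adaptor the change is a rename only: the list is still forwarded unchanged as the stop argument, which the OpenAI API caps at four sequences and excludes from the returned text. A hedged, non-streaming sketch against the pre-1.0 openai client this module targets; the model name and stop value are illustrative.

import openai

async def chat_once(prompt: str, stop_sequences=None):
    # Sketch only: stop_sequences is passed straight through as `stop`.
    response = await openai.ChatCompletion.acreate(
        model="gpt-3.5-turbo",  # illustrative model name
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=512,
        n=1,
        stop=stop_sequences,  # e.g. ["8. Next section heading"]
        stream=False,
    )
    return response["choices"][0]["message"]["content"]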
1 change: 0 additions & 1 deletion app/llm/generators/concepts.py
@@ -26,7 +26,6 @@ class CourseGeneratedConcepts(BaseModel):
temperature=0.7,
max_tokens=256,
timeout=40,
stop_tokens=None,
prompt_type="concept",
model=settings.LLM_INSTRUCT_TYPE,
)
10 changes: 7 additions & 3 deletions app/llm/generators/lesson.py
@@ -12,8 +12,7 @@
lesson_settings = GenerationSettings(
temperature=0.4,
max_tokens=6000,
timeout=480,
stop_tokens=None,
timeout=1200,
prompt_type="lesson",
)

@@ -121,6 +120,7 @@ async def generate_lessons(
include_examples: bool = True,
update_after_chars: int = 500,
cache: bool = True,
stop_section: str | None = None,
) -> AsyncGenerator[str, None]:
prompt = lesson_prompt(
outline,
@@ -133,7 +133,11 @@
)

text = ""
response = generate_response(prompt, lesson_settings, cache=cache, revision=revision)
stop_sequences = None
if stop_section is not None:
stop_sequences = [stop_section]

response = generate_response(prompt, lesson_settings, cache=cache, revision=revision, stop_sequences=stop_sequences)
chunk_len = 0

# Yield text in batches, to avoid creating too many DB models
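
On the lesson generator side the wiring is minimal: when a stop_section heading is supplied it is wrapped into a one-element stop_sequences list before calling generate_response; otherwise None is passed and the call runs to its token limit. A tiny sketch of that wrapping (the heading value is illustrative):

def build_stop_sequences(stop_section):
    # Mirrors the wrapping added to generate_lessons above.
    return [stop_section] if stop_section is not None else None

print(build_stop_sequences("8. Evaluation and testing"))  # ['8. Evaluation and testing']
print(build_stop_sequences(None))                         # None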
1 change: 0 additions & 1 deletion app/llm/generators/outline.py
@@ -20,7 +20,6 @@
temperature=0.6,
max_tokens=2048,
timeout=60,
stop_tokens=None,
prompt_type="outline",
model=settings.LLM_INSTRUCT_TYPE,
)
1 change: 0 additions & 1 deletion app/llm/generators/topic.py
@@ -14,7 +14,6 @@
temperature=0.9,
max_tokens=512,
timeout=40,
stop_tokens=None,
prompt_type="topic",
model=settings.LLM_INSTRUCT_TYPE,
)
22 changes: 18 additions & 4 deletions app/llm/llm.py
@@ -26,16 +26,30 @@ async def generate_response(
max_tries: int = 2,
cache: bool = True,
revision: int = 1,
stop_sequences: Optional[List[str]] = None,
) -> AsyncGenerator[str, None]:
temperature = prompt_settings.temperature
max_tokens = prompt_settings.max_tokens
timeout = prompt_settings.timeout
stop_tokens = prompt_settings.stop_tokens
prompt_stops = prompt_settings.stop_sequences
prompt_type = prompt_settings.prompt_type
model = (
prompt_settings.model or settings.LLM_TYPE
) # Use default model if not specified

# Stop sequences for the llm
stops = []
if prompt_stops is not None:
stops.extend(prompt_stops)
if stop_sequences is not None:
stops.extend(stop_sequences)

# Only support up to 4 stop sequences
if len(stops) == 0:
stops = None
else:
stops = stops[:4]

# Remove utf-8 surrogate characters
prompt = fix_unicode_text(prompt)

@@ -84,7 +98,7 @@ async def generate_response(
timeout,
max_tokens,
history,
stop_tokens,
stops,
model=model,
)
case "gpt-3.5-turbo-instruct":
Expand All @@ -102,7 +116,7 @@ async def generate_response(
temperature,
timeout,
max_tokens,
stop_tokens,
stops,
model=model,
)
case _:
Expand All @@ -127,7 +141,7 @@ async def generate_response(
temperature,
timeout,
max_tokens,
stop_tokens,
stops,
model=model,
)
break
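
generate_response now merges two sources of stop strings, the optional stop_sequences on the prompt's GenerationSettings and the per-call stop_sequences argument, then truncates the combined list to four entries (the OpenAI maximum) and collapses an empty list to None. A self-contained sketch of that merge, with illustrative inputs:

from typing import List, Optional

def merge_stops(prompt_stops: Optional[List[str]],
                call_stops: Optional[List[str]]) -> Optional[List[str]]:
    # Combine prompt-level and per-call stop sequences, keeping at most four.
    stops: List[str] = []
    if prompt_stops is not None:
        stops.extend(prompt_stops)
    if call_stops is not None:
        stops.extend(call_stops)
    return stops[:4] if stops else None

print(merge_stops(None, ["8. Next section"]))        # ['8. Next section']
print(merge_stops(["---end"], ["8. Next section"]))  # ['---end', '8. Next section']
print(merge_stops(None, None))                       # None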
2 changes: 1 addition & 1 deletion app/llm/schemas.py
@@ -12,7 +12,7 @@ class GenerationSettings(BaseModel):
temperature: float
max_tokens: int
timeout: int
stop_tokens: Optional[List[str]]
stop_sequences: Optional[List[str]]
prompt_type: str
component_name: Optional[str]
model: Optional[str]
4 changes: 3 additions & 1 deletion app/settings.py
@@ -19,6 +19,7 @@ class Settings(BaseSettings):

# Content
SECTIONS_PER_LESSON: int = 30 # Lower this to make books shorter
SECTIONS_PER_GENERATION: int = 5 # How many sections to generate in one prompt
MAX_DOWNLOAD_SIZE: int = 6 * 1024 * 1024 # Max pdf size to download, 6 MB
FINETUNED: bool = False # If we're using a finetuned textbook gen model
INCLUDE_EXAMPLES: bool = (
@@ -36,7 +37,7 @@
}

LLM_TEMPERATURE: float = 0.5
LLM_TIMEOUT: int = 120
LLM_TIMEOUT: int = 480
LLM_MAX_RESPONSE_TOKENS: int = 2048
OPENAI_KEY: str = ""
OPENAI_BASE_URL: Optional[str] = None
@@ -56,6 +57,7 @@
# General
THREADS_PER_WORKER: int = 1 # How many threads to use per worker process to save RAM
RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
RAY_DASHBOARD_HOST: str = "0.0.0.0"

class Config:
env_file = find_dotenv("local.env")
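
The settings change introduces SECTIONS_PER_GENERATION, which bounds how many outline sections one prompt covers, and raises the default LLM_TIMEOUT to accommodate the longer lesson calls. A sketch of the affected subset, assuming the pydantic v1-style BaseSettings used elsewhere in the project:

from pydantic import BaseSettings  # assumed pydantic v1-style import

class Settings(BaseSettings):
    # Subset sketch of the changed values; all other fields omitted.
    SECTIONS_PER_LESSON: int = 30        # total sections per generated book
    SECTIONS_PER_GENERATION: int = 5     # sections generated in one prompt
    LLM_TIMEOUT: int = 480               # per-request timeout in seconds (was 120)
    RAY_DASHBOARD_HOST: str = "0.0.0.0"  # bind address for the Ray dashboard

settings = Settings()
print(settings.SECTIONS_PER_GENERATION, settings.LLM_TIMEOUT)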
10 changes: 7 additions & 3 deletions book_generator.py
@@ -51,7 +51,7 @@ def get_json_data_from_course(course: Course, extended_fields=False):
return json.dumps(json_data)


async def generate_single_course(model, course_data: Dict | str, revision=1, outline_items=12):
async def generate_single_course(model, course_data: Dict | str, revision=1, outline_items=12, cache_only=False):
components = ["exercise", "example"]

outline = None
@@ -69,6 +69,9 @@ async def generate_single_course(model, course_data: Dict | str, revision=1, out
await asyncio.sleep(.001) # Sleep to avoid high CPU usage with many workers
return course

if cache_only:
return None

if not outline:
# Only generate outline if one was not passed in
concepts = await create_course_concepts(course_name, revision)
@@ -123,7 +126,7 @@ async def generate_single_course(model, course_data: Dict | str, revision=1, out

async def _process_course(model, topic, args):
try:
return await generate_single_course(model, topic, revision=args.revision)
return await generate_single_course(model, topic, revision=args.revision, cache_only=args.cache_only)
except Exception as e:
debug_print_trace()
print(f"Unhandled error generating course: {e}")
@@ -176,6 +179,7 @@ def to_iterator(obj_ids):
parser.add_argument("--extended-fields", action="store_true", default=False, help="Include extended fields in output")
parser.add_argument("--no_cache", action="store_true", default=False, help="Don't use the cache")
parser.add_argument("--revision", type=int, default=1, help="Revision number for the course. Change this to avoid hitting cache if you want to regenerate a course.")
parser.add_argument("--cache-only", action="store_true", default=False, help="Only use the cache, don't generate any new courses")

args = parser.parse_args()

@@ -202,7 +206,7 @@ def to_iterator(obj_ids):
total_processes = math.ceil(args.workers / settings.THREADS_PER_WORKER)
func = process_courses

ray.init(num_cpus=total_processes, storage=settings.RAY_CACHE_PATH, _temp_dir=settings.RAY_CACHE_PATH)
ray.init(num_cpus=total_processes, storage=settings.RAY_CACHE_PATH, _temp_dir=settings.RAY_CACHE_PATH, dashboard_host=settings.RAY_DASHBOARD_HOST)

model = SentenceTransformer("thenlper/gte-small")
model_ref = ray.put(model)
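
The new --cache-only flag is threaded through _process_course into generate_single_course, which now short-circuits: a cached course is returned as before, but a cache miss returns None instead of kicking off generation, so a run can be limited to material that already exists. A self-contained sketch of that control flow; the cache lookup helper below is a hypothetical stand-in, not a function from the repository.

import asyncio

async def load_cached_course(course_name, revision):
    # Hypothetical stand-in for the real cache lookup.
    return None

async def generate_single_course(course_name, revision=1, cache_only=False):
    course = await load_cached_course(course_name, revision)
    if course is not None:
        return course  # cache hit: reuse the stored course
    if cache_only:
        return None    # cache miss in cache-only mode: skip generation entirely
    return f"(generated course for {course_name})"  # placeholder for full generation

print(asyncio.run(generate_single_course("Linear Algebra", cache_only=True)))   # None
print(asyncio.run(generate_single_course("Linear Algebra", cache_only=False)))  # placeholder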
