Merge pull request #10 from VikParuchuri/dev
Better scalability
VikParuchuri authored Oct 12, 2023
2 parents e806b3b + 7ceae0d commit 055a92a
Showing 24 changed files with 571 additions and 197 deletions.
30 changes: 28 additions & 2 deletions README.md
@@ -43,7 +43,7 @@ By default, this will use `gpt-3.5`. You can use `gpt-4` by setting the env var
- Set the model name and max tokens in the `LLM_TYPES` setting.
- Follow the instructions above for the retrieval setup.

The generator ideally needs a context length of up to `16k`, but you can get away with `12k` if you need to.
The generator ideally needs a context length of up to `16k`, but you can get away with `12k` if you need to. If you've finetuned your own model for textbook gen (based on the prompts cached in this repo), you can use the `FINETUNED` and `INCLUDE_EXAMPLES` settings to reduce token usage.
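
Assuming these settings can be overridden from the environment like the other settings and take boolean values, a run against a finetuned model might look like:

`FINETUNED=true INCLUDE_EXAMPLES=false python book_generator.py topics.json books.jsonl --workers 5`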

### Without retrieval

@@ -73,6 +73,8 @@ Usage example:

## Generate textbooks

### From titles

This will take a file with a flat json list of topics, and generate one textbook per topic. The workers flag controls the number of parallel generations. Lower it if you hit rate limits.
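
For example, a minimal topics file (the topic names here are only placeholders) might look like:

```json
[
  "Linear Algebra",
  "Introduction to Python Programming",
  "Organic Chemistry"
]
```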

Usage example:
@@ -83,7 +85,31 @@ You can also override settings with environment variables (instead of using `loc

`LLM_TYPE=llama LLM_INSTRUCT_TYPE=llama LLM_EXTENDED_TYPE=llama OPENAI_KEY="llama" OPENAI_BASE_URL="https://vllm-api.com/v1" python book_generator.py topics.json books.jsonl --workers 10`

Note that courses are cached by default, so regenerating a course with the same name twice will not hit the API again. The cache is specific to each model and each topic.
You can see all options by running `python book_generator.py --help`.

Note that courses are cached by default, so regenerating a course with the same name twice will not hit the API again. The cache is specific to each model and each topic. You can skip the cache by using the `--revision` option to specify a revision number for the courses.
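
For example, to regenerate the same topics without hitting the cache, you could pass a revision number you haven't used before for those courses:

`python book_generator.py topics.json books.jsonl --revision 2`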

### From outlines

You can also generate a book from an existing outline by creating a jsonl file with the following fields (an example line is shown after the list):

- `topic` - The topic/title of the book
- `outline` - The outline of the book, as a flat json list. This needs to be in a specific format; see "Clean tables of contents" below.
- `queries` - Up to 2 search queries to use for retrieval. If you don't want to use retrieval, set this to an empty list.
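
A single line of that jsonl file might look like the following (the values are illustrative, borrowed from the example table of contents in `app/llm/examples/toc.json`):

```json
{"topic": "Python Programming for Beginners", "outline": ["1. Python Programming for Beginners", "1.1. What is Programming?", "1.2. Why Python?", "2. Setting Up the Environment", "2.1. Installing Python"], "queries": ["Python programming beginner guide", "Python programming introduction book"]}
```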

## Clean tables of contents

This will take in a jsonl file with an existing table of contents and title, and process it into the correct format for book generation.

Usage example:

`python toc_cleaner.py toc.jsonl clean_toc.jsonl`

`toc.jsonl` should have the following fields in each line:

- `title` - The title of the book
- `toc` - A string containing the table of contents. This can be poorly formatted, as in the example line below.
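
A line of `toc.jsonl` might look like this (the title and contents are illustrative; the messy spacing is intentional, since cleaning it up is what this step is for):

```json
{"title": "Python Programming for Beginners", "toc": "Introduction\n*What is Programming? *Why Python? *Applications of Python Setting Up the Environment\n *Installing Python *Interactive Shell vs. Script Mode"}
```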


# Extending

6 changes: 3 additions & 3 deletions app/course/models.py
@@ -41,9 +41,9 @@ async def load_cached_course(model: str, topic: str, revision: int):
select(Course).where(Course.topic == topic, Course.model == model, Course.version == revision)
)
course = query.all()
if len(course) == 0:
return None
course = course[0]
if len(course) == 0:
return None
course = course[0]

if course.context is not None:
course.context = [ResearchNote(**json.loads(v)) for v in course.context]
12 changes: 7 additions & 5 deletions app/course/tasks.py
@@ -1,25 +1,27 @@
from typing import List

from tenacity import RetryError

from app.course.embeddings import EmbeddingContext
from app.course.schemas import ResearchNote
from app.llm.exceptions import GenerationError, InvalidRequestError, RateLimitError
from app.llm.generators.concepts import generate_concepts
from app.llm.generators.outline import generate_outline
from app.services.generators.pdf import download_and_parse_pdfs, search_pdfs
from app.settings import settings
from app.util import debug_print_trace


async def create_course_concepts(course_name: str, revision: int):
"""
Set the topic and concepts for a course async.
"""
topic = None
generated_concepts = None
try:
concepts = await generate_concepts(course_name, revision)
concepts = await generate_concepts(course_name, revision, include_examples=settings.INCLUDE_EXAMPLES)
if concepts.feasible:
generated_concepts = concepts.concepts
except (GenerationError, RateLimitError, InvalidRequestError) as e:
except (GenerationError, RateLimitError, InvalidRequestError, RetryError) as e:
debug_print_trace()
print(f"Error generating concepts for {course_name}: {e}")

@@ -32,13 +34,13 @@ async def create_course_outline(
outline_list = None
queries = None
try:
response = generate_outline(course_name, concepts, revision, item_count=outline_items)
response = generate_outline(course_name, concepts, revision, item_count=outline_items, include_examples=settings.INCLUDE_EXAMPLES)

# Stream outline as it generates
async for outline_data in response:
outline_list = outline_data.outline
queries = outline_data.queries
except (GenerationError, RateLimitError, InvalidRequestError) as e:
except (GenerationError, RateLimitError, InvalidRequestError, RetryError) as e:
debug_print_trace()
print(f"Error generating outline for {course_name}")

14 changes: 9 additions & 5 deletions app/lesson/output.py
@@ -24,12 +24,14 @@ def render_components_to_output_markdown(
)
)
case ComponentNames.section:
if not component.markdown.startswith("#"):
component.markdown = f"# {component.markdown}"
tuples.append((component.type, component.markdown))
markdown_data = component.markdown.strip()
if not markdown_data.startswith("#"):
markdown_data = f"# {markdown_data}"
tuples.append((component.type, markdown_data))
case ComponentNames.text:
markdown_data = component.markdown.strip()
tuples.append(
(component.type, remove_section_paragraphs(component.markdown))
(component.type, remove_section_paragraphs(markdown_data))
)
case _:
tuples.append((component.type, component.markdown))
@@ -50,4 +52,6 @@ def remove_section_paragraphs(text):
paragraphs.pop()

# Reconstruct the text from the remaining paragraphs
return "\n".join(paragraphs).strip()
replaced = "\n".join(paragraphs).strip()
replaced = re.sub(r"\n\n+", "\n\n", replaced)
return replaced
17 changes: 16 additions & 1 deletion app/lesson/tasks.py
@@ -22,6 +22,7 @@ async def generate_lesson(
outline: List[str],
revision: int,
research_notes: List[ResearchNote] | None = None,
sections_per_generation: int = settings.SECTIONS_PER_GENERATION,
) -> List[AllLessonComponentData] | None:
# Add numbers to the outline - needed for generating the lesson
numbered_outline = outline
@@ -62,9 +63,14 @@
current_section = f"{last_section.strip()}\n\n{current_section_header.strip()}"
current_section = f"{current_section}\n"

# When to stop generation
stop_section = None
if generated_sections + sections_per_generation < len(numbered_outline):
stop_section = numbered_outline[generated_sections + sections_per_generation]

# Filter research notes to save tokens, only keep notes relevant to the next 5 sections
# Find the indices of the next sections
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:5])
future_sections = set(list(range(generated_sections, len(numbered_outline)))[:sections_per_generation])
selected_research_notes = None
if research_notes is not None:
selected_research_notes = []
@@ -84,6 +90,7 @@
research_notes=selected_research_notes,
include_examples=settings.INCLUDE_EXAMPLES,
cache=use_cache,
stop_section=stop_section,
)
new_components = []
new_component_keys = []
@@ -137,6 +144,7 @@ async def generate_single_lesson_chunk(
research_notes: List[ResearchNote] | None,
include_examples: bool,
cache: bool,
stop_section: str | None = None,
) -> AsyncGenerator[List[AllLessonComponentData], None]:
response = generate_lessons(
numbered_outline,
@@ -148,8 +156,15 @@
research_notes=research_notes,
include_examples=include_examples,
cache=cache,
stop_section=stop_section,
)

section_start = f"---{ComponentNames.section}"

async for chunk in response:
# Remove the final section header from the chunk
# This happens when we hit the stop token
if chunk.strip().endswith(section_start):
chunk = chunk.strip()[:-len(section_start)]
new_components = parse_lesson_markdown(chunk)
yield new_components
22 changes: 14 additions & 8 deletions app/llm/adaptors/oai.py
@@ -30,7 +30,8 @@ async def oai_chat_wrapped(
history: List,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
inner_timeout: int = settings.LLM_TIMEOUT,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.ChatCompletion.acreate(
@@ -39,8 +40,9 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
request_timeout=inner_timeout,
)
async for chunk in response:
stream = chunk
@@ -54,7 +56,8 @@ async def oai_prompt_wrapped(
prompt: str,
temperature: float,
max_tokens: int,
stop_tokens: Optional[List] = None,
inner_timeout: int = settings.LLM_TIMEOUT,
stop_sequences: Optional[List] = None,
model: str = settings.LLM_TYPE,
) -> AsyncGenerator[str, None]:
response = await openai.Completion.acreate(
@@ -63,8 +66,9 @@
temperature=temperature,
max_tokens=max_tokens,
n=1,
stop=stop_tokens,
stop=stop_sequences,
stream=True,
request_timeout=inner_timeout,
)
async for chunk in response:
stream = chunk
@@ -78,7 +82,7 @@ async def oai_prompt_response(
temperature: float = settings.LLM_TEMPERATURE,
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
response_tokens = 0
@@ -88,7 +92,8 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
inner_timeout=timeout,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
@@ -113,7 +118,7 @@ async def oai_chat_response(
timeout: int = settings.LLM_TIMEOUT,
max_tokens: int = settings.LLM_MAX_RESPONSE_TOKENS,
history=None,
stop_tokens=None,
stop_sequences=None,
model: str = settings.LLM_TYPE,
) -> Optional[AsyncGenerator[LLMResponse, None]]:
current_message = {"role": "user", "content": prompt}
@@ -130,7 +135,8 @@
temperature,
max_tokens,
timeout=timeout,
stop_tokens=stop_tokens,
inner_timeout=timeout,
stop_sequences=stop_sequences,
model=model,
)
async for chunk in response:
45 changes: 45 additions & 0 deletions app/llm/examples/toc.json
@@ -0,0 +1,45 @@
[
{
"topic": "Python Programming for Beginners",
"draft_outline": "Introduction\n*What is Programming? *Why Python? *Historical Background of Python *Applications of Python Setting Up the Environment\n *Installing Python *Interactive Shell vs. Script Mode *Setting Up an IDE (e.g., PyCharm, VSCode)",
"json": {
"outline": [
"1. Python Programming for Beginners",
"1.1. What is Programming?",
"1.2. Why Python?",
"1.3. Historical Background of Python",
"1.4. Applications of Python",
"2. Setting Up the Environment",
"2.1. Installing Python",
"2.2. Interactive Shell vs. Script Mode",
"2.3. Setting Up an IDE (e.g., PyCharm, VSCode)"
],
"queries": [
"Python programming beginner guide",
"Python programming introduction book"
]
}
},
{
"topic": "PLZ/SYS Programming Language Manual",
"draft_outline": "1. Introduction.- 1.1 PLZ/SYS objectives.- 2. Summary Of The Language.- 2.1 Data and Statements.- 2.2 The Construction of a Program.- 3. Notation, Terminology, And Vocabulary.- 3.1 Vocabulary.- 3.2 Lexical Structure.- 4. Identifiers And Literal Constants.- PLZ/SYS Grammar. Conclusion. References.",
"json": {
"outline": [
"1. Introduction",
"1.1 PLZ/SYS objectives",
"2. Summary Of The Language",
"2.1 Data and Statements",
"2.2 The Construction of a Program",
"3. Notation, Terminology, And Vocabulary",
"3.1 Vocabulary",
"3.2 Lexical Structure",
"4. Identifiers And Literal Constants",
"5. PLZ/SYS Grammar"
],
"queries": [
"PLZ/SYS programming language overview",
"Best practices for Structured Statements in programming"
]
}
}
]
11 changes: 6 additions & 5 deletions app/llm/generators/concepts.py
@@ -4,6 +4,7 @@
from json import JSONDecodeError
from typing import List

import ftfy
from pydantic import BaseModel
from tenacity import stop_after_attempt, wait_fixed, before, after, retry, retry_if_exception_type
import threading
@@ -24,8 +25,7 @@ class CourseGeneratedConcepts(BaseModel):
concept_settings = GenerationSettings(
temperature=0.7,
max_tokens=256,
timeout=40,
stop_tokens=None,
timeout=1200,
prompt_type="concept",
model=settings.LLM_INSTRUCT_TYPE,
)
@@ -52,14 +52,14 @@ def after_retry_callback(retry_state):

@retry(
retry=retry_if_exception_type(GenerationError),
stop=stop_after_attempt(2),
stop=stop_after_attempt(5),
wait=wait_fixed(2),
before_sleep=before_retry_callback,
after=after_retry_callback,
reraise=True,
)
async def generate_concepts(topic: str, revision: int) -> CourseGeneratedConcepts:
prompt = concept_prompt(topic)
async def generate_concepts(topic: str, revision: int, include_examples: bool = True) -> CourseGeneratedConcepts:
prompt = concept_prompt(topic, include_examples=include_examples)
text = ""
# If we should cache the prompt - skip cache if we're retrying
should_cache = not getattr(local_data, "is_retry", False)
@@ -68,6 +68,7 @@ async def generate_concepts(topic: str, revision: int) -> CourseGeneratedConcept
text += chunk
try:
text = extract_only_json_dict(text)
text = str(ftfy.fix_text(text))
data = json.loads(text.strip())
concepts = data["concepts"]
feasible = data["feasible"]