Skip to content

Commit

Permalink
folder structure change, example driven optimization, mix of lm gener…
Browse files Browse the repository at this point in the history
…ation, teacher based output gen.
  • Loading branch information
krypticmouse committed Mar 11, 2024
1 parent cab9eb4 commit 56d4378
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 88 deletions.
1 change: 1 addition & 0 deletions dspy/experimental/synthesizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .synthesizer import *
25 changes: 25 additions & 0 deletions dspy/experimental/synthesizer/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import List, Optional, Union

from pydantic import BaseModel, field_validator, model_validator

import dspy

class SynthesizerArguments(BaseModel):
    """Configuration options for the experimental synthesizer.

    Attributes:
        feedback_mode: Either ``"human"`` or ``"llm"``; ``None`` disables feedback.
        num_example_for_feedback: Number of examples used for feedback. Must be
            set (non-zero) whenever ``feedback_mode`` is set.
        input_lm_model: LM used when generating inputs; the synthesizer falls
            back to ``dspy.settings.lm`` when this is unset.
        output_lm_model: LM used when generating outputs; same fallback.
        output_teacher_module: Optional module called with the generated inputs
            to produce outputs instead of the synthesizer's own output predictor.
        num_example_for_optim: Number of ground-source examples sampled per
            iteration to steer input generation.

    Raises:
        ValueError: From the validator (surfaced by pydantic as a validation
            error) when ``feedback_mode`` is not an allowed value, or when it
            is set without ``num_example_for_feedback``.
    """

    # dspy LM / module objects are plain Python classes, not pydantic types;
    # without this pydantic cannot build a schema for the fields below.
    model_config = {"arbitrary_types_allowed": True}

    # [TODO]
    feedback_mode: Optional[str] = None
    num_example_for_feedback: Optional[int] = None

    input_lm_model: Optional[dspy.LM] = None
    output_lm_model: Optional[dspy.LM] = None
    output_teacher_module: Optional[Union[dspy.Module, dspy.Predict]] = None

    num_example_for_optim: Optional[int] = None

    # A model-level "after" validator is used because the check spans two
    # fields: it needs the already-validated values of both, which a
    # per-field validator (a classmethod) cannot read off ``cls``.
    @model_validator(mode="after")
    def validate_feedback_mode(self):
        if self.feedback_mode and self.feedback_mode not in ["human", "llm"]:
            raise ValueError("Feedback mode should be either 'human' or 'llm'.")

        if self.feedback_mode and not self.num_example_for_feedback:
            raise ValueError("Number of examples for feedback is required when feedback mode is provided.")

        return self
5 changes: 5 additions & 0 deletions dspy/experimental/synthesizer/instructions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Instruction text installed as the input-generation signature's `__doc__` when
# the synthesizer is configured to sample pre-generated examples.
# Plain string on purpose: there are no placeholders, and an f-string would
# start interpolating if a literal brace were ever added to the prompt.
INPUT_GENERATION_TASK_WITH_EXAMPLES = """Create synthetic data using the task description and the provided knowledge seed. Your task is to generate diverse and imaginative data that aligns with the given task description and knowledge seed. You are encouraged to be creative and not limit yourself, allowing for a wide range of synthetic data that reflects the characteristics and details provided in the task description. The data should be unique and varied, showcasing originality and creativity while maintaining relevance to the task and knowledge seed.
Additionally I'll be providing you some data I generated before hand, make sure the data you generate if consistent with task I provided but different from the data I provided in every way possible."""

# Currently empty; not referenced by any code visible alongside this module.
INPUT_GENERATION_TASK_WITH_FEEDBACK = ""
62 changes: 62 additions & 0 deletions dspy/experimental/synthesizer/signatures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import dspy

from .utils import format_examples

# Signature: take a task description and produce a high-level explanation of it.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature (signature `__doc__` is read and reassigned elsewhere in this
# package), so treat its wording as runtime text — confirm before rewording.
class UnderstandTask(dspy.Signature):
    """I'll be providing you a task description, your task is to prepare a concise, comprehensible summary that captures the broad essence and purpose of the task this description aim to address. Your summary should illuminate the general objective and the type of problem being solved, offering a clear picture of what the task entails at a high level. Avoid getting into the nuances of individual datapoints, specifics about models, examples, algorithms, or any intricate technicalities. Your explanation should serve to clarify the task's overall goal and its basic premise, without touching on methodologies or solutions."""

    # Input: the task description to summarise.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task.",
    )
    # Output: the summary.
    # NOTE(review): uses the same "Task Description:" prefix as the input field
    # above — confirm this is intentional.
    explanation = dspy.OutputField(
        prefix="Task Description:",
        desc="Explanation of the task.",
    )

# Signature: take a set of example datapoints and produce a high-level
# explanation of the task they represent.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature, so treat its wording as runtime text — confirm before rewording.
class ExplainTask(dspy.Signature):
    """Analyze the provided set of datapoints carefully, and prepare a concise, comprehensible summary that captures the broad essence and purpose of the task these datapoints aim to address. Your summary should illuminate the general objective and the type of problem being solved, offering a clear picture of what the task entails at a high level. Avoid getting into the nuances of individual datapoints, specifics about models, examples, algorithms, or any intricate technicalities. Your explanation should serve to clarify the task's overall goal and its basic premise, without touching on methodologies or solutions."""

    # Input: the datapoints; `format_examples` is supplied as the field's
    # formatter (it renders examples as "Inputs:/Outputs:" text and passes
    # strings through unchanged).
    examples = dspy.InputField(
        prefix="Examples Datapoints:-",
        desc="List of datapoints to analyze and explain the task.",
        format=format_examples,
    )
    # Output: the inferred task description.
    explanation = dspy.OutputField(
        prefix="Task Description:",
        desc="Explanation of the task.",
    )

# Signature: given a task description and a field name, produce a short
# description for that field.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature, so treat its wording as runtime text — confirm before rewording.
class GenerateFieldDescription(dspy.Signature):
    """Generate a concise and informative description for a given field based on the provided name and task description. This description should be no longer than 10 words and should be in simple english."""

    # Input: description of the overall task.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task the field is an input to.",
    )
    # Input: name of the field to describe.
    field_name = dspy.InputField(
        prefix="Field Name:",
        desc="Name of the field to generate synthetic data for.",
    )
    # Output: the generated field description.
    field_description = dspy.OutputField(
        prefix="Field Description:",
        desc="Description of the field.",
    )

# Base signature for synthetic input generation. Only the two seed inputs are
# declared here; no output fields are defined in this class.
# NOTE(review): the synthesizer appears to attach further fields at runtime
# (it calls `.insert(...)` on this signature) and may replace `__doc__` —
# confirm against the caller. The docstring is runtime instruction text; its
# second line is kept flush-left so the string content stays unchanged.
class GenerateInputFieldsData(dspy.Signature):
    """Create synthetic data using the task description and the provided knowledge seed. Your task is to generate diverse and imaginative data that aligns with the given task description and knowledge seed. You are encouraged to be creative and not limit yourself, allowing for a wide range of synthetic data that reflects the characteristics and details provided in the task description. The data should be unique and varied, showcasing originality and creativity while maintaining relevance to the task and knowledge seed.
A knowledge seed is the index of the knowledge base you have, each index represents a different knowledge base."""

    # Input: the seed value; the formatter stringifies whatever is passed in.
    knowledge_seed = dspy.InputField(
        prefix="Knowledge Seed:",
        desc="Seed for the knowledge base search to base the inputs around.",
        format=lambda x: str(x),
    )
    # Input: description of the task to generate inputs for.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task the field is an input to.",
    )

# Deliberately empty signature: it declares no fields and no docstring.
# NOTE(review): the synthesizer assigns this signature's `__doc__` at runtime,
# and fields are presumably attached there too — confirm against the caller
# before adding a docstring or fields here.
class GenerateOutputFieldsData(dspy.Signature):
    pass
Original file line number Diff line number Diff line change
@@ -1,93 +1,31 @@
import dspy
import random
from collections.abc import Mapping
from typing import List, Union

from datasets import Dataset
from tqdm import tqdm, trange
from typing import List, Union, Optional, Mapping

import dspy

from .signatures import (
ExplainTask,
GenerateFieldDescription,
GenerateInputFieldsData,
GenerateOutputFieldsData,
UnderstandTask,
)
from .config import SynthesizerArguments
from .instructions import INPUT_GENERATION_TASK_WITH_EXAMPLES
from .utils import format_examples

def format_examples(examples: "List[dspy.Example] | str") -> str:
    """Render examples as plain "Inputs:/Outputs:" text.

    Args:
        examples: Either an already-formatted string, which is returned
            unchanged, or an iterable of example objects exposing
            ``inputs()`` / ``labels()`` (each with ``keys()``) and
            ``example[key]`` lookup.

    Returns:
        For every example, an ``Inputs:`` line followed by one
        ``key: value`` line per input key, then an ``Outputs:`` line followed
        by one ``key: value`` line per label key. Every line ends with a
        newline; an empty iterable yields ``""``.
    """
    if isinstance(examples, str):
        return examples

    # Collect lines and join once instead of growing a string with `+=`.
    lines = []
    for example in examples:
        lines.append("Inputs:")
        lines.extend(f"{key}: {example[key]}" for key in example.inputs().keys())
        lines.append("Outputs:")
        lines.extend(f"{key}: {example[key]}" for key in example.labels().keys())

    return "".join(f"{line}\n" for line in lines)

# Signature: take a task description and produce a high-level explanation of it.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature, so treat its wording as runtime text — confirm before rewording.
class UnderstandTask(dspy.Signature):
    """I'll be providing you a task description, your task is to prepare a concise, comprehensible summary that captures the broad essence and purpose of the task this description aim to address. Your summary should illuminate the general objective and the type of problem being solved, offering a clear picture of what the task entails at a high level. Avoid getting into the nuances of individual datapoints, specifics about models, examples, algorithms, or any intricate technicalities. Your explanation should serve to clarify the task's overall goal and its basic premise, without touching on methodologies or solutions."""

    # Input: the task description to summarise.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task.",
    )
    # Output: the summary.
    # NOTE(review): uses the same "Task Description:" prefix as the input field
    # above — confirm this is intentional.
    explanation = dspy.OutputField(
        prefix="Task Description:",
        desc="Explanation of the task.",
    )

# Signature: take a set of example datapoints and produce a high-level
# explanation of the task they represent.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature, so treat its wording as runtime text — confirm before rewording.
class ExplainTask(dspy.Signature):
    """Analyze the provided set of datapoints carefully, and prepare a concise, comprehensible summary that captures the broad essence and purpose of the task these datapoints aim to address. Your summary should illuminate the general objective and the type of problem being solved, offering a clear picture of what the task entails at a high level. Avoid getting into the nuances of individual datapoints, specifics about models, examples, algorithms, or any intricate technicalities. Your explanation should serve to clarify the task's overall goal and its basic premise, without touching on methodologies or solutions."""

    # Input: the datapoints; `format_examples` is supplied as the field's
    # formatter (it renders examples as "Inputs:/Outputs:" text and passes
    # strings through unchanged).
    examples = dspy.InputField(
        prefix="Examples Datapoints:-",
        desc="List of datapoints to analyze and explain the task.",
        format=format_examples,
    )
    # Output: the inferred task description.
    explanation = dspy.OutputField(
        prefix="Task Description:",
        desc="Explanation of the task.",
    )

# Signature: given a task description and a field name, produce a short
# description for that field.
# NOTE(review): the docstring appears to double as the LM instruction text for
# this signature, so treat its wording as runtime text — confirm before rewording.
class GenerateFieldDescription(dspy.Signature):
    """Generate a concise and informative description for a given field based on the provided name and task description. This description should be no longer than 10 words and should be in simple english."""

    # Input: description of the overall task.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task the field is an input to.",
    )
    # Input: name of the field to describe.
    field_name = dspy.InputField(
        prefix="Field Name:",
        desc="Name of the field to generate synthetic data for.",
    )
    # Output: the generated field description.
    field_description = dspy.OutputField(
        prefix="Field Description:",
        desc="Description of the field.",
    )

# Base signature for synthetic input generation. Only the two seed inputs are
# declared here; no output fields are defined in this class.
# NOTE(review): further fields appear to be attached by the synthesizer at
# runtime — confirm against the caller. The docstring is runtime instruction
# text; do not reword casually.
class GenerateInputFieldsData(dspy.Signature):
    """Create synthetic data using the task description and the provided knowledge seed. Your task is to generate diverse and imaginative data that aligns with the given task description and knowledge seed. You are encouraged to be creative and not limit yourself, allowing for a wide range of synthetic data that reflects the characteristics and details provided in the task description. The data should be unique and varied, showcasing originality and creativity while maintaining relevance to the task and knowledge seed."""

    # Input: the seed value; the formatter stringifies whatever is passed in.
    knowledge_seed = dspy.InputField(
        prefix="Knowledge Seed:",
        desc="Seed for the knowledge base search to base the inputs around.",
        format=lambda x: str(x),
    )
    # Input: description of the task to generate inputs for.
    task_description = dspy.InputField(
        prefix="Task Description:",
        desc="Description of the task the field is an input to.",
    )

# Deliberately empty signature: it declares no fields and no docstring.
# NOTE(review): fields (and possibly `__doc__`) are presumably supplied by the
# synthesizer at runtime — confirm against the caller before adding any here.
class GenerateOutputFieldsData(dspy.Signature):
    pass
__all__ = ["Synthesizer"]

class Synthesizer:
def __init__(self):
def __init__(self, config: SynthesizerArguments):
self.config = config
self.input_lm = config.input_lm_model or dspy.settings.lm
self.output_lm = config.output_lm_model or dspy.settings.lm

self.explain_task = dspy.Predict(ExplainTask)
self.understand_task = dspy.Predict(UnderstandTask)
self.generate_field_description = dspy.Predict(GenerateFieldDescription)

self.generate_input_data = GenerateInputFieldsData
Expand All @@ -111,7 +49,12 @@ def _get_field_data(self, key: str, keys_dict: Mapping[str, str]):

return field_name, field_description

def _prepare_synthetic_data_predictors(self, input_keys: Mapping[str, str], output_keys: Mapping[str, str], task_description: str):
def _prepare_synthetic_data_predictors(
self,
input_keys: Mapping[str, str],
output_keys: Mapping[str, str],
ground_source: Optional[Union[List[dspy.Example], dspy.Signature]] = None,
):
for key in tqdm(input_keys, desc="Preparing Input Fields"):
field_name, field_description = self._get_field_data(key, input_keys)

Expand All @@ -125,6 +68,17 @@ def _prepare_synthetic_data_predictors(self, input_keys: Mapping[str, str], outp
output_field,
)

if ground_source:
self.generate_input_data = self.generate_input_data.insert(
-1,
"ground_source",
dspy.InputField(
prefix=f"Pre-Generated Examples:",
desc="Pre-Generated Examples to differ the inputs around.",
format=format_examples,
),
)

input_field = dspy.InputField(
prefix=f"{field_name}:",
desc=field_description,
Expand Down Expand Up @@ -152,7 +106,10 @@ def _prepare_synthetic_data_predictors(self, input_keys: Mapping[str, str], outp

def _get_dataset_metadata(self, ground_source: Union[List[dspy.Example], dspy.Signature]):
if isinstance(ground_source, dspy.SignatureMeta):
task_description = self.explain_task(examples=ground_source.__doc__).explanation
task_description = ground_source.__doc__
if task_description.startswith("Given the fields"):
task_description = self.understand_task(examples=ground_source.__doc__).explanation

input_keys = {k:v.json_schema_extra["desc"] for k,v in ground_source.input_fields.items()}
output_keys = {k:v.json_schema_extra["desc"] for k,v in ground_source.output_fields.items()}

Expand All @@ -172,17 +129,19 @@ def generate(
self,
ground_source: Union[List[dspy.Example], dspy.Signature],
num_data: int,
batch_size: int = None,
batch_size: int = 1,
):
batch_size = batch_size or 1
task_description, input_keys, output_keys = self._get_dataset_metadata(ground_source)

if self.config.num_example_for_optim:
self.generate_input_data.__doc__ = INPUT_GENERATION_TASK_WITH_EXAMPLES
self.generate_output_data.__doc__ = task_description

self.input_predictor, self.output_predictor = self._prepare_synthetic_data_predictors(
input_keys=input_keys,
output_keys=output_keys,
task_description=task_description,
ground_source=ground_source if self.config.num_example_for_optim else None,
)

data = []
Expand All @@ -191,15 +150,38 @@ def generate(
iter_temperature = 0.7+0.01*idx
iter_seed = random.randint(0, 1000000)

inputs = self.input_predictor(task_description=task_description, knowledge_seed=iter_seed, config=dict(temperature=iter_temperature, n=batch_size))
inputs = None

with dspy.context(lm=self.input_lm):
if self.config.num_example_for_optim:
example_for_optimization = random.sample(ground_source, self.config.num_example_for_optim)
inputs = self.input_predictor(
task_description=task_description,
knowledge_seed=iter_seed,
ground_source=example_for_optimization,
config=dict(temperature=iter_temperature, n=batch_size)
)
else:
inputs = self.input_predictor(
task_description=task_description,
knowledge_seed=iter_seed,
config=dict(temperature=iter_temperature, n=batch_size)
)

input_kwargs = [{
key: getattr(completions, key)
for key in input_keys
} for completions in inputs.completions]

for kwargs in input_kwargs:
outputs = self.output_predictor(**kwargs, config=dict(temperature=iter_temperature))
outputs = None

with dspy.context(lm=self.output_lm, temperature=iter_temperature):
if self.config.output_teacher_module:
outputs = self.config.output_teacher_module(**kwargs)

else:
outputs = self.output_predictor(**kwargs, config=dict(temperature=iter_temperature))

output_kwargs = {
key: getattr(outputs, key)
Expand All @@ -210,7 +192,6 @@ def generate(

return data


def export(self, data: List[dspy.Example], path: str, mode: str = None, **kwargs):
extention = mode or path.split(".")[-1]

Expand Down
22 changes: 22 additions & 0 deletions dspy/experimental/synthesizer/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import dspy
from typing import List

def format_examples(examples: "List[dspy.Example] | str") -> str:
    """Render examples as plain "Inputs:/Outputs:" text.

    Args:
        examples: Either an already-formatted string, which is returned
            unchanged, or an iterable of example objects exposing
            ``inputs()`` / ``labels()`` (each with ``keys()``) and
            ``example[key]`` lookup.

    Returns:
        For every example, an ``Inputs:`` line followed by one
        ``key: value`` line per input key, then an ``Outputs:`` line followed
        by one ``key: value`` line per label key. Every line ends with a
        newline; an empty iterable yields ``""``.
    """
    if isinstance(examples, str):
        return examples

    # Collect lines and join once instead of growing a string with `+=`.
    lines = []
    for example in examples:
        lines.append("Inputs:")
        lines.extend(f"{key}: {example[key]}" for key in example.inputs().keys())
        lines.append("Outputs:")
        lines.extend(f"{key}: {example[key]}" for key in example.labels().keys())

    return "".join(f"{line}\n" for line in lines)

0 comments on commit 56d4378

Please sign in to comment.