fixed handling of newlines in custom instructions and curated datasets

rajivpant · May 8, 2024 · 39d5285 · 39d5285
1 parent 7fe5990
commit 39d5285
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 29 deletions.
diff --git a/generate_prompt_template.py b/generate_prompt_template.py
@@ -7,31 +7,23 @@
 
 def generate_prompt_template(instructions_content, datasets_content, output_file):
     prompt_template = f"""
-<prompt>
-You are an AI assistant created to be helpful, harmless, and honest. Your role is to provide guidance, advice, and assistance to the user, drawing upon the custom instructions and curated datasets provided in the attached files.
+# Initial Instructions
+
+You are an AI assistant created to be helpful and honest. Your role is to provide guidance, advice, and assistance to the user, drawing upon the custom instructions and curated datasets provided here.
 
 When responding, please adhere to the following guidelines:
-- Carefully review the custom instructions in the 'instructions.md' file and ensure your responses align with the specified guidelines, communication style, and preferences.
-- Refer to the relevant information in the 'datasets.md' file to provide informed and personalized responses when applicable.
-- If you are unsure about something or if the curated datasets don't cover the specific query, it's okay to say that you don't have enough information to provide a complete answer.
+
+- Carefully review the custom instructions provided here and ensure your responses align with the specified guidelines, communication style, and preferences.
+- Refer to the relevant information in the curated datasets to provide informed and personalized responses when applicable.
+- If you are unsure about something or if the curated datasets don't cover the specific query, it's preferable to say that you don't have enough information to provide a complete answer rather than hallucinate.
 - Always prioritize being helpful, truthful, and aligned with the user's best interests.
 - If there are any contradictions or inconsistencies between the query and the provided custom instructions or curated datasets, seek clarification before responding.
 
-<documents>
-<document index="1">
-<source>instructions.md</source>
-<document_content>
 {instructions_content}
-</document_content>
-</document>
 
-<document index="2">
-<source>datasets.md</source>
-<document_content>
 {datasets_content}
-</document_content>
-</document>
-</documents>
+
+<prompt>
 
 [User Query Here]
 

diff --git a/helpers.py b/helpers.py
@@ -32,27 +32,37 @@ def load_profiles(profiles_file):
         profiles = yaml.safe_load(stream)
     return profiles['profiles']
 
-# Function to load files containing custom instructions or curated datasets
+def process_file(filepath, file_type):
+    """Helper function to read and format the content of a file."""
+    unique_id = str(uuid.uuid4())
+    document_start_tag = f"<document:{unique_id} path=\"{filepath}\" type=\"{file_type}\">"
+    document_end_tag = f"</document:{unique_id}>"
+    with open(filepath, "r") as file:
+        # Read the entire file content as a single string
+        file_content = file.read() 
+
+    # Add a newline after each tag; none is inserted between the content and the closing tag, so only the file's own trailing newline (if any) separates them
+    full_content = f"{document_start_tag}\n{file_content}{document_end_tag}\n"
+    return full_content, filepath
+
 def load_files(file_paths, file_type):
     """Load files containing custom instructions or curated datasets."""
     files_content = []
     files_list = []  # to store file names
     for path in file_paths:
         if os.path.isfile(path):
-            with open(path, "r") as file:
-                unique_id = str(uuid.uuid4())
-                files_content.append(f"<ragbot-file:{unique_id} path=\"{path}\" type=\"{file_type}\">")
-                files_content.append(file.read())
-                files_content.append(f"</ragbot-file:{unique_id}>")
-                files_list.append(path)  # save file name
+            content, filename = process_file(path, file_type)
+            files_content.append(content)
+            files_list.append(filename)  # save file name
         elif os.path.isdir(path):
             for filepath in glob.glob(os.path.join(path, "*")):
-                if os.path.isfile(filepath):  # Check if the path is a file
-                    with open(filepath, "r") as file:
-                        files_content.append(file.read())
-                        files_list.append(filepath)  # save file name
+                if os.path.isfile(filepath):
+                    content, filename = process_file(filepath, file_type)
+                    files_content.append(content)
+                    files_list.append(filename)  # save file name
 
-    return files_content, files_list
+    files_content_str = "\n".join(files_content)
+    return files_content_str, files_list
 
 # Function to count tokens in a list of files
 def count_tokens(file_paths):