Merge branch 'pre-release' into pre-release

Optimose · Mar 6, 2024 · c480b5a · c480b5a
2 parents bfd0bf6 + 0ebe839
commit c480b5a
Show file tree

Hide file tree

Showing 12 changed files with 380 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,12 +12,9 @@ __pycache__/
 
 # Ignore the config file
 ufo/config/config.yaml
-*.yaml.test
-*.yaml
+
 
 # Ignore the helper files
 ufo/rag/app_docs/*
 learner/records.json
 vectordb/*
-
-
diff --git a/learner/__init__.py b/learner/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
diff --git a/learner/__main__.py b/learner/__main__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from . import learn
+
+# Entry point when the package is run as a script: delegate to the CLI in learn.py.
+if __name__ == "__main__":
+    # learn.main() passes the parsed command-line arguments to indexer.create_indexer.
+    learn.main()
diff --git a/learner/basic.py b/learner/basic.py
@@ -0,0 +1,39 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from . import utils
+
+
+class BasicDocumentLoader:
+    """
+    Base loader that locates documents under a directory by file extension.
+    """
+
+    def __init__(self, extensions: str = None, directory: str = None):
+        """
+        Create a new BasicDocumentLoader.
+        :param extensions: The file extension to match when searching for documents.
+        :param directory: The directory to search for documents.
+        """
+        self.directory = directory
+        self.extensions = extensions
+
+    def load_file_name(self):
+        """
+        Collect the files under self.directory whose names end with self.extensions.
+        :return: The list of matching file paths.
+        """
+        file_paths = utils.find_files_with_extension(self.directory, self.extensions)
+        return file_paths
+
+    def construct_document_list(self):
+        """
+        Placeholder for building the document list; the base implementation does nothing.
+        :return: None.
+        """
+        return None
+
+
+
+
diff --git a/learner/indexer.py b/learner/indexer.py
@@ -0,0 +1,63 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from . import xml_loader
+from .utils import load_json_file, save_json_file, print_with_color
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+import os
+
+
+def create_indexer(app: str, docs: str, format: str, incremental: bool, save_path: str):
+    """
+    Create a FAISS indexer for the given application from its help documents.
+    :param app: The name of the application to create an indexer for.
+    :param docs: The help documents dir for the application.
+    :param format: The format of the help documents (only "xml" is supported).
+    :param incremental: Whether to merge with the indexer previously recorded for the app.
+    :param save_path: The directory under which the indexer is saved, in a sub-folder named after the app.
+    :return: The absolute path the indexer was saved to.
+    :raises ValueError: If the format is not supported or no documents are found.
+    """
+
+    # Fail fast on an unsupported format, before any documents are loaded or embedded.
+    if format != "xml":
+        raise ValueError("Invalid format: " + format)
+
+    # Maps each application name to the path of its most recently saved indexer.
+    records_path = "./learner/records.json"
+
+    if os.path.exists(records_path):
+        records = load_json_file(records_path)
+    else:
+        records = {}
+
+    print_with_color("Loading documents from {docs}...".format(docs=docs), "cyan")
+
+    loader = xml_loader.XMLLoader(docs)
+    documents = loader.construct_document()
+
+    # An empty document list cannot be indexed; report it clearly instead of
+    # letting the vector store fail on it.
+    if not documents:
+        raise ValueError("No documents found in: " + docs)
+
+    print_with_color("Creating indexer for {num} documents for {app}...".format(num=len(documents), app=app), "yellow")
+
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+    db = FAISS.from_documents(documents, embeddings)
+
+    if incremental and app in records:
+        print_with_color("Merging with previous indexer...", "yellow")
+        prev_db = FAISS.load_local(records[app], embeddings)
+        db.merge_from(prev_db)
+
+    db_file_path = os.path.abspath(os.path.join(save_path, app))
+    db.save_local(db_file_path)
+
+    # Remember where this application's indexer lives for later incremental runs.
+    records[app] = db_file_path
+    save_json_file(records_path, records)
+
+    print_with_color("Indexer for {app} created successfully. Save in {path}.".format(app=app, path=db_file_path), "green")
+
+    return db_file_path
+
+
+
+
+
diff --git a/learner/learn.py b/learner/learn.py
@@ -0,0 +1,36 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from . import indexer
+
+
+
+# configs = load_config()
+
+# Command-line interface definition. Only the parser is built at import time;
+# the arguments themselves are parsed inside main().
+args = argparse.ArgumentParser()
+args.add_argument("--app", help="The name of application to learn.",
+                  type=str, default="./")
+args.add_argument("--docs", help="The directory of the help documents of the app.", type=str,
+                  default="./")
+args.add_argument("--format", help="The format of the help doc.", type=str,
+                  default="xml")
+args.add_argument('--incremental', action='store_true', help='Enable incremental update.')
+args.add_argument("--save_path", help="The directory to save the created indexer to.", type=str,
+                  default="./vectordb/docs/")
+
+
+def main(argv=None):
+    """
+    Parse the command-line arguments and create the indexer.
+    :param argv: Optional list of argument strings; when None, argparse reads sys.argv[1:].
+    """
+
+    # Parse here rather than at import time so that importing this module has
+    # no side effects and cannot exit the interpreter on unknown arguments.
+    parsed_args = args.parse_args(argv)
+
+    indexer.create_indexer(parsed_args.app, parsed_args.docs, parsed_args.format, parsed_args.incremental, parsed_args.save_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/learner/utils.py b/learner/utils.py
@@ -0,0 +1,95 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+import os
+import json
+from colorama import Fore, Style, init
+
+# init colorama
+init()
+
+def print_with_color(text: str, color: str = ""):
+    """
+    Print text in the requested color, then reset the terminal style.
+
+    :param text: The text to print.
+    :param color: Case-insensitive color name (red, green, yellow, blue, magenta, cyan, white, black).
+                  Any other value prints the text without a color prefix.
+    """
+    supported = ("red", "green", "yellow", "blue", "magenta", "cyan", "white", "black")
+    # Each supported name maps to the Colorama Fore attribute of the same (upper-cased) name.
+    palette = {name: getattr(Fore, name.upper()) for name in supported}
+
+    prefix = palette.get(color.lower(), "")
+    print(prefix + text + Style.RESET_ALL)
+
+
+
+def find_files_with_extension(directory, extension):
+    """
+    Recursively collect the files under a directory whose names end with an extension.
+    :param directory: The directory to search.
+    :param extension: The extension to search for.
+    :return: The list of matching files, each as a resolved real path.
+    """
+    return [
+        os.path.realpath(os.path.join(folder, name))
+        for folder, _, names in os.walk(directory)
+        for name in names
+        if name.endswith(extension)
+    ]
+
+
+
+def find_files_with_extension_list(directory, extensions):
+    """
+    Find files with any of the given extensions in the given directory (searched recursively).
+    :param directory: The directory to search.
+    :param extensions: The extensions to search for (any iterable of strings).
+    :return: The list of matching files, each as a resolved real path.
+    """
+    # Build the suffix tuple once: str.endswith needs a tuple, and converting inside
+    # the loop would exhaust a one-shot iterable (e.g. a generator) after the first file.
+    suffixes = tuple(extensions)
+    matching_files = []
+
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith(suffixes):
+                matching_files.append(os.path.realpath(os.path.join(root, file)))
+
+    return matching_files
+
+
+
+def load_json_file(file_path):
+    """
+    Load a JSON file.
+    :param file_path: The path to the file to load.
+    :return: The loaded JSON data.
+    """
+
+    # JSON text is UTF-8; do not depend on the platform's locale encoding.
+    with open(file_path, 'r', encoding='utf-8') as file:
+        return json.load(file)
+
+
+
+def save_json_file(file_path, data):
+    """
+    Write data to a file as JSON, indented by four spaces.
+    :param file_path: The path to the file to save.
+    :param data: The JSON-serializable data to write.
+    """
+    with open(file_path, 'w') as handle:
+        json.dump(data, handle, indent=4)
diff --git a/learner/xml_loader.py b/learner/xml_loader.py
@@ -0,0 +1,112 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+from . import basic
+import os
+from langchain_community.document_loaders import UnstructuredXMLLoader
+from langchain.docstore.document import Document
+import xml.etree.ElementTree as ET
+
+
+class XMLLoader(basic.BasicDocumentLoader):
+    """
+    Class to load XML documents together with their optional ".meta" metadata files.
+    """
+
+    def __init__(self, directory: str = None):
+        """
+        Create a new XMLLoader.
+        :param directory: The directory to search for ".xml" files.
+        """
+
+        # Hand both values to the base class rather than overwriting its attributes afterwards.
+        super().__init__(extensions=".xml", directory=directory)
+
+    def get_microsoft_document_metadata(self, file: str):
+        """
+        Get the metadata for the given file.
+        :param file: The path of the metadata XML file.
+        :return: A dict with 'title' and 'summary'. When the file does not exist both are
+                 its base name; a missing element or attribute yields None for that entry.
+        """
+
+        if not os.path.exists(file):
+            fallback = os.path.basename(file)
+            return {'title': fallback, 'summary': fallback}
+
+        root = ET.parse(file).getroot()
+
+        # Extracting title (look the element up once)
+        title_node = root.find('title')
+        title = title_node.text if title_node is not None else None
+
+        # Extracting content summary; .get() keeps an element without a 'value'
+        # attribute from aborting the whole run with a KeyError.
+        summary_node = root.find('Content-Summary')
+        summary = summary_node.get('value') if summary_node is not None else None
+
+        return {'title': title, 'summary': summary}
+
+    def get_microsoft_document_text(self, file: str):
+        """
+        Get the text for the given file.
+        :param file: The file to get the text for.
+        :return: The first item returned by UnstructuredXMLLoader(file).load().
+        """
+        return UnstructuredXMLLoader(file).load()[0]
+
+    def _iter_document_fields(self):
+        """
+        Yield (title, summary, text) for every file returned by load_file_name().
+        The metadata is read from the file path with ".meta" appended.
+        """
+        for file in self.load_file_name():
+            text = self.get_microsoft_document_text(file)
+            metadata = self.get_microsoft_document_metadata(file + ".meta")
+            yield metadata["title"], metadata["summary"], text
+
+    def construct_document_list(self):
+        """
+        Construct a list of documents.
+        :return: The list of documents, each a dict with 'title', 'summary' and 'text'.
+        """
+        return [
+            {'title': title, 'summary': summary, 'text': text}
+            for title, summary, text in self._iter_document_fields()
+        ]
+
+    def construct_document(self):
+        """
+        Construct a langchain document list.
+        :return: The langchain document list; each page content is "<title> - <summary>"
+                 and the loaded text is kept in the metadata.
+        """
+        documents = []
+        for title, summary, text in self._iter_document_fields():
+            # NOTE(review): a None title or summary is formatted as the string "None" here.
+            page_content = """{title} - {summary}""".format(title=title, summary=summary)
+            metadata = {
+                'title': title,
+                'summary': summary,
+                'text': text
+            }
+            documents.append(Document(page_content=page_content, metadata=metadata))
+        return documents
+
+
+
+
diff --git a/ufo/config/config.py b/ufo/config/config.py
@@ -22,6 +22,16 @@ def load_config(config_path="ufo/config/config.yaml"):
         if yaml_data:
             configs.update(yaml_data)
     except FileNotFoundError:
-        print(f"Warning: Config file not found at {config_path}. Using only environment variables.")
+        print(
+            f"Warning: Config file not found at {config_path}. Using only environment variables.")
 
-    return configs
+    # Update the API base URL for AOAI
+    if configs["API_TYPE"].lower() == "aoai":
+        configs["OPENAI_API_BASE"] = "{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}".format(
+            endpoint=configs["OPENAI_API_BASE"][:-1] if configs["OPENAI_API_BASE"].endswith(
+                "/") else configs["OPENAI_API_BASE"],
+            deployment_name=configs["AOAI_DEPLOYMENT"],
+            api_version="2024-02-15-preview"
+        )
+
+    return configs
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Copyright (c) Microsoft Corporation.
		# Licensed under the MIT License.