Skip to content

Commit

Permalink
Merge branch 'pre-release' into pre-release
Browse files Browse the repository at this point in the history
  • Loading branch information
vyokky authored Mar 6, 2024
2 parents bfd0bf6 + 0ebe839 commit c480b5a
Show file tree
Hide file tree
Showing 12 changed files with 380 additions and 14 deletions.
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,9 @@ __pycache__/

# Ignore the config file
ufo/config/config.yaml
*.yaml.test
*.yaml


# Ignore the helper files
ufo/rag/app_docs/*
learner/records.json
vectordb/*


2 changes: 2 additions & 0 deletions learner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
8 changes: 8 additions & 0 deletions learner/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from . import learn

if __name__ == "__main__":
# Execute the main script
learn.main()
39 changes: 39 additions & 0 deletions learner/basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from . import utils


class BasicDocumentLoader:
    """
    A class to load documents from a directory, filtered by a file
    extension (or tuple of extensions).
    """

    def __init__(self, extensions: str = None, directory: str = None):
        """
        Create a new BasicDocumentLoader.
        :param extensions: The file extension(s) to load, e.g. ".xml".
        :param directory: The directory to load documents from.
        """
        self.extensions = extensions
        self.directory = directory

    def load_file_name(self):
        """
        List the files under ``self.directory`` matching ``self.extensions``.
        :return: The list of matching file paths.
        """
        return utils.find_files_with_extension(self.directory, self.extensions)

    def construct_document_list(self):
        """
        Construct the list of documents; subclasses are expected to
        override this with a format-specific implementation.
        :return: The list of metadata for the loaded documents.
        """
        pass




63 changes: 63 additions & 0 deletions learner/indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from . import xml_loader
from .utils import load_json_file, save_json_file, print_with_color
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os


def create_indexer(app: str, docs: str, format: str, incremental: bool, save_path: str):
    """
    Create a FAISS indexer over the help documents of the given application
    and record its save location in ./learner/records.json.
    :param app: The name of the application to create an indexer for.
    :param docs: The help documents dir for the application.
    :param format: The format of the help documents (only "xml" is supported).
    :param incremental: Whether to merge with a previously saved indexer for the app.
    :param save_path: The directory to save the indexer under.
    :return: The absolute path the indexer was saved to.
    :raises ValueError: If ``format`` is not "xml".
    """

    # Validate the format up front, before any expensive document loading.
    # (NOTE: the parameter name `format` shadows the builtin but is kept
    # for interface compatibility with existing callers.)
    if format != "xml":
        raise ValueError("Invalid format: " + format)

    records_path = "./learner/records.json"
    if os.path.exists(records_path):
        records = load_json_file(records_path)
    else:
        records = {}

    print_with_color("Loading documents from {docs}...".format(docs=docs), "cyan")

    loader = xml_loader.XMLLoader(docs)
    documents = loader.construct_document()

    print_with_color("Creating indexer for {num} documents for {app}...".format(num=len(documents), app=app), "yellow")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

    db = FAISS.from_documents(documents, embeddings)

    if incremental and app in records:
        # Fold the previously saved index for this app into the new one.
        print_with_color("Merging with previous indexer...", "yellow")
        prev_db = FAISS.load_local(records[app], embeddings)
        db.merge_from(prev_db)

    db_file_path = os.path.abspath(os.path.join(save_path, app))
    db.save_local(db_file_path)

    # Remember where this app's index lives so later incremental runs can find it.
    records[app] = db_file_path
    save_json_file(records_path, records)

    print_with_color("Indexer for {app} created successfully. Save in {path}.".format(app=app, path=db_file_path), "green")

    return db_file_path





36 changes: 36 additions & 0 deletions learner/learn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from . import indexer



# configs = load_config()

args = argparse.ArgumentParser()
args.add_argument("--app", help="The name of application to learn.",
type=str, default="./")
args.add_argument("--docs", help="The help application of the app.", type=str,
default="./")
args.add_argument("--format", help="The format of the help doc.", type=str,
default="xml")
args.add_argument('--incremental', action='store_true', help='Enable incremental update.')
args.add_argument("--save_path", help="The format of the help doc.", type=str,
default="./vectordb/docs/")




parsed_args = args.parse_args()

def main():
"""
Main function.
"""

db_file_path = indexer.create_indexer(parsed_args.app, parsed_args.docs, parsed_args.format, parsed_args.incremental, parsed_args.save_path)


if __name__ == "__main__":
main()
95 changes: 95 additions & 0 deletions learner/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import json
from colorama import Fore, Style, init

# init colorama
init()

def print_with_color(text: str, color: str = ""):
    """
    Print *text* wrapped in the requested colorama foreground color.
    :param text: The text to print.
    :param color: Color name (red, green, yellow, blue, magenta, cyan, white, black);
                  anything else prints without a color prefix.
    """
    palette = {
        "red": Fore.RED,
        "green": Fore.GREEN,
        "yellow": Fore.YELLOW,
        "blue": Fore.BLUE,
        "magenta": Fore.MAGENTA,
        "cyan": Fore.CYAN,
        "white": Fore.WHITE,
        "black": Fore.BLACK,
    }

    # Unknown color names fall back to an empty prefix (plain text).
    prefix = palette.get(color.lower(), "")
    print(f"{prefix}{text}{Style.RESET_ALL}")



def find_files_with_extension(directory, extension):
    """
    Recursively collect files under *directory* whose names end with *extension*.
    :param directory: The directory to search.
    :param extension: The file-name suffix to search for, e.g. ".xml".
    :return: The list of matching files as resolved (realpath) paths.
    """
    return [
        os.path.realpath(os.path.join(folder, name))
        for folder, _, names in os.walk(directory)
        for name in names
        if name.endswith(extension)
    ]



def find_files_with_extension_list(directory, extensions):
    """
    Find files with any of the given extensions in the given directory tree.
    :param directory: The directory to search.
    :param extensions: The list of extensions to search for, e.g. [".xml", ".txt"].
    :return: The list of matching files as resolved (realpath) paths.
    """
    # Build the suffix tuple once; the original rebuilt it for every file.
    # str.endswith accepts a tuple and matches any of its members.
    suffixes = tuple(extensions)

    matching_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(suffixes):
                matching_files.append(os.path.realpath(os.path.join(root, file)))

    return matching_files



def load_json_file(file_path):
    """
    Read and parse a JSON file.
    :param file_path: The path of the JSON file to read.
    :return: The parsed JSON data.
    """
    with open(file_path, 'r') as handle:
        return json.load(handle)



def save_json_file(file_path, data):
    """
    Serialize *data* to *file_path* as indented JSON.
    :param file_path: The path of the file to write.
    :param data: The JSON-serializable object to save.
    """
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=4)
112 changes: 112 additions & 0 deletions learner/xml_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from . import basic
import os
from langchain_community.document_loaders import UnstructuredXMLLoader
from langchain.docstore.document import Document
import xml.etree.ElementTree as ET


class XMLLoader(basic.BasicDocumentLoader):
    """
    Document loader for XML help files: pairs each .xml file with an
    optional companion ".meta" XML file carrying title/summary metadata.
    """

    def __init__(self, directory: str = None):
        """
        Create a new XMLLoader rooted at *directory*.
        :param directory: The directory to load .xml files from.
        """
        super().__init__()
        self.extensions = ".xml"
        self.directory = directory

    def get_microsoft_document_metadata(self, file: str):
        """
        Extract title and summary metadata from a metadata XML file.
        :param file: The metadata file to read.
        :return: Dict with 'title' and 'summary' keys; when the file does
                 not exist, both fall back to the file's basename.
        """
        if not os.path.exists(file):
            fallback = os.path.basename(file)
            return {'title': fallback, 'summary': fallback}

        root = ET.parse(file).getroot()

        # Title is the text of a <title> element, when present.
        title_node = root.find('title')
        title = title_node.text if title_node is not None else None

        # Summary comes from the 'value' attribute of <Content-Summary>.
        summary_node = root.find('Content-Summary')
        summary = summary_node.attrib['value'] if summary_node is not None else None

        return {'title': title, 'summary': summary}

    def get_microsoft_document_text(self, file: str):
        """
        Load the document body of *file*.
        :param file: The file to load.
        :return: The first langchain document parsed from the file.
        """
        return UnstructuredXMLLoader(file).load()[0]

    def construct_document_list(self):
        """
        Build a plain-dict document for every XML file in the directory.
        :return: The list of dicts with 'title', 'summary' and 'text' keys.
        """
        documents = []
        for path in self.load_file_name():
            # Metadata lives in a sibling file named "<doc>.xml.meta".
            meta = self.get_microsoft_document_metadata(path + ".meta")
            documents.append({
                'title': meta["title"],
                'summary': meta["summary"],
                'text': self.get_microsoft_document_text(path),
            })
        return documents

    def construct_document(self):
        """
        Build a langchain Document for every XML file; page_content is
        "<title> - <summary>" and the full text is kept in metadata.
        :return: The list of langchain Documents.
        """
        documents = []
        for path in self.load_file_name():
            body = self.get_microsoft_document_text(path)
            meta = self.get_microsoft_document_metadata(path + ".meta")
            title, summary = meta["title"], meta["summary"]

            documents.append(Document(
                page_content="{title} - {summary}".format(title=title, summary=summary),
                metadata={
                    'title': title,
                    'summary': summary,
                    'text': body,
                },
            ))
        return documents




14 changes: 12 additions & 2 deletions ufo/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ def load_config(config_path="ufo/config/config.yaml"):
if yaml_data:
configs.update(yaml_data)
except FileNotFoundError:
print(f"Warning: Config file not found at {config_path}. Using only environment variables.")
print(
f"Warning: Config file not found at {config_path}. Using only environment variables.")

return configs
# Update the API base URL for AOAI
if configs["API_TYPE"].lower() == "aoai":
configs["OPENAI_API_BASE"] = "{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}".format(
endpoint=configs["OPENAI_API_BASE"][:-1] if configs["OPENAI_API_BASE"].endswith(
"/") else configs["OPENAI_API_BASE"],
deployment_name=configs["AOAI_DEPLOYMENT"],
api_version="2024-02-15-preview"
)

return configs
Loading

0 comments on commit c480b5a

Please sign in to comment.