forked from microsoft/UFO
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'pre-release' into pre-release
- Loading branch information
Showing
12 changed files
with
380 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
|
||
from . import learn | ||
|
||
if __name__ == "__main__":
    # Run the learner's main routine only when this module is executed
    # directly, not when it is merely imported.
    learn.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
from . import utils | ||
|
||
|
||
class BasicDocumentLoader:
    """
    Base loader that looks up document files in a directory by extension.
    """

    def __init__(self, extensions: str = None, directory: str = None):
        """
        Create a new BasicDocumentLoader.
        :param extensions: The file extension(s) to look for.
        :param directory: The directory to search for documents.
        """
        self.directory = directory
        self.extensions = extensions

    def load_file_name(self):
        """
        Look up the files under the configured directory that match the
        configured extensions.
        :return: The result of utils.find_files_with_extension for the
            configured directory and extensions.
        """
        found_files = utils.find_files_with_extension(self.directory, self.extensions)
        return found_files

    def construct_document_list(self):
        """
        Placeholder for building the document list.
        The base implementation does nothing.
        :return: None.
        """
        return None
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
|
||
from . import xml_loader | ||
from .utils import load_json_file, save_json_file, print_with_color | ||
from langchain_community.embeddings import HuggingFaceEmbeddings | ||
from langchain_community.vectorstores import FAISS | ||
import os | ||
|
||
|
||
def create_indexer(app: str, docs: str, format: str, incremental: bool, save_path: str):
    """
    Create a vector-store indexer for the given application and save it to disk.
    :param app: The name of the application to create an indexer for.
    :param docs: The help documents dir for the application.
    :param format: The format of the help documents. Only "xml" is supported.
    :param incremental: Whether to merge the new indexer with the one previously recorded for the app.
    :param save_path: The directory under which the indexer is saved, in a sub-folder named after the app.
    :return: The absolute path the indexer was saved to.
    :raises ValueError: If the format is not supported.
    """

    # Fail fast: reject unsupported formats before any document is loaded or embedded.
    if format != "xml":
        raise ValueError("Invalid format: " + format)

    # Bookkeeping file mapping each app name to the path of its saved indexer.
    # The path is relative, so it is resolved against the current working directory.
    records_path = "./learner/records.json"

    records = load_json_file(records_path) if os.path.exists(records_path) else {}

    print_with_color("Loading documents from {docs}...".format(docs=docs), "cyan")

    loader = xml_loader.XMLLoader(docs)
    documents = loader.construct_document()

    print_with_color("Creating indexer for {num} documents for {app}...".format(num=len(documents), app=app), "yellow")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    db = FAISS.from_documents(documents, embeddings)

    if incremental and app in records:
        print_with_color("Merging with previous indexer...", "yellow")
        # NOTE(review): FAISS.load_local presumably deserializes pickled data from
        # the recorded folder - only point it at indexers this tool saved itself; confirm.
        prev_db = FAISS.load_local(records[app], embeddings)
        db.merge_from(prev_db)

    db_file_path = os.path.abspath(os.path.join(save_path, app))
    db.save_local(db_file_path)

    # Remember where this app's indexer lives so a later incremental run can merge with it.
    records[app] = db_file_path
    save_json_file(records_path, records)

    print_with_color("Indexer for {app} created successfully. Save in {path}.".format(app=app, path=db_file_path), "green")

    return db_file_path
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
|
||
import argparse | ||
from . import indexer | ||
|
||
|
||
|
||
# configs = load_config() | ||
|
||
# Command-line interface of the learner. Only the parser is built at import
# time; the command line itself is parsed inside main(), so importing this
# module never reads sys.argv.
args = argparse.ArgumentParser()
args.add_argument("--app", help="The name of application to learn.",
                  type=str, default="./")
args.add_argument("--docs", help="The help documents dir of the app.", type=str,
                  default="./")
args.add_argument("--format", help="The format of the help doc.", type=str,
                  default="xml")
args.add_argument('--incremental', action='store_true', help='Enable incremental update.')
args.add_argument("--save_path", help="The path to save the indexer to.", type=str,
                  default="./vectordb/docs/")


def main(argv=None):
    """
    Main function: parse the command line and create the indexer.
    :param argv: Optional list of arguments to parse. When None, argparse
        falls back to the process command line.
    :return: The value returned by indexer.create_indexer.
    """
    parsed_args = args.parse_args(argv)

    return indexer.create_indexer(
        parsed_args.app,
        parsed_args.docs,
        parsed_args.format,
        parsed_args.incremental,
        parsed_args.save_path,
    )


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
import os | ||
import json | ||
from colorama import Fore, Style, init | ||
|
||
# init colorama | ||
init() | ||
|
||
def print_with_color(text: str, color: str = ""):
    """
    Print text with specified color using ANSI escape codes from Colorama library.
    :param text: The text to print.
    :param color: The color of the text (options: red, green, yellow, blue, magenta, cyan, white, black).
        The lookup is case-insensitive; an unknown, empty or None color prints the text without a color prefix.
    """
    color_mapping = {
        "red": Fore.RED,
        "green": Fore.GREEN,
        "yellow": Fore.YELLOW,
        "blue": Fore.BLUE,
        "magenta": Fore.MAGENTA,
        "cyan": Fore.CYAN,
        "white": Fore.WHITE,
        "black": Fore.BLACK,
    }

    # `color or ""` lets callers pass None without crashing on .lower().
    selected_color = color_mapping.get((color or "").lower(), "")

    # Always reset the style afterwards so the color does not leak into later output.
    print(selected_color + text + Style.RESET_ALL)
|
||
|
||
|
||
def _find_files_with_suffixes(directory, suffixes):
    """
    Walk the directory tree and collect files whose name ends with the given suffix(es).
    :param directory: The directory to search (sub-directories included).
    :param suffixes: A string or a tuple of strings, as accepted by str.endswith.
    :return: The list of matching files, as real (symlink-resolved) paths.
    """
    matching_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(suffixes):
                matching_files.append(os.path.realpath(os.path.join(root, file)))

    return matching_files


def find_files_with_extension(directory, extension):
    """
    Find files with the given extension in the given directory.
    :param directory: The directory to search.
    :param extension: The extension to search for.
    :return: The list of matching files.
    """
    return _find_files_with_suffixes(directory, extension)


def find_files_with_extension_list(directory, extensions):
    """
    Find files with the given extensions in the given directory.
    :param directory: The directory to search.
    :param extensions: The list of extensions to search for. A single string
        is treated as one extension.
    :return: The list of matching files.
    """
    # tuple() on a bare string would split it into single characters
    # (".xml" -> ".", "x", "m", "l") and match unrelated files, so wrap it first.
    if isinstance(extensions, str):
        extensions = (extensions,)

    return _find_files_with_suffixes(directory, tuple(extensions))
|
||
|
||
|
||
def load_json_file(file_path):
    """
    Load a JSON file.
    :param file_path: The path to the file to load.
    :return: The loaded JSON data.
    """

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
|
||
|
||
|
||
def save_json_file(file_path, data):
    """
    Save a JSON file.
    :param file_path: The path to the file to save.
    :param data: The JSON-serializable data to write.
    """

    # Write as UTF-8 explicitly so the file does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
|
||
from . import basic | ||
import os | ||
from langchain_community.document_loaders import UnstructuredXMLLoader | ||
from langchain.docstore.document import Document | ||
import xml.etree.ElementTree as ET | ||
|
||
|
||
class XMLLoader(basic.BasicDocumentLoader):
    """
    Class to load XML documents.
    """

    def __init__(self, directory: str = None):
        """
        Create a new XMLLoader.
        :param directory: The directory to search for ".xml" documents.
        """

        super().__init__(extensions=".xml", directory=directory)

    def get_microsoft_document_metadata(self, file: str):
        """
        Get the metadata for the given file.
        :param file: The metadata file to read.
        :return: A dict with 'title' and 'summary'. When the file does not exist,
            both are the file's base name. When the file exists but an element
            (or the summary's 'value' attribute) is missing, that entry is None.
        """

        if not os.path.exists(file):
            fallback = os.path.basename(file)
            return {'title': fallback, 'summary': fallback}

        root = ET.parse(file).getroot()

        # Extracting title
        title_node = root.find('title')
        title = title_node.text if title_node is not None else None

        # Extracting content summary; .get() avoids a KeyError when the
        # element is present but carries no 'value' attribute.
        summary_node = root.find('Content-Summary')
        summary = summary_node.attrib.get('value') if summary_node is not None else None

        return {'title': title, 'summary': summary}

    def get_microsoft_document_text(self, file: str):
        """
        Get the text for the given file.
        :param file: The file to get the text for.
        :return: The first item loaded by UnstructuredXMLLoader for the file.
        """
        # NOTE(review): presumably a langchain Document rather than a plain
        # string - confirm what consumers of the 'text' field expect.
        return UnstructuredXMLLoader(file).load()[0]

    def _iter_document_fields(self):
        """
        Yield one dict with 'title', 'summary' and 'text' per matching file.
        The metadata is read from a sibling file named "<file>.meta".
        """
        for file in self.load_file_name():
            text = self.get_microsoft_document_text(file)
            metadata = self.get_microsoft_document_metadata(file + ".meta")
            yield {
                'title': metadata["title"],
                'summary': metadata["summary"],
                'text': text,
            }

    def construct_document_list(self):
        """
        Construct a list of documents.
        :return: The list of documents, each a dict with 'title', 'summary' and 'text'.
        """
        return list(self._iter_document_fields())

    def construct_document(self):
        """
        Construct a langchain document list.
        :return: The langchain document list. Each document's page content is
            "<title> - <summary>", and its metadata holds 'title', 'summary' and 'text'.
        """
        documents = []
        for fields in self._iter_document_fields():
            # NOTE(review): a missing title or summary is None here and is
            # formatted as the literal text "None".
            page_content = """{title} - {summary}""".format(title=fields['title'], summary=fields['summary'])
            documents.append(Document(page_content=page_content, metadata=fields))
        return documents
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.