Merge remote-tracking branch 'upstream/main' into github-reader-test-…

…and-fix
crecterle · Feb 28, 2023 · 1543509 · 1543509
2 parents 62ea978 + 65a1f5e
commit 1543509
Show file tree

Hide file tree

Showing 27 changed files with 441 additions and 118 deletions.
diff --git a/loader_hub/bilibili/base.py b/loader_hub/bilibili/base.py
@@ -1,20 +1,21 @@
 """Simple Reader that reads transcript and general infor of Bilibili video."""
-from typing import Any, List, Optional
 import warnings
+from typing import Any, List
 
-from gpt_index.readers.base import BaseReader
-from gpt_index.readers.schema.base import Document
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
 
 
 class BilibiliTranscriptReader(BaseReader):
     """Bilibili Transcript and video info reader."""
 
     @staticmethod
     def get_bilibili_info_and_subs(bili_url):
+        import json
         import re
-        from bilibili_api import sync, video
+
         import requests
-        import json
+        from bilibili_api import sync, video
 
         bvid = re.search(r"BV\w+", bili_url).group()
         # Create credential object

diff --git a/loader_hub/dad_jokes/base.py b/loader_hub/dad_jokes/base.py
@@ -1,11 +1,12 @@
 """dad_jokes reader"""
 
-import requests
 from typing import List
 
+import requests
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
 
+
 class DadJokesReader(BaseReader):
     """Dad jokes reader.
 
@@ -14,7 +15,9 @@ class DadJokesReader(BaseReader):
     """
 
     def _get_random_dad_joke(self):
-        response = requests.get("https://icanhazdadjoke.com/", headers={"Accept": "application/json"})
+        response = requests.get(
+            "https://icanhazdadjoke.com/", headers={"Accept": "application/json"}
+        )
         response.raise_for_status()
         json_data = response.json()
         return json_data["joke"]
@@ -26,4 +29,4 @@ def load_data(self) -> List[Document]:
             None.
 
         """
-        return [Document(self._get_random_dad_joke())]
+        return [Document(self._get_random_dad_joke())]
diff --git a/loader_hub/file/rdf/README.md b/loader_hub/file/rdf/README.md
@@ -0,0 +1,19 @@
+# RDF Loader
+
+This loader extracts triples from a local [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) file using the `rdflib` Python package. The loader currently supports the RDF and RDF Schema namespaces. A single local file is passed in each time you call `load_data`.
+
+## Usage
+
+To use this loader, you need to pass in a `Path` to a local file.
+
+```python
+from pathlib import Path
+from gpt_index import download_loader
+
+RDFReader = download_loader("RDFReader")
+
+loader = RDFReader()
+documents = loader.load_data(file=Path('./knowledge-graph.nt'))
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/loader_hub/file/rdf/__init__.py b/loader_hub/file/rdf/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/file/rdf/base.py b/loader_hub/file/rdf/base.py
@@ -0,0 +1,112 @@
+"""Read RDF files."""
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+
+class RDFReader(BaseReader):
+    """RDF reader.
+
+    Renders every triple of a file (``rdfs:label`` statements excepted) as a
+    ``<subject> <predicate> <object>`` line in which each term is replaced by
+    its ``rdfs:label``.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize loader."""
+        super().__init__(*args, **kwargs)
+
+        # Imported here rather than at module level, so importing this module
+        # does not itself require rdflib.
+        from rdflib import Graph
+        from rdflib.namespace import RDF, RDFS
+
+        self.Graph = Graph
+        self.RDF = RDF
+        self.RDFS = RDFS
+
+    def fetch_labels(self, uri: Any, graph: Any, lang: str) -> List[Any]:
+        """Fetch all labels of a URI by language.
+
+        Args:
+            uri: Term whose ``rdfs:label`` values are looked up.
+            graph: Graph to search.
+            lang: Language tag to keep; labels without a language tag are
+                kept as well.
+
+        Returns:
+            The matching label objects, in the order the graph yields them.
+        """
+        return [
+            label
+            for label in graph.objects(uri, self.RDFS.label)
+            if label.language in (lang, None)
+        ]
+
+    def fetch_label_in_graphs(self, uri: Any, lang: str = "en") -> Any:
+        """Fetch one label of a URI by language from the local or global graph.
+
+        The graph parsed from the file is searched first, then the graph built
+        from the RDF and RDFS namespaces.
+
+        Raises:
+            Exception: If neither graph holds a matching label for ``uri``.
+        """
+        for graph in (self.g_local, self.g_global):
+            labels = self.fetch_labels(uri, graph, lang)
+            if labels:
+                return labels[0].value
+
+        raise Exception(f"Label not found for: {uri}")
+
+    def load_data(
+        self, file: Path, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Parse file.
+
+        Args:
+            file: Path of the RDF file to parse.
+            extra_info: Optional metadata attached to the returned document.
+                Its ``"lang"`` entry, when present, selects the label
+                language; otherwise ``"en"`` is used.
+
+        Returns:
+            A one-element list whose document holds one triple per line.
+        """
+        # An ``extra_info`` dict without a "lang" key falls back to English
+        # instead of raising KeyError.
+        lang = (extra_info or {}).get("lang", "en")
+
+        self.g_local = self.Graph()
+        self.g_local.parse(file)
+
+        # str() of each rdflib namespace is handed to parse(); presumably this
+        # loads the RDF/RDFS vocabularies so built-in terms have labels, and
+        # may need network access -- TODO confirm.
+        self.g_global = self.Graph()
+        self.g_global.parse(str(self.RDF))
+        self.g_global.parse(str(self.RDFS))
+
+        text_list = []
+
+        for s, p, o in self.g_local:
+            # Label statements serve lookups only; they are not emitted.
+            if p == self.RDFS.label:
+                continue
+            triple = (
+                f"<{self.fetch_label_in_graphs(s, lang=lang)}> "
+                f"<{self.fetch_label_in_graphs(p, lang=lang)}> "
+                f"<{self.fetch_label_in_graphs(o, lang=lang)}>"
+            )
+            text_list.append(triple)
+
+        text = "\n".join(text_list)
+
+        return [Document(text, extra_info=extra_info)]
diff --git a/loader_hub/file/rdf/requirements.txt b/loader_hub/file/rdf/requirements.txt
@@ -0,0 +1 @@
+rdflib~=6.2.0
diff --git a/loader_hub/github_repo/README.md b/loader_hub/github_repo/README.md
@@ -1,17 +1,17 @@
 # Github Repository Loader
 
-This loader takes in `owner`, `repo`, `branch`, `commit` and other optional parameters such as for filtering dicrectories or only  allowing some files with given extensions etc. It then fetches all the contents of the GitHub repository.
+This loader takes in `owner`, `repo`, `branch`, `commit` and other optional parameters, such as filters for directories or for allowing only files with given extensions. It then fetches all the contents of the GitHub repository.
 
 As a prerequisite, you will need to generate a personal access token. See [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for instructions.
 
 ## Usage
 
-To use this loader, you simply need to pass in the `owner` and `repo` and either `branch` or `commit` for example, you can  `owner = jerryjliu` and `repo = gpt_index` and also either branch or commit `branch = main` or `commit = a6c89159bf8e7086bea2f4305cff3f0a4102e370`
+To use this loader, you simply need to pass in the `owner` and `repo` and either `branch` or `commit`. For example, you can set `owner = jerryjliu` and `repo = gpt_index`, together with either `branch = main` or `commit = a6c89159bf8e7086bea2f4305cff3f0a4102e370`.
 
 ```python
 import os
 
-from gpt_index import download_loader
+from llama_index import download_loader
 download_loader("GithubRepositoryReader")
 
 from modules.github_repo import GithubRepositoryReader, GithubClient
@@ -36,9 +36,9 @@ for doc in docs:
 
 ## Examples
 
-This loader designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
+This loader is designed to be used as a way to load data into [Llama Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
 
-### GPT Index
+### Llama Index
 
 ```python
 import pickle
@@ -48,7 +48,7 @@ assert (
     os.getenv("OPENAI_API_KEY") is not None
 ), "Please set the OPENAI_API_KEY environment variable."
 
-from gpt_index import download_loader
+from llama_index import download_loader
 download_loader("GithubRepositoryReader")
 
 from modules.github_repo import GithubClient, GithubRepositoryReader
@@ -79,5 +79,5 @@ if docs is None:
 
 index = GPTSimpleVectorIndex(docs)
 
-index.query("Explain each GPTIndex class?")
+index.query("Explain each LlamaIndex class?")
 ```
diff --git a/loader_hub/github_repo/base.py b/loader_hub/github_repo/base.py
@@ -46,19 +46,29 @@
         get_file_extension,
     )
 else:
-    from github_client import (
-        BaseGithubClient,
+    from llama_index.readers.llamahub_modules.github_repo.github_client import (
         GithubClient,
         GitBranchResponseModel,
         GitCommitResponseModel,
         GitTreeResponseModel,
     )
-    from utils import (
+    from llama_index.readers.llamahub_modules.github_repo.utils import (
         BufferedGitBlobDataIterator,
         print_if_verbose,
         get_file_extension,
     )
 
+# from typing import Any, Callable, List, Optional, Tuple
+
+# from llama_index.readers.base import BaseReader
+# from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
+# from llama_index.readers.llamahub_modules.github_repo.github_client import (
+#     BaseGithubClient, GitBranchResponseModel, GitCommitResponseModel,
+#     GithubClient, GitTreeResponseModel)
+# from llama_index.readers.llamahub_modules.github_repo.utils import (
+#     BufferedGitBlobDataIterator, get_file_extension, print_if_verbose)
+# from llama_index.readers.schema.base import Document
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 

diff --git a/loader_hub/google_calendar/base.py b/loader_hub/google_calendar/base.py
@@ -1,12 +1,13 @@
 """Google Calendar reader."""
 
-import os
 import datetime
+import os
 from typing import Any, List, Optional, Union
+
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
 
-SCOPES = ['https://www.googleapis.com/auth/calendar.readonly']
+SCOPES = ["https://www.googleapis.com/auth/calendar.readonly"]
 
 # Copyright 2018 Google LLC
 #
@@ -22,22 +23,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 class GoogleCalendarReader(BaseReader):
     """Google Calendar reader.
 
-    	Reads events from Google Calendar
+    Reads events from Google Calendar
 
     """
 
-    def load_data(self, number_of_results: Optional[int] = 100, start_date: Optional[Union[str, datetime.date]] = None) -> List[Document]:
+    def load_data(
+        self,
+        number_of_results: Optional[int] = 100,
+        start_date: Optional[Union[str, datetime.date]] = None,
+    ) -> List[Document]:
 
         """Load data from user's calendar.
-            
-            Args:
-            	number_of_results (Optional[int]): the number of events to return. Defaults to 100.
-                start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
+
+        Args:
+            number_of_results (Optional[int]): the number of events to return. Defaults to 100.
+            start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
         """
-        
+
         from googleapiclient.discovery import build
 
         credentials = self._get_credentials()
@@ -50,45 +56,49 @@ def load_data(self, number_of_results: Optional[int] = 100, start_date: Optional
 
         start_datetime = datetime.datetime.combine(start_date, datetime.time.min)
         start_datetime_utc = start_datetime.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-
-        events_result = service.events().list(
-            calendarId='primary',
-            timeMin=start_datetime_utc,
-            maxResults=number_of_results,
-            singleEvents=True,
-            orderBy='startTime'
-        ).execute()
 
-        events = events_result.get('items', [])
+        events_result = (
+            service.events()
+            .list(
+                calendarId="primary",
+                timeMin=start_datetime_utc,
+                maxResults=number_of_results,
+                singleEvents=True,
+                orderBy="startTime",
+            )
+            .execute()
+        )
+
+        events = events_result.get("items", [])
 
         if not events:
             return []
 
         results = []
         for event in events:
-            if 'dateTime' in event['start']:
-                start_time = event['start']['dateTime']
+            if "dateTime" in event["start"]:
+                start_time = event["start"]["dateTime"]
             else:
-                start_time = event['start']['date']
+                start_time = event["start"]["date"]
 
-            if 'dateTime' in event['end']:
-                end_time = event['end']['dateTime']
+            if "dateTime" in event["end"]:
+                end_time = event["end"]["dateTime"]
             else:
-                end_time = event['end']['date']
-                
+                end_time = event["end"]["date"]
+
             event_string = f"Status: {event['status']}, "
             event_string += f"Summary: {event['summary']}, "
             event_string += f"Start time: {start_time}, "
             event_string += f"End time: {end_time}, "
-            
-            organizer = event.get('organizer', {})
-            display_name = organizer.get('displayName', 'N/A')
-            email = organizer.get('email', 'N/A')
-            if display_name != 'N/A':
+
+            organizer = event.get("organizer", {})
+            display_name = organizer.get("displayName", "N/A")
+            email = organizer.get("email", "N/A")
+            if display_name != "N/A":
                 event_string += f"Organizer: {display_name} ({email})"
             else:
                 event_string += f"Organizer: {email}"
-    		
+
             results.append(Document(event_string))
 
         return results
@@ -125,8 +135,7 @@ def _get_credentials(self) -> Any:
 
         return creds
 
+
 if __name__ == "__main__":
     reader = GoogleCalendarReader()
-    print(
-        reader.load_data()
-    )
+    print(reader.load_data())