Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into github-reader-test-…
Browse files Browse the repository at this point in the history
…and-fix
  • Loading branch information
ahmetkca committed Feb 28, 2023
2 parents 62ea978 + 65a1f5e commit 1543509
Show file tree
Hide file tree
Showing 27 changed files with 441 additions and 118 deletions.
11 changes: 6 additions & 5 deletions loader_hub/bilibili/base.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
"""Simple Reader that reads transcript and general infor of Bilibili video."""
from typing import Any, List, Optional
import warnings
from typing import Any, List

from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class BilibiliTranscriptReader(BaseReader):
"""Bilibili Transcript and video info reader."""

@staticmethod
def get_bilibili_info_and_subs(bili_url):
import json
import re
from bilibili_api import sync, video

import requests
import json
from bilibili_api import sync, video

bvid = re.search(r"BV\w+", bili_url).group()
# Create credential object
Expand Down
9 changes: 6 additions & 3 deletions loader_hub/dad_jokes/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""dad_jokes reader"""

import requests
from typing import List

import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class DadJokesReader(BaseReader):
"""Dad jokes reader.
Expand All @@ -14,7 +15,9 @@ class DadJokesReader(BaseReader):
"""

def _get_random_dad_joke(self):
response = requests.get("https://icanhazdadjoke.com/", headers={"Accept": "application/json"})
response = requests.get(
"https://icanhazdadjoke.com/", headers={"Accept": "application/json"}
)
response.raise_for_status()
json_data = response.json()
return json_data["joke"]
Expand All @@ -26,4 +29,4 @@ def load_data(self) -> List[Document]:
None.
"""
return [Document(self._get_random_dad_joke())]
return [Document(self._get_random_dad_joke())]
19 changes: 19 additions & 0 deletions loader_hub/file/rdf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# RDF Loader

This loader extracts triples from a local [RDF](https://en.wikipedia.org/wiki/Resource_Description_Framework) file using the `rdflib` Python package. The loader currently supports the RDF and RDF Schema namespaces. A single local file is passed in each time you call `load_data`.

## Usage

To use this loader, you need to pass in a `Path` to a local file.

```python
from pathlib import Path
from gpt_index import download_loader

RDFReader = download_loader("RDFReader")

loader = RDFReader()
documents = loader.load_data(file=Path('./knowledge-graph.nt'))
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
1 change: 1 addition & 0 deletions loader_hub/file/rdf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init file."""
79 changes: 79 additions & 0 deletions loader_hub/file/rdf/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Read RDF files."""

from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class RDFReader(BaseReader):
    """RDF reader.

    Parses a local RDF file with ``rdflib`` and renders every triple (except
    ``rdfs:label`` triples) as one line of text in which the subject,
    predicate and object are replaced by their labels.
    """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize loader."""
        super().__init__(*args, **kwargs)

        # Imported here rather than at module level, so rdflib is only needed
        # once a reader is actually instantiated.
        from rdflib import Graph
        from rdflib.namespace import RDF, RDFS

        self.Graph = Graph
        self.RDF = RDF
        self.RDFS = RDFS

    def fetch_labels(self, uri: Any, graph: Any, lang: str) -> List[Any]:
        """Fetch all labels of a URI by language.

        Args:
            uri: The term whose ``rdfs:label`` values are looked up.
            graph: The graph to search.
            lang: Language tag to keep; labels without a language tag are
                kept as well.

        Returns:
            The matching label terms, in the order the graph yields them.
        """
        return [
            label
            for label in graph.objects(uri, self.RDFS.label)
            if label.language in (lang, None)
        ]

    def fetch_label_in_graphs(self, uri: Any, lang: str = "en"):
        """Fetch one label of a URI by language from the local or global graph.

        The local graph (the parsed file) is consulted first, then the global
        graph (the RDF / RDF Schema vocabularies). ``load_data`` must have set
        ``self.g_local`` and ``self.g_global`` before this is called.

        Args:
            uri: The term to find a label for.
            lang: Preferred language tag. Defaults to "en".

        Returns:
            The value of the first matching label.

        Raises:
            Exception: If neither graph holds a matching label for ``uri``.
                This applies to any term without an ``rdfs:label``, whatever
                its position in the triple.
        """
        for graph in (self.g_local, self.g_global):
            labels = self.fetch_labels(uri, graph, lang)
            if labels:
                return labels[0].value

        raise Exception(f"Label not found for: {uri}")

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file: Path to the local RDF file.
            extra_info: Optional metadata attached to the returned document.
                Its ``"lang"`` entry, when present, selects the label
                language; otherwise "en" is used.

        Returns:
            A single-element list holding one Document whose text has one
            ``<subject> <predicate> <object>`` line per triple.
        """
        # A metadata dict without a "lang" key must not raise KeyError.
        lang = (extra_info or {}).get("lang", "en")

        self.g_local = self.Graph()
        self.g_local.parse(file)

        # Load the RDF and RDFS vocabularies from their namespace URIs so
        # that terms such as rdf:type can be resolved to labels.
        self.g_global = self.Graph()
        self.g_global.parse(str(self.RDF))
        self.g_global.parse(str(self.RDFS))

        text_list = []

        for s, p, o in self.g_local:
            # Label triples only feed the lookups above; they are not emitted.
            if p == self.RDFS.label:
                continue
            triple = (
                f"<{self.fetch_label_in_graphs(s, lang=lang)}> "
                f"<{self.fetch_label_in_graphs(p, lang=lang)}> "
                f"<{self.fetch_label_in_graphs(o, lang=lang)}>"
            )
            text_list.append(triple)

        text = "\n".join(text_list)

        return [Document(text, extra_info=extra_info)]
1 change: 1 addition & 0 deletions loader_hub/file/rdf/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
rdflib~=6.2.0
14 changes: 7 additions & 7 deletions loader_hub/github_repo/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# Github Repository Loader

This loader takes in `owner`, `repo`, `branch`, `commit` and other optional parameters such as for filtering dicrectories or only allowing some files with given extensions etc. It then fetches all the contents of the GitHub repository.
This loader takes in `owner`, `repo`, `branch`, `commit`, and other optional parameters, such as filters for directories or for allowing only files with given extensions. It then fetches all the contents of the GitHub repository.

As a prerequisite, you will need to generate a personal access token. See [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) for instructions.

## Usage

To use this loader, you simply need to pass in the `owner` and `repo` and either `branch` or `commit` for example, you can `owner = jerryjliu` and `repo = gpt_index` and also either branch or commit `branch = main` or `commit = a6c89159bf8e7086bea2f4305cff3f0a4102e370`
To use this loader, you simply need to pass in the `owner` and `repo` and either `branch` or `commit`. For example, you can set `owner = jerryjliu` and `repo = gpt_index`, and then either `branch = main` or `commit = a6c89159bf8e7086bea2f4305cff3f0a4102e370`.

```python
import os

from gpt_index import download_loader
from llama_index import download_loader
download_loader("GithubRepositoryReader")

from modules.github_repo import GithubRepositoryReader, GithubClient
Expand All @@ -36,9 +36,9 @@ for doc in docs:

## Examples

This loader designed to be used as a way to load data into [GPT Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
This loader is designed to be used as a way to load data into [Llama Index](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.

### GPT Index
### Llama Index

```python
import pickle
Expand All @@ -48,7 +48,7 @@ assert (
os.getenv("OPENAI_API_KEY") is not None
), "Please set the OPENAI_API_KEY environment variable."

from gpt_index import download_loader
from llama_index import download_loader
download_loader("GithubRepositoryReader")

from modules.github_repo import GithubClient, GithubRepositoryReader
Expand Down Expand Up @@ -79,5 +79,5 @@ if docs is None:

index = GPTSimpleVectorIndex(docs)

index.query("Explain each GPTIndex class?")
index.query("Explain each LlamaIndex class?")
```
16 changes: 13 additions & 3 deletions loader_hub/github_repo/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,29 @@
get_file_extension,
)
else:
from github_client import (
BaseGithubClient,
from llama_index.readers.llamahub_modules.github_repo.github_client import (
GithubClient,
GitBranchResponseModel,
GitCommitResponseModel,
GitTreeResponseModel,
)
from utils import (
from llama_index.readers.llamahub_modules.github_repo.utils import (
BufferedGitBlobDataIterator,
print_if_verbose,
get_file_extension,
)

# from typing import Any, Callable, List, Optional, Tuple

# from llama_index.readers.base import BaseReader
# from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
# from llama_index.readers.llamahub_modules.github_repo.github_client import (
# BaseGithubClient, GitBranchResponseModel, GitCommitResponseModel,
# GithubClient, GitTreeResponseModel)
# from llama_index.readers.llamahub_modules.github_repo.utils import (
# BufferedGitBlobDataIterator, get_file_extension, print_if_verbose)
# from llama_index.readers.schema.base import Document

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Expand Down
77 changes: 43 additions & 34 deletions loader_hub/google_calendar/base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Google Calendar reader."""

import os
import datetime
import os
from typing import Any, List, Optional, Union

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

SCOPES = ['https://www.googleapis.com/auth/calendar.readonly']
SCOPES = ["https://www.googleapis.com/auth/calendar.readonly"]

# Copyright 2018 Google LLC
#
Expand All @@ -22,22 +23,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.


class GoogleCalendarReader(BaseReader):
"""Google Calendar reader.
Reads events from Google Calendar
Reads events from Google Calendar
"""

def load_data(self, number_of_results: Optional[int] = 100, start_date: Optional[Union[str, datetime.date]] = None) -> List[Document]:
def load_data(
self,
number_of_results: Optional[int] = 100,
start_date: Optional[Union[str, datetime.date]] = None,
) -> List[Document]:

"""Load data from user's calendar.
Args:
number_of_results (Optional[int]): the number of events to return. Defaults to 100.
start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
Args:
number_of_results (Optional[int]): the number of events to return. Defaults to 100.
start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
"""

from googleapiclient.discovery import build

credentials = self._get_credentials()
Expand All @@ -50,45 +56,49 @@ def load_data(self, number_of_results: Optional[int] = 100, start_date: Optional

start_datetime = datetime.datetime.combine(start_date, datetime.time.min)
start_datetime_utc = start_datetime.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

events_result = service.events().list(
calendarId='primary',
timeMin=start_datetime_utc,
maxResults=number_of_results,
singleEvents=True,
orderBy='startTime'
).execute()

events = events_result.get('items', [])
events_result = (
service.events()
.list(
calendarId="primary",
timeMin=start_datetime_utc,
maxResults=number_of_results,
singleEvents=True,
orderBy="startTime",
)
.execute()
)

events = events_result.get("items", [])

if not events:
return []

results = []
for event in events:
if 'dateTime' in event['start']:
start_time = event['start']['dateTime']
if "dateTime" in event["start"]:
start_time = event["start"]["dateTime"]
else:
start_time = event['start']['date']
start_time = event["start"]["date"]

if 'dateTime' in event['end']:
end_time = event['end']['dateTime']
if "dateTime" in event["end"]:
end_time = event["end"]["dateTime"]
else:
end_time = event['end']['date']
end_time = event["end"]["date"]

event_string = f"Status: {event['status']}, "
event_string += f"Summary: {event['summary']}, "
event_string += f"Start time: {start_time}, "
event_string += f"End time: {end_time}, "
organizer = event.get('organizer', {})
display_name = organizer.get('displayName', 'N/A')
email = organizer.get('email', 'N/A')
if display_name != 'N/A':

organizer = event.get("organizer", {})
display_name = organizer.get("displayName", "N/A")
email = organizer.get("email", "N/A")
if display_name != "N/A":
event_string += f"Organizer: {display_name} ({email})"
else:
event_string += f"Organizer: {email}"

results.append(Document(event_string))

return results
Expand Down Expand Up @@ -125,8 +135,7 @@ def _get_credentials(self) -> Any:

return creds


if __name__ == "__main__":
reader = GoogleCalendarReader()
print(
reader.load_data()
)
print(reader.load_data())
Loading

0 comments on commit 1543509

Please sign in to comment.