Commit

folder restructure and tested python code
rishiraj committed Jun 26, 2024
1 parent c322946 commit d60b6b6
Showing 7 changed files with 131 additions and 4 deletions.
docs/docs/examples/mistral/README.md (4 changes: 2 additions & 2 deletions)

@@ -10,8 +10,8 @@ This folder contains cookbooks demonstrating how to leverage Indexify and Mistral

## Contents

-1. [PDF Entity Extraction Cookbook](pdf-entity-extraction-cookbook.md)
-2. [PDF Summarization Cookbook](pdf-summarization-cookbook.md)
+1. [PDF Entity Extraction Cookbook](pdf-entity-extraction-cookbook)
+2. [PDF Summarization Cookbook](pdf-summarization-cookbook)

## Overview

@@ -131,7 +131,7 @@ def extract_entities_from_pdf(pdf_path):
    )

    # Parse the JSON response
-    entities = json.loads(entities_content.data.decode('utf-8'))
+    entities = json.loads(entities_content[0]['content'].decode('utf-8'))
    return entities

# Example usage
@@ -0,0 +1,20 @@
from indexify import IndexifyClient, ExtractionGraph

client = IndexifyClient()

extraction_graph_spec = """
name: 'pdf_entity_extractor'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_entities'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'
    content_source: 'pdf_to_text'
"""

extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(extraction_graph)
@@ -0,0 +1,47 @@
import json
import os
import requests
from indexify import IndexifyClient

def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")


def extract_entities_from_pdf(pdf_path):
    client = IndexifyClient()

    # Upload the PDF file
    content_id = client.upload_file("pdf_entity_extractor", pdf_path)

    # Wait for the extraction to complete
    client.wait_for_extraction(content_id)

    # Retrieve the extracted entities
    entities_content = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_entity_extractor",
        policy_name="text_to_entities"
    )

    # Parse the JSON response
    entities = json.loads(entities_content[0]['content'].decode('utf-8'))
    return entities

# Example usage
if __name__ == "__main__":
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Download the PDF
    download_pdf(pdf_url, pdf_path)

    # Extract entities from the downloaded PDF
    extracted_entities = extract_entities_from_pdf(pdf_path)

    print("Extracted Entities:")
    for category, entities in extracted_entities.items():
        print(f"\n{category.capitalize()}:")
        for entity in entities:
            print(f"- {entity}")
@@ -131,7 +131,7 @@ def summarize_pdf(pdf_path):
        policy_name="text_to_summary"
    )

-    return summary.data.decode('utf-8')
+    return summary[0]['content'].decode('utf-8')

# Example usage
if __name__ == "__main__":
@@ -0,0 +1,20 @@
from indexify import IndexifyClient, ExtractionGraph

client = IndexifyClient()

extraction_graph_spec = """
name: 'pdf_summarizer'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_summary'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'
    content_source: 'pdf_to_text'
"""

extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(extraction_graph)
@@ -0,0 +1,40 @@
import os
import requests
from indexify import IndexifyClient

def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")

def summarize_pdf(pdf_path):
    client = IndexifyClient()

    # Upload the PDF file
    content_id = client.upload_file("pdf_summarizer", pdf_path)

    # Wait for the extraction to complete
    client.wait_for_extraction(content_id)

    # Retrieve the summarized content
    summary = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_summarizer",
        policy_name="text_to_summary"
    )

    return summary[0]['content'].decode('utf-8')

# Example usage
if __name__ == "__main__":
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Download the PDF
    download_pdf(pdf_url, pdf_path)

    # Summarize the PDF
    summary = summarize_pdf(pdf_path)
    print("Summary of the PDF:")
    print(summary)
