forked from tensorlakeai/indexify
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
folder restructure and tested python code
- Loading branch information
Showing
7 changed files
with
131 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
docs/docs/examples/mistral/pdf-entity-extraction-cookbook/pdf_entity_extraction_pipeline.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
"""Register the PDF entity-extraction graph with the Indexify server.

Two-stage pipeline: PDF -> plain text (tensorlake/pdfextractor),
then text -> categorized named entities via Mistral.
"""
from indexify import IndexifyClient, ExtractionGraph

# Declarative graph spec; replace YOUR_MISTRAL_API_KEY before running.
extraction_graph_spec = """
name: 'pdf_entity_extractor'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_entities'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'
    content_source: 'pdf_to_text'
"""

client = IndexifyClient()
graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(graph)
47 changes: 47 additions & 0 deletions
47
docs/docs/examples/mistral/pdf-entity-extraction-cookbook/upload_and_retreive.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import json | ||
import os | ||
import requests | ||
from indexify import IndexifyClient | ||
|
||
def download_pdf(url, save_path):
    """Download the PDF at *url* and write it to *save_path*.

    Args:
        url: HTTP(S) URL of the PDF to fetch.
        save_path: Local filesystem path to write the downloaded bytes to.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently saving an error page
    # to disk as if it were the PDF.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")
|
||
|
||
def extract_entities_from_pdf(pdf_path):
    """Run *pdf_path* through the 'pdf_entity_extractor' graph and return
    the extracted named entities as a dict (parsed from the extractor's
    JSON output)."""
    client = IndexifyClient()

    # Ingest the document and block until every policy has processed it.
    content_id = client.upload_file("pdf_entity_extractor", pdf_path)
    client.wait_for_extraction(content_id)

    # Fetch the output of the entity-extraction stage.
    extracted = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_entity_extractor",
        policy_name="text_to_entities",
    )

    # The payload is UTF-8 encoded JSON; decode and parse it.
    return json.loads(extracted[0]['content'].decode('utf-8'))
|
||
# Example usage
if __name__ == "__main__":
    # Removed a dead `pdf_path = "/path/to/your/document.pdf"` assignment
    # that was immediately overwritten below.
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Download the PDF, then run entity extraction on it.
    download_pdf(pdf_url, pdf_path)
    extracted_entities = extract_entities_from_pdf(pdf_path)

    # Print one section per entity category.
    print("Extracted Entities:")
    for category, entities in extracted_entities.items():
        print(f"\n{category.capitalize()}:")
        for entity in entities:
            print(f"- {entity}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
docs/docs/examples/mistral/pdf-summarization-cookbook/pdf_summarization_graph.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
"""Register the PDF summarization graph with the Indexify server.

Two-stage pipeline: PDF -> plain text (tensorlake/pdfextractor),
then text -> concise summary via Mistral.
"""
from indexify import IndexifyClient, ExtractionGraph

# Declarative graph spec; replace YOUR_MISTRAL_API_KEY before running.
extraction_graph_spec = """
name: 'pdf_summarizer'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_summary'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'
    content_source: 'pdf_to_text'
"""

client = IndexifyClient()
graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(graph)
40 changes: 40 additions & 0 deletions
40
docs/docs/examples/mistral/pdf-summarization-cookbook/upload_and_retreive.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os | ||
import requests | ||
from indexify import IndexifyClient | ||
|
||
def download_pdf(url, save_path):
    """Download the PDF at *url* and write it to *save_path*.

    Args:
        url: HTTP(S) URL of the PDF to fetch.
        save_path: Local filesystem path to write the downloaded bytes to.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently saving an error page
    # to disk as if it were the PDF.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")
|
||
def summarize_pdf(pdf_path):
    """Run *pdf_path* through the 'pdf_summarizer' graph and return the
    generated summary as a string."""
    client = IndexifyClient()

    # Ingest the document and block until every policy has processed it.
    content_id = client.upload_file("pdf_summarizer", pdf_path)
    client.wait_for_extraction(content_id)

    # Fetch the output of the summarization stage.
    extracted = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_summarizer",
        policy_name="text_to_summary",
    )

    # The payload is UTF-8 encoded text; decode before returning.
    return extracted[0]['content'].decode('utf-8')
|
||
# Example usage
if __name__ == "__main__":
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Fetch the source document.
    download_pdf(pdf_url, pdf_path)

    # Summarize it and report the result.
    print("Summary of the PDF:")
    print(summarize_pdf(pdf_path))