Commit

folder restructure and tested python code
rishiraj committed Jun 26, 2024
1 parent c322946 commit d60b6b6
Showing 7 changed files with 131 additions and 4 deletions.
docs/docs/examples/mistral/README.md (4 changes: 2 additions & 2 deletions)

@@ -10,8 +10,8 @@ This folder contains cookbooks demonstrating how to leverage Indexify and Mistral

## Contents

-1. [PDF Entity Extraction Cookbook](pdf-entity-extraction-cookbook.md)
-2. [PDF Summarization Cookbook](pdf-summarization-cookbook.md)
+1. [PDF Entity Extraction Cookbook](pdf-entity-extraction-cookbook)
+2. [PDF Summarization Cookbook](pdf-summarization-cookbook)

## Overview

@@ -131,7 +131,7 @@ def extract_entities_from_pdf(pdf_path):
    )

    # Parse the JSON response
-    entities = json.loads(entities_content.data.decode('utf-8'))
+    entities = json.loads(entities_content[0]['content'].decode('utf-8'))
    return entities

# Example usage
@@ -0,0 +1,20 @@
from indexify import IndexifyClient, ExtractionGraph

client = IndexifyClient()

extraction_graph_spec = """
name: 'pdf_entity_extractor'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_entities'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Extract and categorize all named entities from the following text. Provide the results in a JSON format with categories: persons, organizations, locations, dates, and miscellaneous.'
    content_source: 'pdf_to_text'
"""

extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(extraction_graph)
@@ -0,0 +1,47 @@
import json
import os
import requests
from indexify import IndexifyClient

def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")


def extract_entities_from_pdf(pdf_path):
    client = IndexifyClient()

    # Upload the PDF file
    content_id = client.upload_file("pdf_entity_extractor", pdf_path)

    # Wait for the extraction to complete
    client.wait_for_extraction(content_id)

    # Retrieve the extracted entities
    entities_content = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_entity_extractor",
        policy_name="text_to_entities"
    )

    # Parse the JSON response
    entities = json.loads(entities_content[0]['content'].decode('utf-8'))
    return entities

# Example usage
if __name__ == "__main__":
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Download the PDF
    download_pdf(pdf_url, pdf_path)

    # Extract entities from the downloaded PDF
    extracted_entities = extract_entities_from_pdf(pdf_path)

    print("Extracted Entities:")
    for category, entities in extracted_entities.items():
        print(f"\n{category.capitalize()}:")
        for entity in entities:
            print(f"- {entity}")
@@ -131,7 +131,7 @@ def summarize_pdf(pdf_path):
        policy_name="text_to_summary"
    )

-    return summary.data.decode('utf-8')
+    return summary[0]['content'].decode('utf-8')

# Example usage
if __name__ == "__main__":
@@ -0,0 +1,20 @@
from indexify import IndexifyClient, ExtractionGraph

client = IndexifyClient()

extraction_graph_spec = """
name: 'pdf_summarizer'
extraction_policies:
  - extractor: 'tensorlake/pdfextractor'
    name: 'pdf_to_text'
  - extractor: 'tensorlake/mistral'
    name: 'text_to_summary'
    input_params:
      model_name: 'mistral-large-latest'
      key: 'YOUR_MISTRAL_API_KEY'
      system_prompt: 'Summarize the following text in a concise manner, highlighting the key points:'
    content_source: 'pdf_to_text'
"""

extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
client.create_extraction_graph(extraction_graph)
@@ -0,0 +1,40 @@
import os
import requests
from indexify import IndexifyClient

def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")

def summarize_pdf(pdf_path):
    client = IndexifyClient()

    # Upload the PDF file
    content_id = client.upload_file("pdf_summarizer", pdf_path)

    # Wait for the extraction to complete
    client.wait_for_extraction(content_id)

    # Retrieve the summarized content
    summary = client.get_extracted_content(
        content_id=content_id,
        graph_name="pdf_summarizer",
        policy_name="text_to_summary"
    )

    return summary[0]['content'].decode('utf-8')

# Example usage
if __name__ == "__main__":
    pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
    pdf_path = "reference_document.pdf"

    # Download the PDF
    download_pdf(pdf_url, pdf_path)

    # Summarize the PDF
    summary = summarize_pdf(pdf_path)
    print("Summary of the PDF:")
    print(summary)
