Browse Source

Improving step-by-step instructions, requirements, and variable initialization

varunfb 1 month ago
parent
commit
57fa72e95c

+ 19 - 6
end-to-end-use-cases/structured_parser/README.md

@@ -27,26 +27,39 @@ The tool is designed to handle complex documents with high accuracy and provides
 2. Install dependencies:
 
 ```bash
+git clone https://github.com/meta-llama/llama-cookbook.git
+```
+```bash
+cd llama-cookbook
+```
+```bash
+pip install -r requirements.txt
+```
+3. Install project-specific dependencies:
+```bash
+cd end-to-end-use-cases/structured_parser
+```
+```bash
 pip install -r requirements.txt
 ```
-
-3. Configure the tool (see Configuration section)
-
 ## Quick Start
 
-Extract text from a PDF:
+### Configure the tool (see [Configuration](#configuration) section)
+(Note: set up the API key, the model used for inference, etc.)
+
+### Extract text from a PDF:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text
 ```
 
-Extract text and tables, and save tables as CSV files:
+### Extract text and tables, and save tables as CSV files:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text --tables --save_tables_as_csv
 ```
 
-Process a directory of PDFs and export tables to Excel:
+### Process a directory of PDFs and export tables to Excel:
 
 ```bash
 python src/structured_extraction.py path/to/pdf_directory --text --tables --export_excel

+ 0 - 1
end-to-end-use-cases/structured_parser/requirements.txt

@@ -14,7 +14,6 @@ vllm>=0.2.0
 openai>=1.0.0
 
 # Database and vector search
-sqlite3>=3.35.0
 chromadb>=0.4.0
 sqlalchemy>=2.0.0
 

File diff suppressed because it is too large
+ 75 - 2
end-to-end-use-cases/structured_parser/src/config.yaml


+ 4 - 2
end-to-end-use-cases/structured_parser/src/json_to_sql.py

@@ -100,7 +100,8 @@ class DatabaseManager:
                 cursor.execute("DROP TABLE IF EXISTS document_artifacts")
 
                 # Create table with schema
-                cursor.execute("""
+                cursor.execute(
+                    """
                 CREATE TABLE IF NOT EXISTS document_artifacts (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     doc_path TEXT,
@@ -124,7 +125,8 @@ class DatabaseManager:
                     image_caption TEXT,
                     image_type TEXT
                 )
-                """)
+                """
+                )
 
                 # Create indexes for common queries
                 cursor.execute(

+ 11 - 82
end-to-end-use-cases/structured_parser/src/structured_extraction.py

@@ -196,6 +196,7 @@ class ArtifactExtractor:
         artifact_types = [r[0] for r in requests]
         inference_requests = [r[1] for r in requests]
 
+        response_batch = []
         if backend == "offline-vllm":
             request_batch = InferenceUtils.make_vllm_batch(inference_requests)
             response_batch = InferenceUtils.run_vllm_inference(request_batch)
@@ -304,79 +305,6 @@ class ArtifactExtractor:
 
         return pdf_pages
 
-    # @staticmethod
-    # async def _run_inference_async(
-    #     requests: List[Tuple[str, InferenceRequest]],
-    # ) -> List[Tuple[str, str]]:
-    #     """
-    #     Run inference asynchronously for all requests.
-
-    #     Args:
-    #         requests: List of tuples containing (artifact_type, inference_request)
-
-    #     Returns:
-    #         List of tuples containing (artifact_type, response)
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #     """
-    #     backend = config["model"].get("backend")
-    #     if backend not in SUPPORTED_BACKENDS:
-    #         raise ValueError(
-    #             f"Allowed config.model.backend: {SUPPORTED_BACKENDS}, got unknown value: {backend}"
-    #         )
-
-    #     artifact_types = [r[0] for r in requests]
-    #     inference_requests = [r[1] for r in requests]
-
-    #     if backend == "offline-vllm":
-    #         request_batch = InferenceUtils.make_vllm_batch(inference_requests)
-    #         response_batch = InferenceUtils.run_vllm_inference(request_batch)
-    #     elif backend == "openai-compat":
-    #         tasks = [
-    #             InferenceUtils.async_run_openai_inference(request)
-    #             for request in inference_requests
-    #         ]
-    #         response_batch = await asyncio.gather(*tasks)
-
-    #     return list(zip(artifact_types, response_batch))
-
-    # @staticmethod
-    # async def from_image_async(
-    #     img_path: str,
-    #     artifact_types: Union[List[str], str],
-    # ) -> ArtifactCollection:
-    #     """
-    #     Extract artifacts from an image asynchronously.
-
-    #     Args:
-    #         img_path: Path to the image file
-    #         artifact_types: Type(s) of artifacts to extract
-
-    #     Returns:
-    #         ArtifactCollection: Extracted artifacts
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #         FileNotFoundError: If the image file doesn't exist
-    #     """
-    #     if not os.path.exists(img_path):
-    #         raise FileNotFoundError(f"Image file not found: {img_path}")
-
-    #     if isinstance(artifact_types, str):
-    #         artifact_types = [artifact_types]
-
-    #     # Prepare inference requests
-    #     requests = ArtifactExtractor._prepare_inference_requests(
-    #         img_path, artifact_types
-    #     )
-
-    #     # Run inference asynchronously
-    #     responses = await ArtifactExtractor._run_inference_async(requests)
-
-    #     # Process responses
-    #     return ArtifactExtractor._process_responses(responses)
-
 
 def get_artifact_types(text: bool, tables: bool, images: bool) -> List[str]:
     """
@@ -422,16 +350,16 @@ def get_target_files(target_path: str) -> List[Path]:
     if not os.path.exists(target_path):
         raise FileNotFoundError(f"Target path not found: {target_path}")
 
-    target_path = Path(target_path)
-    if target_path.is_file() and target_path.suffix not in SUPPORTED_FILE_TYPES:
+    target_path_obj = Path(target_path)
+    if target_path_obj.is_file() and target_path_obj.suffix not in SUPPORTED_FILE_TYPES:
         raise ValueError(
-            f"Unsupported file type: {target_path.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
+            f"Unsupported file type: {target_path_obj.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
         )
 
     targets = (
-        [target_path]
-        if target_path.is_file()
-        else [f for f in target_path.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
+        [target_path_obj]
+        if target_path_obj.is_file()
+        else [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
     )
     logger.debug(f"Processing {len(targets)} files")
     if not targets:
@@ -456,7 +384,7 @@ def process_files(
     out_json = []
     for target in targets:
         try:
-            artifacts = ArtifactExtractor.from_pdf(target, artifact_types)
+            artifacts = ArtifactExtractor.from_pdf(str(target), artifact_types)
             out_json.extend(artifacts)
         except Exception as e:
             logger.error(f"Failed to process {target}: {e}")
@@ -485,6 +413,7 @@ def save_results(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Save to JSON file
+    output_path = None
     try:
         output_path = output_dir / f"artifacts_{timestamp}.json"
         json_content = json.dumps(data, indent=2)
@@ -562,8 +491,8 @@ def main(
     results = process_files(targets, artifact_types)
 
     # Save results
-    target_path = Path(target_path)
-    output_dir = target_path.parent / "extracted"
+    target_path_obj = Path(target_path)
+    output_dir = target_path_obj.parent / "extracted"
     save_results(
         output_dir,
         results,