Browse Source

Improving step-by-step instructions, requirements, and variable initialization

varunfb 1 month ago
parent
commit
57fa72e95c

+ 19 - 6
end-to-end-use-cases/structured_parser/README.md

@@ -27,26 +27,39 @@ The tool is designed to handle complex documents with high accuracy and provides
 2. Install dependencies:
 
 ```bash
+git clone https://github.com/meta-llama/llama-cookbook.git
+```
+```bash
+cd llama-cookbook
+```
+```bash
+pip install -r requirements.txt
+```
+3. Install project-specific dependencies:
+```bash
+cd end-to-end-use-cases/structured_parser
+```
+```bash
 pip install -r requirements.txt
 ```
-
-3. Configure the tool (see Configuration section)
-
 ## Quick Start
 
-Extract text from a PDF:
+### Configure the tool (see [Configuration](#configuration) section)
+(Note: set up the API key, the model used for inference, etc.)
+
+### Extract text from a PDF:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text
 ```
 
-Extract text and tables, and save tables as CSV files:
+### Extract text and tables, and save tables as CSV files:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text --tables --save_tables_as_csv
 ```
 
-Process a directory of PDFs and export tables to Excel:
+### Process a directory of PDFs and export tables to Excel:
 
 ```bash
 python src/structured_extraction.py path/to/pdf_directory --text --tables --export_excel

+ 0 - 1
end-to-end-use-cases/structured_parser/requirements.txt

@@ -14,7 +14,6 @@ vllm>=0.2.0
 openai>=1.0.0
 
 # Database and vector search
-sqlite3>=3.35.0
 chromadb>=0.4.0
 sqlalchemy>=2.0.0
 

File diff suppressed because it is too large
+ 75 - 2
end-to-end-use-cases/structured_parser/src/config.yaml


+ 4 - 2
end-to-end-use-cases/structured_parser/src/json_to_sql.py

@@ -100,7 +100,8 @@ class DatabaseManager:
                 cursor.execute("DROP TABLE IF EXISTS document_artifacts")
 
                 # Create table with schema
-                cursor.execute("""
+                cursor.execute(
+                    """
                 CREATE TABLE IF NOT EXISTS document_artifacts (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     doc_path TEXT,
@@ -124,7 +125,8 @@ class DatabaseManager:
                     image_caption TEXT,
                     image_type TEXT
                 )
-                """)
+                """
+                )
 
                 # Create indexes for common queries
                 cursor.execute(

+ 11 - 82
end-to-end-use-cases/structured_parser/src/structured_extraction.py

@@ -196,6 +196,7 @@ class ArtifactExtractor:
         artifact_types = [r[0] for r in requests]
         inference_requests = [r[1] for r in requests]
 
+        response_batch = []
         if backend == "offline-vllm":
             request_batch = InferenceUtils.make_vllm_batch(inference_requests)
             response_batch = InferenceUtils.run_vllm_inference(request_batch)
@@ -304,79 +305,6 @@ class ArtifactExtractor:
 
         return pdf_pages
 
-    # @staticmethod
-    # async def _run_inference_async(
-    #     requests: List[Tuple[str, InferenceRequest]],
-    # ) -> List[Tuple[str, str]]:
-    #     """
-    #     Run inference asynchronously for all requests.
-
-    #     Args:
-    #         requests: List of tuples containing (artifact_type, inference_request)
-
-    #     Returns:
-    #         List of tuples containing (artifact_type, response)
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #     """
-    #     backend = config["model"].get("backend")
-    #     if backend not in SUPPORTED_BACKENDS:
-    #         raise ValueError(
-    #             f"Allowed config.model.backend: {SUPPORTED_BACKENDS}, got unknown value: {backend}"
-    #         )
-
-    #     artifact_types = [r[0] for r in requests]
-    #     inference_requests = [r[1] for r in requests]
-
-    #     if backend == "offline-vllm":
-    #         request_batch = InferenceUtils.make_vllm_batch(inference_requests)
-    #         response_batch = InferenceUtils.run_vllm_inference(request_batch)
-    #     elif backend == "openai-compat":
-    #         tasks = [
-    #             InferenceUtils.async_run_openai_inference(request)
-    #             for request in inference_requests
-    #         ]
-    #         response_batch = await asyncio.gather(*tasks)
-
-    #     return list(zip(artifact_types, response_batch))
-
-    # @staticmethod
-    # async def from_image_async(
-    #     img_path: str,
-    #     artifact_types: Union[List[str], str],
-    # ) -> ArtifactCollection:
-    #     """
-    #     Extract artifacts from an image asynchronously.
-
-    #     Args:
-    #         img_path: Path to the image file
-    #         artifact_types: Type(s) of artifacts to extract
-
-    #     Returns:
-    #         ArtifactCollection: Extracted artifacts
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #         FileNotFoundError: If the image file doesn't exist
-    #     """
-    #     if not os.path.exists(img_path):
-    #         raise FileNotFoundError(f"Image file not found: {img_path}")
-
-    #     if isinstance(artifact_types, str):
-    #         artifact_types = [artifact_types]
-
-    #     # Prepare inference requests
-    #     requests = ArtifactExtractor._prepare_inference_requests(
-    #         img_path, artifact_types
-    #     )
-
-    #     # Run inference asynchronously
-    #     responses = await ArtifactExtractor._run_inference_async(requests)
-
-    #     # Process responses
-    #     return ArtifactExtractor._process_responses(responses)
-
 
 def get_artifact_types(text: bool, tables: bool, images: bool) -> List[str]:
     """
@@ -422,16 +350,16 @@ def get_target_files(target_path: str) -> List[Path]:
     if not os.path.exists(target_path):
         raise FileNotFoundError(f"Target path not found: {target_path}")
 
-    target_path = Path(target_path)
-    if target_path.is_file() and target_path.suffix not in SUPPORTED_FILE_TYPES:
+    target_path_obj = Path(target_path)
+    if target_path_obj.is_file() and target_path_obj.suffix not in SUPPORTED_FILE_TYPES:
         raise ValueError(
-            f"Unsupported file type: {target_path.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
+            f"Unsupported file type: {target_path_obj.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
         )
 
     targets = (
-        [target_path]
-        if target_path.is_file()
-        else [f for f in target_path.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
+        [target_path_obj]
+        if target_path_obj.is_file()
+        else [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
     )
     logger.debug(f"Processing {len(targets)} files")
     if not targets:
@@ -456,7 +384,7 @@ def process_files(
     out_json = []
     for target in targets:
         try:
-            artifacts = ArtifactExtractor.from_pdf(target, artifact_types)
+            artifacts = ArtifactExtractor.from_pdf(str(target), artifact_types)
             out_json.extend(artifacts)
         except Exception as e:
             logger.error(f"Failed to process {target}: {e}")
@@ -485,6 +413,7 @@ def save_results(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Save to JSON file
+    output_path = None
     try:
         output_path = output_dir / f"artifacts_{timestamp}.json"
         json_content = json.dumps(data, indent=2)
@@ -562,8 +491,8 @@ def main(
     results = process_files(targets, artifact_types)
 
     # Save results
-    target_path = Path(target_path)
-    output_dir = target_path.parent / "extracted"
+    target_path_obj = Path(target_path)
+    output_dir = target_path_obj.parent / "extracted"
     save_results(
         output_dir,
         results,