
Improving step-by-step instructions for structured_parser (#985)

varunfb committed 1 week ago · commit 7621d1f03c

+ 9 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1547,3 +1547,12 @@ DeepEval
 SDV
 sklearn
 GCP
+compat
+ArtifactExtractor
+DatabaseManager
+DocumentLens
+PDFs
+RequestBuilder
+VectorIndexManager
+csvs
+programmatically

+ 0 - 1
.github/workflows/pytest_cpu_gha_runner.yaml

@@ -72,5 +72,4 @@ jobs:
         with:
           paths: |
             **/*.xml
-            !**/AndroidManifest.xml
         if: always()

+ 19 - 6
end-to-end-use-cases/structured_parser/README.md

@@ -27,26 +27,39 @@ The tool is designed to handle complex documents with high accuracy and provides
 2. Install dependencies:
 
 ```bash
+git clone https://github.com/meta-llama/llama-cookbook.git
+```
+```bash
+cd llama-cookbook
+```
+```bash
+pip install -r requirements.txt
+```
+3. Install project-specific dependencies:
+```bash
+cd end-to-end-use-cases/structured_parser
+```
+```bash
 pip install -r requirements.txt
 ```
-
-3. Configure the tool (see Configuration section)
-
 ## Quick Start
 
-Extract text from a PDF:
+### Configure the tool (see the [Configuration](#configuration) section)
+(Note: set up the API key, the inference model, etc.)
+
+### Extract text from a PDF:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text
 ```
 
-Extract text and tables, and save tables as CSV files:
+### Extract text and tables, and save tables as CSV files:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text --tables --save_tables_as_csv
 ```
 
-Process a directory of PDFs and export tables to Excel:
+### Process a directory of PDFs and export tables to Excel:
 
 ```bash
 python src/structured_extraction.py path/to/pdf_directory --text --tables --export_excel

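The Quick Start drives everything through the CLI; the `src/structured_extraction.py` hunks later in this commit suggest the same flow is reachable from Python. A hedged sketch — `get_artifact_types` and `ArtifactExtractor.from_pdf` appear verbatim below, but the import path and return shape are assumptions:

```python
# Hedged programmatic equivalent of the CLI calls above; the module path and
# return handling are assumptions based on src/structured_extraction.py below.
from structured_extraction import ArtifactExtractor, get_artifact_types

artifact_types = get_artifact_types(text=True, tables=True, images=False)
artifacts = ArtifactExtractor.from_pdf("path/to/document.pdf", artifact_types)
print(f"extracted {len(artifacts)} artifact records")
```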
+ 0 - 1
end-to-end-use-cases/structured_parser/requirements.txt

@@ -14,7 +14,6 @@ vllm>=0.2.0
 openai>=1.0.0
 
 # Database and vector search
-sqlite3>=3.35.0
 chromadb>=0.4.0
 sqlalchemy>=2.0.0
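Dropping the pin is correct: `sqlite3` is part of CPython's standard library, not a PyPI package, so pip cannot resolve `sqlite3>=3.35.0` and the install would fail. A quick runtime check replaces the pin:

```python
# sqlite3 ships with CPython itself; there is no PyPI package to install.
# Inspect the bundled SQLite C library version at runtime instead:
import sqlite3

print(sqlite3.sqlite_version)  # e.g. "3.45.1", depends on the Python build
```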
 

File diff suppressed because it is too large
+ 75 - 2
end-to-end-use-cases/structured_parser/src/config.yaml


+ 4 - 2
end-to-end-use-cases/structured_parser/src/json_to_sql.py

@@ -100,7 +100,8 @@ class DatabaseManager:
                 cursor.execute("DROP TABLE IF EXISTS document_artifacts")
 
                 # Create table with schema
-                cursor.execute("""
+                cursor.execute(
+                    """
                 CREATE TABLE IF NOT EXISTS document_artifacts (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     doc_path TEXT,
@@ -124,7 +125,8 @@ class DatabaseManager:
                     image_caption TEXT,
                     image_type TEXT
                 )
-                """)
+                """
+                )
 
                 # Create indexes for common queries
                 cursor.execute(

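The change above is cosmetic (Black-style call layout); the SQL is unchanged. For context, a minimal sketch of writing one row against this table, using only columns visible in the hunk — the database filename and values are illustrative, not from the repo:

```python
# Hedged sketch: create a reduced version of the table and insert one row
# with a parameterized query; "artifacts.db" and the values are made up.
import sqlite3

with sqlite3.connect("artifacts.db") as conn:  # commits on clean exit
    conn.execute(
        "CREATE TABLE IF NOT EXISTS document_artifacts "
        "(id INTEGER PRIMARY KEY AUTOINCREMENT, doc_path TEXT, "
        "image_caption TEXT, image_type TEXT)"
    )
    conn.execute(
        "INSERT INTO document_artifacts (doc_path, image_caption, image_type) "
        "VALUES (?, ?, ?)",
        ("docs/report.pdf", "Quarterly revenue chart", "chart"),
    )
```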
+ 11 - 82
end-to-end-use-cases/structured_parser/src/structured_extraction.py

@@ -196,6 +196,7 @@ class ArtifactExtractor:
         artifact_types = [r[0] for r in requests]
         inference_requests = [r[1] for r in requests]
 
+        response_batch = []
         if backend == "offline-vllm":
             request_batch = InferenceUtils.make_vllm_batch(inference_requests)
             response_batch = InferenceUtils.run_vllm_inference(request_batch)
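The added `response_batch = []` default is a small robustness fix: both assignments to the name sit inside `if`/`elif` branches, so an unhandled backend value would otherwise leave it unbound at the later use and raise UnboundLocalError. A hypothetical reduction of the pattern:

```python
# Bind the name on every code path so an unexpected branch value degrades
# gracefully instead of raising UnboundLocalError at the use site.
def run_inference(backend: str) -> list:
    responses = []  # default keeps the name defined on every path
    if backend == "offline-vllm":
        responses = ["vllm response"]
    elif backend == "openai-compat":
        responses = ["openai response"]
    return responses
```

In the real function a `SUPPORTED_BACKENDS` guard earlier (visible in the commented-out copy removed below) should already reject bad values, so the default mainly satisfies static analysis.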
@@ -304,79 +305,6 @@ class ArtifactExtractor:
 
         return pdf_pages
 
-    # @staticmethod
-    # async def _run_inference_async(
-    #     requests: List[Tuple[str, InferenceRequest]],
-    # ) -> List[Tuple[str, str]]:
-    #     """
-    #     Run inference asynchronously for all requests.
-
-    #     Args:
-    #         requests: List of tuples containing (artifact_type, inference_request)
-
-    #     Returns:
-    #         List of tuples containing (artifact_type, response)
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #     """
-    #     backend = config["model"].get("backend")
-    #     if backend not in SUPPORTED_BACKENDS:
-    #         raise ValueError(
-    #             f"Allowed config.model.backend: {SUPPORTED_BACKENDS}, got unknown value: {backend}"
-    #         )
-
-    #     artifact_types = [r[0] for r in requests]
-    #     inference_requests = [r[1] for r in requests]
-
-    #     if backend == "offline-vllm":
-    #         request_batch = InferenceUtils.make_vllm_batch(inference_requests)
-    #         response_batch = InferenceUtils.run_vllm_inference(request_batch)
-    #     elif backend == "openai-compat":
-    #         tasks = [
-    #             InferenceUtils.async_run_openai_inference(request)
-    #             for request in inference_requests
-    #         ]
-    #         response_batch = await asyncio.gather(*tasks)
-
-    #     return list(zip(artifact_types, response_batch))
-
-    # @staticmethod
-    # async def from_image_async(
-    #     img_path: str,
-    #     artifact_types: Union[List[str], str],
-    # ) -> ArtifactCollection:
-    #     """
-    #     Extract artifacts from an image asynchronously.
-
-    #     Args:
-    #         img_path: Path to the image file
-    #         artifact_types: Type(s) of artifacts to extract
-
-    #     Returns:
-    #         ArtifactCollection: Extracted artifacts
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #         FileNotFoundError: If the image file doesn't exist
-    #     """
-    #     if not os.path.exists(img_path):
-    #         raise FileNotFoundError(f"Image file not found: {img_path}")
-
-    #     if isinstance(artifact_types, str):
-    #         artifact_types = [artifact_types]
-
-    #     # Prepare inference requests
-    #     requests = ArtifactExtractor._prepare_inference_requests(
-    #         img_path, artifact_types
-    #     )
-
-    #     # Run inference asynchronously
-    #     responses = await ArtifactExtractor._run_inference_async(requests)
-
-    #     # Process responses
-    #     return ArtifactExtractor._process_responses(responses)
-
 
 def get_artifact_types(text: bool, tables: bool, images: bool) -> List[str]:
     """
@@ -422,16 +350,16 @@ def get_target_files(target_path: str) -> List[Path]:
     if not os.path.exists(target_path):
         raise FileNotFoundError(f"Target path not found: {target_path}")
 
-    target_path = Path(target_path)
-    if target_path.is_file() and target_path.suffix not in SUPPORTED_FILE_TYPES:
+    target_path_obj = Path(target_path)
+    if target_path_obj.is_file() and target_path_obj.suffix not in SUPPORTED_FILE_TYPES:
         raise ValueError(
-            f"Unsupported file type: {target_path.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
+            f"Unsupported file type: {target_path_obj.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
         )
 
     targets = (
-        [target_path]
-        if target_path.is_file()
-        else [f for f in target_path.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
+        [target_path_obj]
+        if target_path_obj.is_file()
+        else [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
     )
     logger.debug(f"Processing {len(targets)} files")
     if not targets:
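The rename avoids rebinding the `str` parameter `target_path` to a `Path`, which gives one variable two types over its lifetime and confuses readers and type checkers. A condensed sketch of the pattern — names mirror the hunk, the suffix filter is simplified:

```python
from pathlib import Path
from typing import List


def get_targets(target_path: str) -> List[Path]:
    # Keep the str parameter and its Path counterpart as distinct names so
    # each variable keeps a single type for its whole lifetime.
    target_path_obj = Path(target_path)
    if target_path_obj.is_file():
        return [target_path_obj]
    return [f for f in target_path_obj.iterdir() if f.suffix == ".pdf"]
```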
@@ -456,7 +384,7 @@ def process_files(
     out_json = []
     for target in targets:
         try:
-            artifacts = ArtifactExtractor.from_pdf(target, artifact_types)
+            artifacts = ArtifactExtractor.from_pdf(str(target), artifact_types)
             out_json.extend(artifacts)
         except Exception as e:
             logger.error(f"Failed to process {target}: {e}")
@@ -485,6 +413,7 @@ def save_results(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Save to JSON file
+    output_path = None
     try:
         output_path = output_dir / f"artifacts_{timestamp}.json"
         json_content = json.dumps(data, indent=2)
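Pre-binding `output_path = None` follows the same discipline as the `response_batch` fix: if the `try` block fails early, later code that logs or inspects `output_path` still sees a defined name. A hypothetical reduction:

```python
# Bind the name before the try so a failure inside the block leaves it
# inspectable (None) rather than undefined.
import json
from pathlib import Path

data = {"pages": []}
output_path = None
try:
    output_path = Path("extracted") / "artifacts.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(data, indent=2))
except OSError as exc:
    print(f"write failed: {exc}; output_path={output_path}")
```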
@@ -562,8 +491,8 @@ def main(
     results = process_files(targets, artifact_types)
 
     # Save results
-    target_path = Path(target_path)
-    output_dir = target_path.parent / "extracted"
+    target_path_obj = Path(target_path)
+    output_dir = target_path_obj.parent / "extracted"
     save_results(
         output_dir,
         results,

+ 16 - 10
src/tests/datasets/test_samsum_datasets.py

@@ -1,32 +1,36 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-import pytest
 from dataclasses import dataclass
 from functools import partial
 from unittest.mock import patch
+
+import pytest
 from datasets import load_dataset
 
+
 @dataclass
 class Config:
     model_type: str = "llama"
 
+
 try:
-    load_dataset("Samsung/samsum")
+    load_dataset("knkarthick/samsum")
     SAMSUM_UNAVAILABLE = False
 except ValueError:
     SAMSUM_UNAVAILABLE = True
 
+
 @pytest.mark.skipif(SAMSUM_UNAVAILABLE, reason="Samsum dataset is unavailable")
 @pytest.mark.skip_missing_tokenizer
-@patch('llama_cookbook.finetuning.train')
-@patch('llama_cookbook.finetuning.AutoTokenizer')
+@patch("llama_cookbook.finetuning.train")
+@patch("llama_cookbook.finetuning.AutoTokenizer")
 @patch("llama_cookbook.finetuning.AutoConfig.from_pretrained")
 @patch("llama_cookbook.finetuning.AutoProcessor")
 @patch("llama_cookbook.finetuning.MllamaForConditionalGeneration.from_pretrained")
-@patch('llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained')
-@patch('llama_cookbook.finetuning.optim.AdamW')
-@patch('llama_cookbook.finetuning.StepLR')
+@patch("llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained")
+@patch("llama_cookbook.finetuning.optim.AdamW")
+@patch("llama_cookbook.finetuning.StepLR")
 def test_samsum_dataset(
     step_lr,
     optimizer,
@@ -39,11 +43,13 @@ def test_samsum_dataset(
     mocker,
     setup_tokenizer,
     llama_version,
-    ):
+):
     from llama_cookbook.finetuning import main
 
     setup_tokenizer(tokenizer)
-    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [
+        32000 if "Llama-2" in llama_version else 128256
+    ]
     get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0]
     get_config.return_value = Config()
 
@@ -55,7 +61,7 @@ def test_samsum_dataset(
         "use_peft": False,
         "dataset": "samsum_dataset",
         "batching_strategy": "padding",
-        }
+    }
 
     main(**kwargs)
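Beyond the Black-style reformatting, the substantive change in this file is the dataset ID: `Samsung/samsum` is no longer reachable on the Hugging Face Hub, which is presumably why the availability guard kept tripping, so the test now probes the `knkarthick/samsum` mirror. A standalone sanity check, assuming network access and the usual samsum columns:

```python
# Hedged sanity check: confirm the mirrored dataset resolves before relying
# on it in CI. The dialogue/summary field names are the standard samsum
# schema; treat them as an assumption.
from datasets import load_dataset

ds = load_dataset("knkarthick/samsum", split="train")
print(ds[0]["dialogue"][:60], "->", ds[0]["summary"][:60])
```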