
Improving step-by-step instructions for structured_parser (#985)

varunfb committed 1 week ago · commit 7621d1f03c

+ 9 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1547,3 +1547,12 @@ DeepEval
 SDV
 sklearn
 GCP
+compat
+ArtifactExtractor
+DatabaseManager
+DocumentLens
+PDFs
+RequestBuilder
+VectorIndexManager
+csvs
+programmatically

+ 0 - 1
.github/workflows/pytest_cpu_gha_runner.yaml

@@ -72,5 +72,4 @@ jobs:
         with:
           paths: |
             **/*.xml
-            !**/AndroidManifest.xml
         if: always()

+ 19 - 6
end-to-end-use-cases/structured_parser/README.md

@@ -27,26 +27,39 @@ The tool is designed to handle complex documents with high accuracy and provides
 2. Install dependencies:
 
 ```bash
+git clone https://github.com/meta-llama/llama-cookbook.git
+```
+```bash
+cd llama-cookbook
+```
+```bash
+pip install -r requirements.txt
+```
+3. Install project-specific dependencies:
+```bash
+cd end-to-end-use-cases/structured_parser
+```
+```bash
 pip install -r requirements.txt
 ```
-
-3. Configure the tool (see Configuration section)
-
 ## Quick Start
 
-Extract text from a PDF:
+### Configure the tool (see the [Configuration](#configuration) section)
+(Note: set up the API key, the inference model, etc.)
+
+### Extract text from a PDF:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text
 ```
 
-Extract text and tables, and save tables as CSV files:
+### Extract text and tables, and save tables as CSV files:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text --tables --save_tables_as_csv
 ```
 
-Process a directory of PDFs and export tables to Excel:
+### Process a directory of PDFs and export tables to Excel:
 
 ```bash
 python src/structured_extraction.py path/to/pdf_directory --text --tables --export_excel

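The Quick Start drives everything through the CLI; the `src/structured_extraction.py` hunks later in this commit suggest the same flow is reachable from Python. A hedged sketch — `get_artifact_types` and `ArtifactExtractor.from_pdf` appear verbatim below, but the import path and return shape are assumptions:

```python
# Hedged programmatic equivalent of the CLI calls above; the module path and
# return handling are assumptions based on src/structured_extraction.py below.
from structured_extraction import ArtifactExtractor, get_artifact_types

artifact_types = get_artifact_types(text=True, tables=True, images=False)
artifacts = ArtifactExtractor.from_pdf("path/to/document.pdf", artifact_types)
print(f"extracted {len(artifacts)} artifact records")
```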
+ 0 - 1
end-to-end-use-cases/structured_parser/requirements.txt

@@ -14,7 +14,6 @@ vllm>=0.2.0
 openai>=1.0.0
 
 # Database and vector search
-sqlite3>=3.35.0
 chromadb>=0.4.0
 sqlalchemy>=2.0.0
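Dropping the pin is correct: `sqlite3` is part of CPython's standard library, not a PyPI package, so pip cannot resolve `sqlite3>=3.35.0` and the install would fail. A quick runtime check replaces the pin:

```python
# sqlite3 ships with CPython itself; there is no PyPI package to install.
# Inspect the bundled SQLite C library version at runtime instead:
import sqlite3

print(sqlite3.sqlite_version)  # e.g. "3.45.1", depends on the Python build
```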
 

File diff suppressed because it is too large
+ 75 - 2
end-to-end-use-cases/structured_parser/src/config.yaml


+ 4 - 2
end-to-end-use-cases/structured_parser/src/json_to_sql.py

@@ -100,7 +100,8 @@ class DatabaseManager:
                 cursor.execute("DROP TABLE IF EXISTS document_artifacts")
 
                 # Create table with schema
-                cursor.execute("""
+                cursor.execute(
+                    """
                 CREATE TABLE IF NOT EXISTS document_artifacts (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     doc_path TEXT,
@@ -124,7 +125,8 @@ class DatabaseManager:
                     image_caption TEXT,
                     image_type TEXT
                 )
-                """)
+                """
+                )
 
                 # Create indexes for common queries
                 cursor.execute(

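The change above is cosmetic (Black-style call layout); the SQL is unchanged. For context, a minimal sketch of writing one row against this table, using only columns visible in the hunk — the database filename and values are illustrative, not from the repo:

```python
# Hedged sketch: create a reduced version of the table and insert one row
# with a parameterized query; "artifacts.db" and the values are made up.
import sqlite3

with sqlite3.connect("artifacts.db") as conn:  # commits on clean exit
    conn.execute(
        "CREATE TABLE IF NOT EXISTS document_artifacts "
        "(id INTEGER PRIMARY KEY AUTOINCREMENT, doc_path TEXT, "
        "image_caption TEXT, image_type TEXT)"
    )
    conn.execute(
        "INSERT INTO document_artifacts (doc_path, image_caption, image_type) "
        "VALUES (?, ?, ?)",
        ("docs/report.pdf", "Quarterly revenue chart", "chart"),
    )
```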
+ 11 - 82
end-to-end-use-cases/structured_parser/src/structured_extraction.py

@@ -196,6 +196,7 @@ class ArtifactExtractor:
         artifact_types = [r[0] for r in requests]
         inference_requests = [r[1] for r in requests]
 
+        response_batch = []
         if backend == "offline-vllm":
             request_batch = InferenceUtils.make_vllm_batch(inference_requests)
             response_batch = InferenceUtils.run_vllm_inference(request_batch)
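The added `response_batch = []` default is a small robustness fix: both assignments to the name sit inside `if`/`elif` branches, so an unhandled backend value would otherwise leave it unbound at the later use and raise UnboundLocalError. A hypothetical reduction of the pattern:

```python
# Bind the name on every code path so an unexpected branch value degrades
# gracefully instead of raising UnboundLocalError at the use site.
def run_inference(backend: str) -> list:
    responses = []  # default keeps the name defined on every path
    if backend == "offline-vllm":
        responses = ["vllm response"]
    elif backend == "openai-compat":
        responses = ["openai response"]
    return responses
```

In the real function a `SUPPORTED_BACKENDS` guard earlier (visible in the commented-out copy removed below) should already reject bad values, so the default mainly satisfies static analysis.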
@@ -304,79 +305,6 @@ class ArtifactExtractor:
 
         return pdf_pages
 
-    # @staticmethod
-    # async def _run_inference_async(
-    #     requests: List[Tuple[str, InferenceRequest]],
-    # ) -> List[Tuple[str, str]]:
-    #     """
-    #     Run inference asynchronously for all requests.
-
-    #     Args:
-    #         requests: List of tuples containing (artifact_type, inference_request)
-
-    #     Returns:
-    #         List of tuples containing (artifact_type, response)
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #     """
-    #     backend = config["model"].get("backend")
-    #     if backend not in SUPPORTED_BACKENDS:
-    #         raise ValueError(
-    #             f"Allowed config.model.backend: {SUPPORTED_BACKENDS}, got unknown value: {backend}"
-    #         )
-
-    #     artifact_types = [r[0] for r in requests]
-    #     inference_requests = [r[1] for r in requests]
-
-    #     if backend == "offline-vllm":
-    #         request_batch = InferenceUtils.make_vllm_batch(inference_requests)
-    #         response_batch = InferenceUtils.run_vllm_inference(request_batch)
-    #     elif backend == "openai-compat":
-    #         tasks = [
-    #             InferenceUtils.async_run_openai_inference(request)
-    #             for request in inference_requests
-    #         ]
-    #         response_batch = await asyncio.gather(*tasks)
-
-    #     return list(zip(artifact_types, response_batch))
-
-    # @staticmethod
-    # async def from_image_async(
-    #     img_path: str,
-    #     artifact_types: Union[List[str], str],
-    # ) -> ArtifactCollection:
-    #     """
-    #     Extract artifacts from an image asynchronously.
-
-    #     Args:
-    #         img_path: Path to the image file
-    #         artifact_types: Type(s) of artifacts to extract
-
-    #     Returns:
-    #         ArtifactCollection: Extracted artifacts
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #         FileNotFoundError: If the image file doesn't exist
-    #     """
-    #     if not os.path.exists(img_path):
-    #         raise FileNotFoundError(f"Image file not found: {img_path}")
-
-    #     if isinstance(artifact_types, str):
-    #         artifact_types = [artifact_types]
-
-    #     # Prepare inference requests
-    #     requests = ArtifactExtractor._prepare_inference_requests(
-    #         img_path, artifact_types
-    #     )
-
-    #     # Run inference asynchronously
-    #     responses = await ArtifactExtractor._run_inference_async(requests)
-
-    #     # Process responses
-    #     return ArtifactExtractor._process_responses(responses)
-
 
 def get_artifact_types(text: bool, tables: bool, images: bool) -> List[str]:
     """
@@ -422,16 +350,16 @@ def get_target_files(target_path: str) -> List[Path]:
     if not os.path.exists(target_path):
         raise FileNotFoundError(f"Target path not found: {target_path}")
 
-    target_path = Path(target_path)
-    if target_path.is_file() and target_path.suffix not in SUPPORTED_FILE_TYPES:
+    target_path_obj = Path(target_path)
+    if target_path_obj.is_file() and target_path_obj.suffix not in SUPPORTED_FILE_TYPES:
         raise ValueError(
-            f"Unsupported file type: {target_path.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
+            f"Unsupported file type: {target_path_obj.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
         )
 
     targets = (
-        [target_path]
-        if target_path.is_file()
-        else [f for f in target_path.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
+        [target_path_obj]
+        if target_path_obj.is_file()
+        else [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
     )
     logger.debug(f"Processing {len(targets)} files")
     if not targets:
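The rename avoids rebinding the `str` parameter `target_path` to a `Path`, which gives one variable two types over its lifetime and confuses readers and type checkers. A condensed sketch of the pattern — names mirror the hunk, the suffix filter is simplified:

```python
from pathlib import Path
from typing import List


def get_targets(target_path: str) -> List[Path]:
    # Keep the str parameter and its Path counterpart as distinct names so
    # each variable keeps a single type for its whole lifetime.
    target_path_obj = Path(target_path)
    if target_path_obj.is_file():
        return [target_path_obj]
    return [f for f in target_path_obj.iterdir() if f.suffix == ".pdf"]
```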
@@ -456,7 +384,7 @@ def process_files(
     out_json = []
     for target in targets:
         try:
-            artifacts = ArtifactExtractor.from_pdf(target, artifact_types)
+            artifacts = ArtifactExtractor.from_pdf(str(target), artifact_types)
             out_json.extend(artifacts)
         except Exception as e:
             logger.error(f"Failed to process {target}: {e}")
@@ -485,6 +413,7 @@ def save_results(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Save to JSON file
+    output_path = None
     try:
         output_path = output_dir / f"artifacts_{timestamp}.json"
         json_content = json.dumps(data, indent=2)
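Pre-binding `output_path = None` follows the same discipline as the `response_batch` fix: if the `try` block fails early, later code that logs or inspects `output_path` still sees a defined name. A hypothetical reduction:

```python
# Bind the name before the try so a failure inside the block leaves it
# inspectable (None) rather than undefined.
import json
from pathlib import Path

data = {"pages": []}
output_path = None
try:
    output_path = Path("extracted") / "artifacts.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(data, indent=2))
except OSError as exc:
    print(f"write failed: {exc}; output_path={output_path}")
```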
@@ -562,8 +491,8 @@ def main(
     results = process_files(targets, artifact_types)
 
     # Save results
-    target_path = Path(target_path)
-    output_dir = target_path.parent / "extracted"
+    target_path_obj = Path(target_path)
+    output_dir = target_path_obj.parent / "extracted"
     save_results(
         output_dir,
         results,

+ 16 - 10
src/tests/datasets/test_samsum_datasets.py

@@ -1,32 +1,36 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-import pytest
 from dataclasses import dataclass
 from functools import partial
 from unittest.mock import patch
+
+import pytest
 from datasets import load_dataset
 
+
 @dataclass
 class Config:
     model_type: str = "llama"
 
+
 try:
-    load_dataset("Samsung/samsum")
+    load_dataset("knkarthick/samsum")
     SAMSUM_UNAVAILABLE = False
 except ValueError:
     SAMSUM_UNAVAILABLE = True
 
+
 @pytest.mark.skipif(SAMSUM_UNAVAILABLE, reason="Samsum dataset is unavailable")
 @pytest.mark.skip_missing_tokenizer
-@patch('llama_cookbook.finetuning.train')
-@patch('llama_cookbook.finetuning.AutoTokenizer')
+@patch("llama_cookbook.finetuning.train")
+@patch("llama_cookbook.finetuning.AutoTokenizer")
 @patch("llama_cookbook.finetuning.AutoConfig.from_pretrained")
 @patch("llama_cookbook.finetuning.AutoProcessor")
 @patch("llama_cookbook.finetuning.MllamaForConditionalGeneration.from_pretrained")
-@patch('llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained')
-@patch('llama_cookbook.finetuning.optim.AdamW')
-@patch('llama_cookbook.finetuning.StepLR')
+@patch("llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained")
+@patch("llama_cookbook.finetuning.optim.AdamW")
+@patch("llama_cookbook.finetuning.StepLR")
 def test_samsum_dataset(
     step_lr,
     optimizer,
@@ -39,11 +43,13 @@ def test_samsum_dataset(
     mocker,
     setup_tokenizer,
     llama_version,
-    ):
+):
     from llama_cookbook.finetuning import main
 
     setup_tokenizer(tokenizer)
-    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [
+        32000 if "Llama-2" in llama_version else 128256
+    ]
     get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0]
     get_config.return_value = Config()
 
@@ -55,7 +61,7 @@ def test_samsum_dataset(
         "use_peft": False,
         "dataset": "samsum_dataset",
         "batching_strategy": "padding",
-        }
+    }
 
     main(**kwargs)
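Beyond the Black-style reformatting, the substantive change in this file is the dataset ID: `Samsung/samsum` is no longer reachable on the Hugging Face Hub, which is presumably why the availability guard kept tripping, so the test now probes the `knkarthick/samsum` mirror. A standalone sanity check, assuming network access and the usual samsum columns:

```python
# Hedged sanity check: confirm the mirrored dataset resolves before relying
# on it in CI. The dialogue/summary field names are the standard samsum
# schema; treat them as an assumption.
from datasets import load_dataset

ds = load_dataset("knkarthick/samsum", split="train")
print(ds[0]["dialogue"][:60], "->", ds[0]["summary"][:60])
```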