fix structure and naming

Sanyam Bhutani committed 2 months ago (commit aaf5e22886)

Renamed (Notebooks/ → data_prep/):

end-to-end-use-cases/data-tool/Notebooks/01-Looking_at_Datasets.ipynb → end-to-end-use-cases/data-tool/data_prep/01-Looking_at_Datasets.ipynb
end-to-end-use-cases/data-tool/Notebooks/02-a-Down-sample-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/02-a-Down-sample-Nous.ipynb
end-to-end-use-cases/data-tool/Notebooks/02-b-Detailed-EDA-ToolACE.ipynb → end-to-end-use-cases/data-tool/data_prep/02-b-Detailed-EDA-ToolACE.ipynb
end-to-end-use-cases/data-tool/Notebooks/03-Pre-Process-Downsampled-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/03-Pre-Process-Downsampled-Nous.ipynb
end-to-end-use-cases/data-tool/Notebooks/_Detailed-EDA-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/_Detailed-EDA-Nous.ipynb

end-to-end-use-cases/data-tool/scripts/pre-process-ToolAce.py (+0, -4)

@@ -13,8 +13,6 @@ from datasets import Dataset, load_dataset
 from tqdm import tqdm
 
 dataset = load_dataset("Team-ACE/ToolACE")
-
-# Transform data
 new_data = {"id": [], "conversations": []}
 
 # Process each example
@@ -24,8 +22,6 @@ for example in dataset["train"]:
     new_data["conversations"].append(
         [{"from": "system", "value": example["system"]}] + example["conversations"]
     )
-
-# Create new dataset with just id and conversations
 new_dataset = Dataset.from_dict(new_data)
 
 # Save it
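
With the explanatory comments gone, the script's intent is easier to miss: it prepends each example's system prompt as an explicit first turn and keeps only the id and conversations columns. A minimal sketch of one resulting record (field values illustrative, not taken from the dataset):

    # Shape of one record written by pre-process-ToolAce.py after this change.
    # The system prompt becomes the first conversation turn; all other columns are dropped.
    record = {
        "id": 0,
        "conversations": [
            {"from": "system", "value": "<example['system']>"},
            # ...followed by the original example["conversations"] turns, unchanged
        ],
    }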

end-to-end-use-cases/data-tool/scripts/preprocess-json-agentic.py (+0, -6)

@@ -4,7 +4,6 @@ from datasets import DatasetDict, load_from_disk
 
 
 def preprocess_conversation(example):
-    # Convert roles
     for conv in example["conversations"]:
         if conv["from"] == "human":
             conv["from"] = "user"
@@ -20,17 +19,12 @@ def transform_conversations(example):
     )
     return {"id": example["id"], "conversations": conv_str}
 
-
-# Load dataset
 json_balanced = load_from_disk("balanced-json-modeagentic")
 
-# Apply preprocessing
 processed_dataset = json_balanced.map(preprocess_conversation)
 
 processed_dataset = processed_dataset["train"].map(
     transform_conversations, remove_columns=["category", "subcategory", "schema"]
 )
 
-
-# Save dataset
 processed_dataset.save_to_disk("json-agentic-balanced-final")
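
For reference, preprocess_conversation still rewrites roles in place before the second map pass; a minimal sketch of its effect on a single turn (the example value is made up):

    # preprocess_conversation mutates each turn's "from" field, e.g. "human" -> "user"
    example = {"conversations": [{"from": "human", "value": "Validate this JSON schema"}]}
    preprocess_conversation(example)
    assert example["conversations"][0]["from"] == "user"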

end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot.py (+14, -57)

@@ -2,15 +2,15 @@ import argparse
 import json
 import os
 from typing import Any, Dict, List, Union
-
+from vllm import LLM, SamplingParams
 import torch
 import yaml
 from datasets import load_dataset, load_from_disk
 from tqdm import tqdm
+import transformers
 
 
-def load_system_prompt(yaml_path: str) -> str:
-    """Load system prompt from a YAML file."""
+def load_system_prompt(yaml_path):
     with open(yaml_path, "r") as f:
         config = yaml.safe_load(f)
     return config["system_prompt"]
@@ -23,9 +23,7 @@ def setup_llm(
     max_model_len: int = 128000,
     gpu_ids: List[int] = None,
 ):
-    """Initialize the vLLM LLM with specified parameters for multi-GPU support."""
-    from vllm import LLM, SamplingParams
-
+    
     if gpu_ids is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
 
@@ -38,13 +36,8 @@ def setup_llm(
     return llm, SamplingParams
 
 
-def setup_hf_pipeline(
-    model_name: str,
-    gpu_ids: List[int] = None,
-):
-    """Initialize the HuggingFace pipeline."""
-    import transformers
-
+def setup_hf_pipeline(model_name,gpu_ids):
+    
     if gpu_ids is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
 
@@ -57,41 +50,27 @@ def setup_hf_pipeline(
     return pipeline
 
 
-def create_messages(system_prompt: str, conversation: str) -> List[Dict[str, str]]:
-    """Create the messages list for the model input."""
+def create_messages(system_prompt, conversation):
     return [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": conversation},
     ]
 
 
-def format_prompt(system_prompt: str, conversation: str) -> str:
-    """Format the system prompt and conversation into the specific chat template format."""
+def format_prompt(system_prompt, conversation):
     return (
         f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|>"
         f"<|start_header_id|>user<|end_header_id|>{conversation}"
     )
 
 
-def process_with_vllm(
-    item: Dict,
-    llm: Any,
-    system_prompt: str,
-    sampling_params: Any,
-) -> str:
-    """Process a single item using vLLM."""
+def process_with_vllm(item,llm,system_prompt,sampling_params):
     prompt = format_prompt(system_prompt, item["conversations"])
     output = llm.generate(prompt, sampling_params)[0]
     return output.outputs[0].text
 
 
-def process_with_hf(
-    item: Dict,
-    pipeline: Any,
-    system_prompt: str,
-    max_new_tokens: int,
-) -> str:
-    """Process a single item using HuggingFace pipeline."""
+def process_with_hf(item,pipeline,system_prompt,max_new_tokens,):
     messages = create_messages(system_prompt, item["conversations"])
     outputs = pipeline(
         messages,
@@ -100,18 +79,7 @@ def process_with_hf(
     return outputs[0]["generated_text"][-1]["content"]
 
 
-def process_dataset(
-    dataset,
-    system_prompt: str,
-    output_file: str,
-    start_index: int = 0,
-    end_index: int = None,
-    max_new_tokens: int = 128000,
-    use_hf: bool = False,
-    model_instance: Any = None,
-    sampling_params: Any = None,
-) -> None:
-    """Process the dataset using either vLLM or HuggingFace pipeline."""
+def process_dataset(dataset,system_prompt,output_file,start_index,end_index,max_new_tokens,use_hf,model_instance,sampling_params,):
     # Handle end_index
     if end_index is None:
         end_index = len(dataset)
@@ -129,16 +97,14 @@ def process_dataset(
         raise ValueError(
             f"Start index {start_index} must be less than end index {end_index}"
         )
-
-    # Select the specified range
+        
     dataset_slice = dataset.select(range(start_index, end_index))
 
-    # Process examples one at a time
     with open(output_file, "w") as f:
         for item in tqdm(
             dataset_slice, desc=f"Processing rows {start_index} to {end_index}"
         ):
-            # Generate the response using appropriate method
+            # Select output
             if use_hf:
                 cot_response = process_with_hf(
                     item, model_instance, system_prompt, max_new_tokens
@@ -147,8 +113,6 @@ def process_dataset(
                 cot_response = process_with_vllm(
                     item, model_instance, system_prompt, sampling_params
                 )
-
-            # Save the result with both original and CoT conversations
             result = {
                 "id": item["id"],
                 "conversations": item["conversations"],  # Keep original conversations
@@ -214,18 +178,12 @@ def main():
     )
     args = parser.parse_args()
 
-    # Parse GPU IDs if provided
     gpu_ids = None
     if args.gpu_ids:
         gpu_ids = [int(gpu_id) for gpu_id in args.gpu_ids.split(",")]
 
-    # Load system prompt from YAML
     system_prompt = load_system_prompt(args.config)
-
-    # Load dataset
     dataset = load_from_disk(args.dataset_path)
-
-    # Initialize appropriate model instance based on mode
     sampling_params = None
     if args.use_hf:
         model_instance = setup_hf_pipeline(
@@ -244,8 +202,7 @@ def main():
             temperature=0.7,
             top_p=0.95,
         )
-
-    # Process dataset
+        
     process_dataset(
         dataset=dataset,
         system_prompt=system_prompt,
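
Worth noting for callers: the collapsed process_dataset signature drops the old defaults (start_index=0, end_index=None, max_new_tokens=128000, and so on), so every argument must now be supplied explicitly. A minimal keyword-argument sketch, with the output path purely illustrative:

    # All nine parameters are now required; end_index=None still means "to the end of the dataset".
    process_dataset(
        dataset=dataset,
        system_prompt=system_prompt,
        output_file="cot_annotated.jsonl",   # illustrative filename
        start_index=0,
        end_index=None,
        max_new_tokens=128000,
        use_hf=False,
        model_instance=model_instance,
        sampling_params=sampling_params,
    )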

end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot_vllm.py (+4, -16)

@@ -64,15 +64,13 @@ class LLM_Singleton:
         self.processor = AutoProcessor.from_pretrained(model_id)
 
 
-def load_system_prompt(yaml_path: str) -> str:
-    """Load system prompt from YAML config."""
+def load_system_prompt(yaml_path):
     with open(yaml_path, "r") as f:
         config = yaml.safe_load(f)
     return config["system_prompt"]
 
 
-def create_chat_message(system_prompt: str, conversation: str) -> List[Dict[str, Any]]:
-    """Create properly formatted chat messages."""
+def create_chat_message(system_prompt, conversation):
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": conversation},
@@ -80,8 +78,7 @@ def create_chat_message(system_prompt: str, conversation: str) -> List[Dict[str,
     return messages
 
 
-def parse_json_output(output_text: str) -> List[Dict[str, str]]:
-    """Parse and clean model output to ensure valid JSON."""
+def parse_json_output(output_text):
     output_text = output_text.strip()
     json_match = re.search(r"\[.*\]", output_text, re.DOTALL)
 
@@ -117,7 +114,6 @@ def process_dataset(
     batch_size: int = 16,
     max_seq_len: int = 64000,
 ) -> List[Dict]:
-    """Process the dataset in parallel batches and return results."""
     if end_index is None:
         end_index = len(dataset)
     else:
@@ -126,13 +122,10 @@ def process_dataset(
     # Handle random sampling
     dataset_size = end_index - start_index
     if n_samples > 0:
-        # If n_samples is larger than dataset size, use full dataset size
         n_samples = min(n_samples, dataset_size)
-        # Generate random indices within the specified range
         indices = random.sample(range(start_index, end_index), n_samples)
         dataset_slice = dataset.select(indices)
     else:
-        # If no sampling requested, use the full range
         dataset_slice = dataset.select(range(start_index, end_index))
 
     results = []
@@ -154,7 +147,6 @@ def process_dataset(
                 )
                 batch_inputs.append(input_text)
 
-            # max_tokens here is per-batch generation limit
             sampling_params = SamplingParams(
                 max_tokens=max_seq_len, temperature=0.1, top_p=0.95
             )
@@ -242,7 +234,6 @@ def main():
     # Set spawn method for multiprocessing
     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-    # Load system prompt and dataset
     system_prompt = load_system_prompt(args.config)
     dataset = load_from_disk(args.dataset_path)
     if isinstance(dataset, dict):
@@ -251,13 +242,12 @@ def main():
     # Initialize VLLM instance
     model_instance = LLM_Singleton(
         model_id=args.model_id,
-        max_model_len=args.max_seq_len,  # Use the passed max_seq_len
+        max_model_len=args.max_seq_len,
         max_num_seqs=16,
         enforce_eager=True,
         debug=args.debug,
     )
 
-    # Process dataset and get results
     results = process_dataset(
         dataset=dataset,
         system_prompt=system_prompt,
@@ -269,7 +259,6 @@ def main():
         max_seq_len=args.max_seq_len,
     )
 
-    # Convert results to HuggingFace dataset
     output_dataset = Dataset.from_dict(
         {
             "id": [r["id"] for r in results],
@@ -278,7 +267,6 @@ def main():
         }
     )
 
-    # Save dataset
     output_dataset.save_to_disk(args.output_path)
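
The stripped docstring on parse_json_output said it parses and cleans model output to ensure valid JSON; the surviving regex pulls the first bracketed array out of whatever text surrounds it. A rough, self-contained sketch of that idea (not the exact function body, which this diff does not show in full):

    import json
    import re

    def extract_json_array(output_text):
        """Sketch: pull the outermost [...] block out of free-form model output."""
        match = re.search(r"\[.*\]", output_text.strip(), re.DOTALL)
        return json.loads(match.group(0)) if match else None

    # e.g. extract_json_array('Sure! ["step 1", "step 2"]') -> ["step 1", "step 2"]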
 
 

end-to-end-use-cases/data-tool/scripts/prep-for-FT.py (+4, -15)

@@ -6,9 +6,7 @@ from typing import Dict, Any
 from pathlib import Path
 import ast
 
-def clean_json_string(json_str: str) -> str:
-    """Clean a JSON string by removing extra escapes."""
-    # First try to parse it as a raw string literal
+def clean_json_string(json_str):
     try:
         # This handles cases where the string is like "\\n" -> "\n"
         cleaned = ast.literal_eval(f"'''{json_str}'''")
@@ -16,8 +14,8 @@ def clean_json_string(json_str: str) -> str:
     except:
         return json_str
 
-def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
-    """Recursively process dictionary values to clean strings."""
+def process_dict(data):
+    #iteratively clean
     cleaned_data = {}
     for key, value in data.items():
         if isinstance(value, str):
@@ -35,20 +33,16 @@ def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
             cleaned_data[key] = value
     return cleaned_data
 
-def process_dataset(input_path: str, output_path: str):
-    """Process a dataset file or directory and save cleaned version."""
+def process_dataset(input_path, output_path):
     input_path = Path(input_path)
     output_path = Path(output_path)
     
-    # Create output directory if it doesn't exist
     output_path.parent.mkdir(parents=True, exist_ok=True)
     
     if input_path.is_file():
-        # Process single file
         with open(input_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         
-        # Clean the data
         if isinstance(data, dict):
             cleaned_data = process_dict(data)
         elif isinstance(data, list):
@@ -59,20 +53,15 @@ def process_dataset(input_path: str, output_path: str):
                 for item in data
             ]
         
-        # Write cleaned data
         with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
             
     elif input_path.is_dir():
-        # Process directory of files
         output_path.mkdir(parents=True, exist_ok=True)
         for file_path in input_path.glob('**/*.json'):
-            # Maintain directory structure in output
             relative_path = file_path.relative_to(input_path)
             output_file = output_path / relative_path
             output_file.parent.mkdir(parents=True, exist_ok=True)
-            
-            # Process individual file
             process_dataset(str(file_path), str(output_file))
 
 if __name__ == "__main__":
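
As a quick illustration of clean_json_string (whose behavior this commit does not change), the ast.literal_eval trick collapses doubled escapes such as "\\n" back into real characters; a minimal example with a made-up input:

    import ast

    raw = "line one\\nline two"                 # contains a literal backslash followed by "n"
    cleaned = ast.literal_eval(f"'''{raw}'''")  # the same trick clean_json_string uses
    assert cleaned == "line one\nline two"      # now a real newline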

end-to-end-use-cases/data-tool/scripts/finetuning/toolcall.py (+0, -14)

@@ -5,20 +5,6 @@ from torchtune.data import _messages, Message
 from torchtune.datasets import SFTDataset
 from torchtune.modules.transforms import Transform
 from torchtune.modules.transforms.tokenizers import ModelTokenizer
-
-# Store original validate_messages for reference if needed
-original_validate = _messages.validate_messages
-
-
-# Replace with no-op function
-def no_validate(messages):
-    pass
-
-
-# Monkey patch
-_messages.validate_messages = no_validate
-
-
 class ToolCallMessages(Transform):
     def __init__(self):
         self._role_map = {
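
Removing the module-level monkey-patch means torchtune's _messages.validate_messages is active again for every dataset built through this file. If strict validation still rejects tool-call transcripts, the deleted workaround can be reapplied locally; this is the same pattern removed above, shown only for reference:

    from torchtune.data import _messages

    # Re-disable message validation globally (the workaround this commit removes).
    _messages.validate_messages = lambda messages: None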