fix structure and naming

Sanyam Bhutani committed 2 months ago (commit aaf5e22886)

Renamed (Notebooks/ → data_prep/):

end-to-end-use-cases/data-tool/Notebooks/01-Looking_at_Datasets.ipynb → end-to-end-use-cases/data-tool/data_prep/01-Looking_at_Datasets.ipynb
end-to-end-use-cases/data-tool/Notebooks/02-a-Down-sample-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/02-a-Down-sample-Nous.ipynb
end-to-end-use-cases/data-tool/Notebooks/02-b-Detailed-EDA-ToolACE.ipynb → end-to-end-use-cases/data-tool/data_prep/02-b-Detailed-EDA-ToolACE.ipynb
end-to-end-use-cases/data-tool/Notebooks/03-Pre-Process-Downsampled-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/03-Pre-Process-Downsampled-Nous.ipynb
end-to-end-use-cases/data-tool/Notebooks/_Detailed-EDA-Nous.ipynb → end-to-end-use-cases/data-tool/data_prep/_Detailed-EDA-Nous.ipynb

end-to-end-use-cases/data-tool/scripts/pre-process-ToolAce.py (+0, -4)

@@ -13,8 +13,6 @@ from datasets import Dataset, load_dataset
 from tqdm import tqdm
 
 dataset = load_dataset("Team-ACE/ToolACE")
-
-# Transform data
 new_data = {"id": [], "conversations": []}
 
 # Process each example
@@ -24,8 +22,6 @@ for example in dataset["train"]:
     new_data["conversations"].append(
         [{"from": "system", "value": example["system"]}] + example["conversations"]
     )
-
-# Create new dataset with just id and conversations
 new_dataset = Dataset.from_dict(new_data)
 
 # Save it
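
With the explanatory comments gone, the script's intent is easier to miss: it prepends each example's system prompt as an explicit first turn and keeps only the id and conversations columns. A minimal sketch of one resulting record (field values illustrative, not taken from the dataset):

    # Shape of one record written by pre-process-ToolAce.py after this change.
    # The system prompt becomes the first conversation turn; all other columns are dropped.
    record = {
        "id": 0,
        "conversations": [
            {"from": "system", "value": "<example['system']>"},
            # ...followed by the original example["conversations"] turns, unchanged
        ],
    }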

end-to-end-use-cases/data-tool/scripts/preprocess-json-agentic.py (+0, -6)

@@ -4,7 +4,6 @@ from datasets import DatasetDict, load_from_disk
 
 
 def preprocess_conversation(example):
-    # Convert roles
     for conv in example["conversations"]:
         if conv["from"] == "human":
             conv["from"] = "user"
@@ -20,17 +19,12 @@ def transform_conversations(example):
     )
     return {"id": example["id"], "conversations": conv_str}
 
-
-# Load dataset
 json_balanced = load_from_disk("balanced-json-modeagentic")
 
-# Apply preprocessing
 processed_dataset = json_balanced.map(preprocess_conversation)
 
 processed_dataset = processed_dataset["train"].map(
     transform_conversations, remove_columns=["category", "subcategory", "schema"]
 )
 
-
-# Save dataset
 processed_dataset.save_to_disk("json-agentic-balanced-final")
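
For reference, preprocess_conversation still rewrites roles in place before the second map pass; a minimal sketch of its effect on a single turn (the example value is made up):

    # preprocess_conversation mutates each turn's "from" field, e.g. "human" -> "user"
    example = {"conversations": [{"from": "human", "value": "Validate this JSON schema"}]}
    preprocess_conversation(example)
    assert example["conversations"][0]["from"] == "user"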

end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot.py (+14, -57)

@@ -2,15 +2,15 @@ import argparse
 import json
 import os
 from typing import Any, Dict, List, Union
-
+from vllm import LLM, SamplingParams
 import torch
 import yaml
 from datasets import load_dataset, load_from_disk
 from tqdm import tqdm
+import transformers
 
 
-def load_system_prompt(yaml_path: str) -> str:
-    """Load system prompt from a YAML file."""
+def load_system_prompt(yaml_path):
     with open(yaml_path, "r") as f:
         config = yaml.safe_load(f)
     return config["system_prompt"]
@@ -23,9 +23,7 @@ def setup_llm(
     max_model_len: int = 128000,
     gpu_ids: List[int] = None,
 ):
-    """Initialize the vLLM LLM with specified parameters for multi-GPU support."""
-    from vllm import LLM, SamplingParams
-
+    
     if gpu_ids is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
 
@@ -38,13 +36,8 @@ def setup_llm(
     return llm, SamplingParams
 
 
-def setup_hf_pipeline(
-    model_name: str,
-    gpu_ids: List[int] = None,
-):
-    """Initialize the HuggingFace pipeline."""
-    import transformers
-
+def setup_hf_pipeline(model_name,gpu_ids):
+    
     if gpu_ids is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
 
@@ -57,41 +50,27 @@ def setup_hf_pipeline(
     return pipeline
 
 
-def create_messages(system_prompt: str, conversation: str) -> List[Dict[str, str]]:
-    """Create the messages list for the model input."""
+def create_messages(system_prompt, conversation):
     return [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": conversation},
     ]
 
 
-def format_prompt(system_prompt: str, conversation: str) -> str:
-    """Format the system prompt and conversation into the specific chat template format."""
+def format_prompt(system_prompt, conversation):
     return (
         f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|>"
         f"<|start_header_id|>user<|end_header_id|>{conversation}"
     )
 
 
-def process_with_vllm(
-    item: Dict,
-    llm: Any,
-    system_prompt: str,
-    sampling_params: Any,
-) -> str:
-    """Process a single item using vLLM."""
+def process_with_vllm(item,llm,system_prompt,sampling_params):
     prompt = format_prompt(system_prompt, item["conversations"])
     output = llm.generate(prompt, sampling_params)[0]
     return output.outputs[0].text
 
 
-def process_with_hf(
-    item: Dict,
-    pipeline: Any,
-    system_prompt: str,
-    max_new_tokens: int,
-) -> str:
-    """Process a single item using HuggingFace pipeline."""
+def process_with_hf(item,pipeline,system_prompt,max_new_tokens,):
     messages = create_messages(system_prompt, item["conversations"])
     outputs = pipeline(
         messages,
@@ -100,18 +79,7 @@ def process_with_hf(
     return outputs[0]["generated_text"][-1]["content"]
 
 
-def process_dataset(
-    dataset,
-    system_prompt: str,
-    output_file: str,
-    start_index: int = 0,
-    end_index: int = None,
-    max_new_tokens: int = 128000,
-    use_hf: bool = False,
-    model_instance: Any = None,
-    sampling_params: Any = None,
-) -> None:
-    """Process the dataset using either vLLM or HuggingFace pipeline."""
+def process_dataset(dataset,system_prompt,output_file,start_index,end_index,max_new_tokens,use_hf,model_instance,sampling_params,):
     # Handle end_index
     if end_index is None:
         end_index = len(dataset)
@@ -129,16 +97,14 @@ def process_dataset(
         raise ValueError(
             f"Start index {start_index} must be less than end index {end_index}"
         )
-
-    # Select the specified range
+        
     dataset_slice = dataset.select(range(start_index, end_index))
 
-    # Process examples one at a time
     with open(output_file, "w") as f:
         for item in tqdm(
             dataset_slice, desc=f"Processing rows {start_index} to {end_index}"
         ):
-            # Generate the response using appropriate method
+            # Select output
             if use_hf:
                 cot_response = process_with_hf(
                     item, model_instance, system_prompt, max_new_tokens
@@ -147,8 +113,6 @@ def process_dataset(
                 cot_response = process_with_vllm(
                     item, model_instance, system_prompt, sampling_params
                 )
-
-            # Save the result with both original and CoT conversations
             result = {
                 "id": item["id"],
                 "conversations": item["conversations"],  # Keep original conversations
@@ -214,18 +178,12 @@ def main():
     )
     args = parser.parse_args()
 
-    # Parse GPU IDs if provided
     gpu_ids = None
     if args.gpu_ids:
         gpu_ids = [int(gpu_id) for gpu_id in args.gpu_ids.split(",")]
 
-    # Load system prompt from YAML
     system_prompt = load_system_prompt(args.config)
-
-    # Load dataset
     dataset = load_from_disk(args.dataset_path)
-
-    # Initialize appropriate model instance based on mode
     sampling_params = None
     if args.use_hf:
         model_instance = setup_hf_pipeline(
@@ -244,8 +202,7 @@ def main():
             temperature=0.7,
             top_p=0.95,
         )
-
-    # Process dataset
+        
     process_dataset(
         dataset=dataset,
         system_prompt=system_prompt,
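
Worth noting for callers: the collapsed process_dataset signature drops the old defaults (start_index=0, end_index=None, max_new_tokens=128000, and so on), so every argument must now be supplied explicitly. A minimal keyword-argument sketch, with the output path purely illustrative:

    # All nine parameters are now required; end_index=None still means "to the end of the dataset".
    process_dataset(
        dataset=dataset,
        system_prompt=system_prompt,
        output_file="cot_annotated.jsonl",   # illustrative filename
        start_index=0,
        end_index=None,
        max_new_tokens=128000,
        use_hf=False,
        model_instance=model_instance,
        sampling_params=sampling_params,
    )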

end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot_vllm.py (+4, -16)

@@ -64,15 +64,13 @@ class LLM_Singleton:
         self.processor = AutoProcessor.from_pretrained(model_id)
 
 
-def load_system_prompt(yaml_path: str) -> str:
-    """Load system prompt from YAML config."""
+def load_system_prompt(yaml_path):
     with open(yaml_path, "r") as f:
         config = yaml.safe_load(f)
     return config["system_prompt"]
 
 
-def create_chat_message(system_prompt: str, conversation: str) -> List[Dict[str, Any]]:
-    """Create properly formatted chat messages."""
+def create_chat_message(system_prompt, conversation):
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": conversation},
@@ -80,8 +78,7 @@ def create_chat_message(system_prompt: str, conversation: str) -> List[Dict[str,
     return messages
 
 
-def parse_json_output(output_text: str) -> List[Dict[str, str]]:
-    """Parse and clean model output to ensure valid JSON."""
+def parse_json_output(output_text):
     output_text = output_text.strip()
     json_match = re.search(r"\[.*\]", output_text, re.DOTALL)
 
@@ -117,7 +114,6 @@ def process_dataset(
     batch_size: int = 16,
     max_seq_len: int = 64000,
 ) -> List[Dict]:
-    """Process the dataset in parallel batches and return results."""
     if end_index is None:
         end_index = len(dataset)
     else:
@@ -126,13 +122,10 @@ def process_dataset(
     # Handle random sampling
     dataset_size = end_index - start_index
     if n_samples > 0:
-        # If n_samples is larger than dataset size, use full dataset size
         n_samples = min(n_samples, dataset_size)
-        # Generate random indices within the specified range
         indices = random.sample(range(start_index, end_index), n_samples)
         dataset_slice = dataset.select(indices)
     else:
-        # If no sampling requested, use the full range
         dataset_slice = dataset.select(range(start_index, end_index))
 
     results = []
@@ -154,7 +147,6 @@ def process_dataset(
                 )
                 batch_inputs.append(input_text)
 
-            # max_tokens here is per-batch generation limit
             sampling_params = SamplingParams(
                 max_tokens=max_seq_len, temperature=0.1, top_p=0.95
             )
@@ -242,7 +234,6 @@ def main():
     # Set spawn method for multiprocessing
     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-    # Load system prompt and dataset
     system_prompt = load_system_prompt(args.config)
     dataset = load_from_disk(args.dataset_path)
     if isinstance(dataset, dict):
@@ -251,13 +242,12 @@ def main():
     # Initialize VLLM instance
     model_instance = LLM_Singleton(
         model_id=args.model_id,
-        max_model_len=args.max_seq_len,  # Use the passed max_seq_len
+        max_model_len=args.max_seq_len,
         max_num_seqs=16,
         enforce_eager=True,
         debug=args.debug,
     )
 
-    # Process dataset and get results
     results = process_dataset(
         dataset=dataset,
         system_prompt=system_prompt,
@@ -269,7 +259,6 @@ def main():
         max_seq_len=args.max_seq_len,
     )
 
-    # Convert results to HuggingFace dataset
     output_dataset = Dataset.from_dict(
         {
             "id": [r["id"] for r in results],
@@ -278,7 +267,6 @@ def main():
         }
     )
 
-    # Save dataset
     output_dataset.save_to_disk(args.output_path)
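
The stripped docstring on parse_json_output said it parses and cleans model output to ensure valid JSON; the surviving regex pulls the first bracketed array out of whatever text surrounds it. A rough, self-contained sketch of that idea (not the exact function body, which this diff does not show in full):

    import json
    import re

    def extract_json_array(output_text):
        """Sketch: pull the outermost [...] block out of free-form model output."""
        match = re.search(r"\[.*\]", output_text.strip(), re.DOTALL)
        return json.loads(match.group(0)) if match else None

    # e.g. extract_json_array('Sure! ["step 1", "step 2"]') -> ["step 1", "step 2"]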
 
 

end-to-end-use-cases/data-tool/scripts/prep-for-FT.py (+4, -15)

@@ -6,9 +6,7 @@ from typing import Dict, Any
 from pathlib import Path
 import ast
 
-def clean_json_string(json_str: str) -> str:
-    """Clean a JSON string by removing extra escapes."""
-    # First try to parse it as a raw string literal
+def clean_json_string(json_str):
     try:
         # This handles cases where the string is like "\\n" -> "\n"
         cleaned = ast.literal_eval(f"'''{json_str}'''")
@@ -16,8 +14,8 @@ def clean_json_string(json_str: str) -> str:
     except:
         return json_str
 
-def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
-    """Recursively process dictionary values to clean strings."""
+def process_dict(data):
+    #iteratively clean
     cleaned_data = {}
     for key, value in data.items():
         if isinstance(value, str):
@@ -35,20 +33,16 @@ def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
             cleaned_data[key] = value
     return cleaned_data
 
-def process_dataset(input_path: str, output_path: str):
-    """Process a dataset file or directory and save cleaned version."""
+def process_dataset(input_path, output_path):
     input_path = Path(input_path)
     output_path = Path(output_path)
     
-    # Create output directory if it doesn't exist
     output_path.parent.mkdir(parents=True, exist_ok=True)
     
     if input_path.is_file():
-        # Process single file
         with open(input_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         
-        # Clean the data
         if isinstance(data, dict):
             cleaned_data = process_dict(data)
         elif isinstance(data, list):
@@ -59,20 +53,15 @@ def process_dataset(input_path: str, output_path: str):
                 for item in data
             ]
         
-        # Write cleaned data
         with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
             
     elif input_path.is_dir():
-        # Process directory of files
         output_path.mkdir(parents=True, exist_ok=True)
         for file_path in input_path.glob('**/*.json'):
-            # Maintain directory structure in output
             relative_path = file_path.relative_to(input_path)
             output_file = output_path / relative_path
             output_file.parent.mkdir(parents=True, exist_ok=True)
-            
-            # Process individual file
             process_dataset(str(file_path), str(output_file))
 
 if __name__ == "__main__":
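
As a quick illustration of clean_json_string (whose behavior this commit does not change), the ast.literal_eval trick collapses doubled escapes such as "\\n" back into real characters; a minimal example with a made-up input:

    import ast

    raw = "line one\\nline two"                 # contains a literal backslash followed by "n"
    cleaned = ast.literal_eval(f"'''{raw}'''")  # the same trick clean_json_string uses
    assert cleaned == "line one\nline two"      # now a real newline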

end-to-end-use-cases/data-tool/scripts/finetuning/toolcall.py (+0, -14)

@@ -5,20 +5,6 @@ from torchtune.data import _messages, Message
 from torchtune.datasets import SFTDataset
 from torchtune.modules.transforms import Transform
 from torchtune.modules.transforms.tokenizers import ModelTokenizer
-
-# Store original validate_messages for reference if needed
-original_validate = _messages.validate_messages
-
-
-# Replace with no-op function
-def no_validate(messages):
-    pass
-
-
-# Monkey patch
-_messages.validate_messages = no_validate
-
-
 class ToolCallMessages(Transform):
     def __init__(self):
         self._role_map = {
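
Removing the module-level monkey-patch means torchtune's _messages.validate_messages is active again for every dataset built through this file. If strict validation still rejects tool-call transcripts, the deleted workaround can be reapplied locally; this is the same pattern removed above, shown only for reference:

    from torchtune.data import _messages

    # Re-disable message validation globally (the workaround this commit removes).
    _messages.validate_messages = lambda messages: None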