# Configuration for data loading, formatting, and fine-tuning
output_dir: "/tmp/finetuning-pipeline/llama3_2_vision/" # Directory to store output files

# Data configuration
data:
  data_path: "data/path" # Path to the dataset to load
  is_local: true # Whether the data is stored locally
  formatter_type: "vllm" # Type of formatter to use ('torchtune', 'vllm', or 'openai')
  system_prompt: "You are a helpful assistant" # System prompt to use for the dataset
  column_mapping:
    input: "instruction" # Field containing the input text
    output: "output" # Field containing the output text
    image: "image" # Field containing the image path (optional)
  # Additional arguments to pass to the load_dataset function
  dataset_kwargs:
    split: "validation" # Dataset split to load
    shuffle: false # Whether to shuffle the dataset

# Training configuration
finetuning:
  # formatter_type: "torchtune" # Type of formatter to use ('torchtune', 'vllm', or 'openai')
  model_path: "path/to/model" # Path to the model checkpoint
  tokenizer_path: "path/to/tokenizer" # Path to the tokenizer
  output_dir: /tmp/finetuning-pipeline/model_outputs # Directory to store checkpoints
  log_dir: /tmp/finetuning-pipeline/logs # Directory to store logs
  strategy: "lora" # Training strategy ('fft' or 'lora')
  num_epochs: 1 # Number of training epochs
  max_steps_per_epoch: null # Maximum steps per epoch (null to run the full epoch)
  batch_size: 8 # Batch size per device for training
  torchtune_config: "llama3_2_vision/11B_lora" # TorchTune recipe configuration to use
  num_processes_per_node: 8 # Number of processes (GPUs) per node
  distributed: true # Whether to use distributed training

# vLLM Inference configuration
inference:
  # Model configuration
  model_path: "path/to/model/checkpoint" # Path to the model checkpoint
  quantization: null # Quantization method (awq, gptq, squeezellm)
  dtype: "auto" # Data type for model weights (half, float, bfloat16, auto)
  trust_remote_code: false # Trust remote code when loading the model

  # Server configuration
  port: 8000 # Port to run the server on
  host: "0.0.0.0" # Host to run the server on

  # Performance configuration
  tensor_parallel_size: 8 # Number of GPUs to use for tensor parallelism
  max_model_len: 8192 # Maximum sequence length
  max_num_seqs: 1 # Maximum number of concurrent sequences
  gpu_memory_utilization: 0.95 # Fraction of GPU memory to use
  enforce_eager: false # Enforce eager execution (disable CUDA graphs)

  # Inference dataset configuration
  inference_data_kwargs:
    data_path: "inference/data/path" # Path to the inference dataset
    split: "validation" # Dataset split to load
    formatter_type: "vllm" # Type of formatter to use ('torchtune', 'vllm', or 'openai')
    format_data: false # Whether to format the inference dataset
    max_samples: null # Maximum number of samples to load (null for all)
    is_local: true # Whether the data is stored locally

  # Additional vLLM parameters (optional)
  # swap_space: 4 # Size of CPU swap space in GiB
  # block_size: 16 # Size of blocks used in the KV cache
  # disable_log_stats: true # Disable logging of stats
  # disable_log_requests: false # Disable logging of requests
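
# Example: loading a remote dataset from the Hugging Face Hub (optional).
# A minimal sketch reusing only keys defined in the data section above; the
# dataset ID and column names are hypothetical placeholders, not a real dataset.
# data:
#   data_path: "my-org/my-vqa-dataset" # Hub dataset ID (hypothetical)
#   is_local: false # Fetch from the Hub rather than local disk
#   formatter_type: "vllm"
#   column_mapping:
#     input: "question" # Hypothetical input column name
#     output: "answer" # Hypothetical output column name
#     image: "image"
#   dataset_kwargs:
#     split: "train"
#     shuffle: true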
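
# Example: switching the training strategy from LoRA to full fine-tuning (optional).
# A minimal sketch, assuming the 'fft' strategy pairs with a full-finetune
# TorchTune recipe; "llama3_2_vision/11B_full" follows TorchTune's naming
# convention but should be verified against your installed TorchTune version.
# finetuning:
#   strategy: "fft"
#   torchtune_config: "llama3_2_vision/11B_full"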
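
# Example: serving an already-quantized checkpoint (optional).
# A minimal sketch using the quantization options listed above; note that
# vLLM expects the checkpoint at model_path to have been quantized with the
# named method beforehand (it does not quantize on the fly). The path is a
# placeholder.
# inference:
#   model_path: "path/to/awq-quantized/checkpoint"
#   quantization: "awq"
#   dtype: "half" # AWQ kernels typically run in half precision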