# Model arguments
model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
bf16: true
tf32: true
output_dir: runs/llama-3.2-3b-grpo-text2sql-4rewards-6gpu

# LoRA arguments
# No LoRA is used here

# Training arguments
max_steps: 750 # earlier runs: 1000, 500
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 5.0e-7 # 1.0e-6 as in the DeepSeek-Math paper; 5.0e-7 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
lr_scheduler_type: cosine
warmup_ratio: 0.03

# GRPO specific parameters
beta: 0.001 # 0.04 as in the DeepSeek-Math paper; 0.001 from https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
max_prompt_length: 512 # earlier: 256
max_completion_length: 1024
num_generations: 8 # earlier: 6
use_vllm: true

# Reward function weights
# Order: [format_reward_func, execution_reward_func, ensemble_n_gram_reward_func, ...]
reward_weights: [1.0, 3.0, 1.0, 1.0]
# **Recommended Weight Strategy**
# **Current Setting: `[1.0, 3.0, 1.0, 1.0]`**
# * **Format reward (1.0)**: standard weight, since format correctness is binary but essential
# * **Execution reward (3.0)**: **highest weight**; SQL execution correctness matters most for text2sql
# * **N-gram similarity (1.0)**: standard weight for syntactic similarity
# **Alternative Weight Strategies**
# **Conservative approach: `[2.0, 4.0, 1.0]`**
# * Emphasizes both format and execution correctness
# * Lower weight on similarity metrics
# **Balanced approach: `[1.5, 2.0, 1.5]`**
# * More balanced across all three metrics
# * Good for early training stages
# **Similarity-focused: `[1.0, 2.0, 2.0]`**
# * Higher weight on N-gram similarity
# * Useful if execution often fails initially
# final_reward = format_reward*1.0 + execution_reward*3.0 + ngram_reward*1.0

vllm_device: "cuda:0"
# vLLM is used for generation and DeepSpeed for distributed training.
# Set num_processes to the number of GPUs you have minus one; the remaining
# GPU is then dedicated to vLLM for generation. With 6 GPUs, set vllm_device
# to "cuda:5" (a device string, not a bare index) and num_processes to 5.
# If you set num_processes to 6 instead, the vllm_device GPU is shared
# between generation and training, so leave it headroom via
# vllm_gpu_memory_utilization.
vllm_gpu_memory_utilization: 0.5

# Logging arguments
logging_strategy: steps
logging_steps: 2
report_to:
- tensorboard
save_strategy: "steps"
save_steps: 50
seed: 42

# Hugging Face Hub
push_to_hub: false
# hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined, defaults to output_dir
hub_strategy: every_save
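
# Effective batch size (a sketch of the arithmetic, assuming the 6-GPU layout
# described above with 5 training processes and 1 GPU reserved for vLLM):
#   per_device_train_batch_size (1) * gradient_accumulation_steps (8) * 5 processes
#   = 40 completions per optimizer step
#   = 40 / num_generations (8) = 5 distinct prompts per optimizer step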
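
# How the weights combine (a sketch of TRL's GRPOTrainer behavior; reward
# functions must be passed to the trainer in the same order as reward_weights):
#   total_reward(completion) = sum_j reward_weights[j] * reward_j(completion)
# The length of reward_weights must match the number of reward functions, so
# the fourth 1.0 above implies a fourth reward function in the training
# script (consistent with "4rewards" in output_dir).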
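
# Example launch (a sketch; the script name, accelerate config file, and
# paths are assumptions, not documented commands for this repo):
#   accelerate launch --num_processes 5 --config_file configs/deepspeed_zero3.yaml \
#     run_grpo.py --config configs/llama-3.2-3b-grpo-text2sql.yaml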