eval_grid.py

import json
from pathlib import Path
from typing import Optional

from ..utils import load_config
from .grader import get_grader
from .inference import create_inference_request, LocalModelRunner
from .shift_analysis import calculate_transferability_index


def load_dataset(dataset_path: Path, nb_samples: Optional[int] = None):
    """Load a conversation dataset from a JSON file, optionally truncating to
    the first `nb_samples` samples."""
    with open(dataset_path, "r") as f:
        samples = json.load(f)
    if nb_samples is not None:
        samples = samples[:nb_samples]
    return samples
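
# Example usage (hypothetical path, illustration only): a quick smoke test that
# loads just the first 50 conversations from a formatted task dataset.
#
#     samples = load_dataset(
#         Path("experiments/test01/formatted_datasets/task1/test_conversation_data.json"),
#         nb_samples=50,
#     )
#     print(len(samples))  # at most 50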


def grade_dataset(llm_runner, dataset, grader, inference_params):
    """Run batch inference over `dataset` with `llm_runner` and grade the
    responses against the reference answers using `grader`."""
    requests = [create_inference_request(m, **inference_params) for m in dataset]
    llm_outputs = llm_runner.run_batch(requests)
    rows = [
        {"expected_output": m[-1]["content"][0]["text"], "raw_response": output}
        for m, output in zip(dataset, llm_outputs)
    ]
    return grader.grade(rows)
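
# Note: grade_dataset assumes each sample `m` is a list of messages whose final
# message carries the reference answer at m[-1]["content"][0]["text"]. An
# illustrative (assumed) sample shape, shown only as a comment; the actual
# schema is fixed by the dataset-formatting step, not by this file:
#
#     [
#         {"role": "user", "content": [{"text": "What is 2 + 2?"}]},
#         {"role": "assistant", "content": [{"text": "4"}]},
#     ]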


def get_finetuned_checkpoint_path(base_path: Path, checkpoint_to_eval: int):
    """Resolve the path to a finetuned checkpoint under `base_path`.

    `checkpoint_to_eval == -1` selects the highest-numbered `epoch_*` directory;
    any other value selects that specific epoch.
    """
    if checkpoint_to_eval == -1:
        # Find the last checkpoint
        checkpoint_dirs = []
        if base_path.exists():
            for item in base_path.iterdir():
                if item.is_dir() and item.name.startswith("epoch_"):
                    try:
                        epoch_num = int(item.name.split("_")[1])
                        checkpoint_dirs.append((epoch_num, item))
                    except (ValueError, IndexError):
                        continue
        if not checkpoint_dirs:
            raise FileNotFoundError(f"No checkpoints found in {base_path}")
        # Return the highest epoch
        return max(checkpoint_dirs, key=lambda x: x[0])[1]
    else:
        # Return the specific requested epoch
        epoch_path = base_path / f"epoch_{checkpoint_to_eval}"
        if not epoch_path.exists():
            raise FileNotFoundError(f"Checkpoint not found: {epoch_path}")
        return epoch_path
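
# Example (assumed directory layout): given a finetuned run saved as
#
#     finetuned_checkpoints/task1/
#         epoch_0/
#         epoch_1/
#         epoch_2/
#
# get_finetuned_checkpoint_path(Path(".../finetuned_checkpoints/task1"), -1)
# resolves to the epoch_2 directory, a value of 1 resolves to epoch_1, and a
# missing epoch raises FileNotFoundError.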


def run_eval_grid(experiment_dir: str):
    print("🚀 Starting evaluation grid execution...")
    print(f"📁 Experiment directory: {experiment_dir}")

    # Get script directory and config path
    script_dir = Path(__file__).parent.parent.parent
    config_path = script_dir / "config.yaml"
    print(f"📝 Loading configuration from: {config_path}")
    logs_dir = Path(experiment_dir) / "grader_logs"

    # Load configuration
    config = load_config(config_path)
    print("✅ Configuration loaded successfully")

    # Task names to evaluate (hard-coded)
    tasks = ["task1", "task2"]

    # Populate checkpoints dictionary with base and finetuned checkpoints
    print("🔍 Building checkpoint list...")
    checkpoints = {}

    # Add base model from config
    base_model_path = config["finetuning"]["model_path"]
    checkpoints["base_model"] = base_model_path
    print(f" 📋 Base model: {base_model_path}")

    # Add finetuned checkpoints
    finetuned_ckpts_dir = Path(experiment_dir) / "finetuned_checkpoints"
    checkpoint_to_eval = config["evals"]["checkpoint_to_eval"]
    if finetuned_ckpts_dir.exists():
        for ckpt_dir in finetuned_ckpts_dir.iterdir():
            if ckpt_dir.is_dir():
                try:
                    ckpt_path = get_finetuned_checkpoint_path(
                        ckpt_dir, checkpoint_to_eval
                    )
                    model_name = f"finetuned_{ckpt_dir.name}"
                    checkpoints[model_name] = str(ckpt_path)
                    print(f" 📋 Finetuned: {model_name} -> {ckpt_path}")
                except FileNotFoundError as e:
                    print(f" ⚠️ Skipping {ckpt_dir.name}: {e}")
    else:
        print(" ⚠️ No finetuned checkpoints directory found")

    print(f"📊 Total checkpoints to evaluate: {len(checkpoints)}")

    # Load model server args from config
    model_server_args = config["evals"]["model_server_args"]
    print(f"🔧 Model server args: {model_server_args}")

    # Load inference params from config
    inference_params = config["evals"]["inference_params"]
    print(f"⚡ Inference params: {inference_params}")

    eval_grid_results = []
    print("\n🎯 Starting evaluation grid...")
    print("=" * 60)
    total_evaluations = len(checkpoints) * len(tasks)
    eval_count = 0

    for model_name, ckpt in checkpoints.items():
        print(f"\n🤖 Evaluating model: {model_name}")
        print(f"📁 Checkpoint: {ckpt}")

        # Initialize model runner for this checkpoint
        llm_runner = LocalModelRunner(ckpt, **model_server_args)

        # Create log file for this model in `logs_dir`
        logs_dir.mkdir(parents=True, exist_ok=True)
        log_file_path = logs_dir / f"{model_name}_evaluation_log.json"
        model_log_data = {
            "model_name": model_name,
            "checkpoint_path": str(ckpt),
            "model_server_args": model_server_args,
            "inference_params": inference_params,
            "tasks": {},
        }

        for task_name in tasks:
            eval_count += 1
            print(f"\n📈 Evaluation {eval_count}/{total_evaluations}")
            print(f"🎯 Model: {model_name}, Task: {task_name}")

            # Get task-specific grader from config
            grader_name = config[task_name].get("grader", "JSONGrader")
            grader = get_grader(grader_name)
            print(f" 🔧 Using grader: {grader_name}")

            dataset_path = (
                Path(experiment_dir)
                / "formatted_datasets"
                / task_name
                / "test_conversation_data.json"
            )
            if not dataset_path.exists():
                print(f" ❌ Dataset not found: {dataset_path}")
                continue

            print(f" 📊 Loading dataset: {dataset_path}")
            dataset = load_dataset(dataset_path)
            print(f" 📋 Dataset size: {len(dataset)} samples")

            try:
                print(" ⏳ Running evaluation...")
                eval_result = grade_dataset(
                    llm_runner, dataset, grader, inference_params
                )

                # Log eval_result for each task in the log file
                model_log_data["tasks"][task_name] = {
                    "metrics": eval_result.metrics,
                    "topline_metric_name": eval_result.topline_metric_name,
                    "num_samples": len(eval_result.result_data),
                    "result_data": eval_result.result_data,
                    "rows": eval_result.rows,
                }

                topline_metric = eval_result.topline_metric_name
                score = eval_result.metrics.get(topline_metric)
                print(f" ✅ {topline_metric}: {score:.4f}")
                eval_grid_results.append(
                    {
                        "model": model_name,
                        "task": task_name,
                        "topline_metric": topline_metric,
                        "score": score,
                        "metrics": eval_result.metrics,
                    }
                )
            except Exception as e:
                print(f" ❌ Evaluation failed: {e}")
                eval_grid_results.append(
                    {
                        "model": model_name,
                        "task": task_name,
                        "topline_metric": "error",
                        "score": -1,
                        "error": str(e),
                    }
                )

        # Write the log file for this model
        with open(log_file_path, "w") as f:
            json.dump(model_log_data, f, indent=2)
        print(f" 📄 Evaluation log saved to: {log_file_path}")

        llm_runner.shutdown()

    # Save results
    results_path = Path(experiment_dir) / "eval_grid_results.json"
    with open(results_path, "w") as f:
        json.dump(eval_grid_results, f, indent=2)

    print("\n" + "=" * 60)
    print("🎉 Evaluation grid completed!")
    print(f"📁 Results saved to: {results_path}")
    transferability_results = calculate_transferability_index(eval_grid_results)

    # Print summary table
    print("\n📊 Results Summary:")
    print("-" * 80)
    print(transferability_results)

    transferability_results_path = Path(experiment_dir) / "transferability_results.csv"
    transferability_results.to_csv(transferability_results_path, index=False)
    return eval_grid_results
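
# run_eval_grid loads config.yaml from two directories above this file's
# location; the keys accessed above imply a shape roughly like the sketch below
# (values are placeholders, not the real config):
#
#     finetuning:
#       model_path: /path/to/base/model
#     evals:
#       checkpoint_to_eval: -1          # -1 selects the latest epoch
#       model_server_args: {...}        # kwargs for LocalModelRunner
#       inference_params: {...}         # kwargs for create_inference_request
#     task1:
#       grader: JSONGrader
#     task2:
#       grader: JSONGrader
#
# Because this module uses relative imports, run it as a module from the package
# root (e.g. `python -m <package>.eval_grid`; the package name depends on the
# repo layout) rather than executing the file directly.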

if __name__ == "__main__":
    run_eval_grid(
        "/data/users/subramen/fbsource/fbcode/users/subramen/internal-llama-cookbook/end-to-end-use-cases/transferability/experiments/test01"
    )
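
# Artifacts written under <experiment_dir> by a full run:
#   grader_logs/<model_name>_evaluation_log.json  - per-model, per-task grading logs
#   eval_grid_results.json                        - flat list of grid results
#   transferability_results.csv                   - transferability index summary table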