config.py

import os

# Model registry: each entry maps a short name to the settings needed to
# reach that model. Entries prefixed "vllm_" describe self-hosted vLLM
# servers (including the parameters used to launch the server itself);
# entries prefixed "openrouter_" route to the OpenRouter API. The
# "hosted_vllm/" and "openrouter/" prefixes on the model ids are LiteLLM
# provider-routing prefixes.
MODEL_CONFIGS = {
    # Llama 3.3 70B, tensor-parallel across GPUs 4-7.
    "vllm_llama_70b": {
        "model": "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
        "api_base": "http://localhost:8001/v1",
        "api_key": None,
        "port": 8001,
        "cuda_devices": "4,5,6,7",
        "tensor_parallel": 4,
        "gpu_util": 0.90,
        "chat_template": None,
    },
    # Llama 3.2 90B Vision, tensor-parallel across GPUs 4-7, with a lower
    # GPU memory utilization target than the text-only models.
    "vllm_llama_90b": {
        "model": "hosted_vllm/meta-llama/Llama-3.2-90B-Vision-Instruct",
        "api_base": "http://localhost:8090/v1",
        "api_key": None,
        "port": 8090,
        "cuda_devices": "4,5,6,7",
        "tensor_parallel": 4,
        "gpu_util": 0.70,
        "chat_template": None,
    },
    # Llama 3.1 405B (FP8 weights) across all 8 GPUs, with a custom chat
    # template supplied as a Jinja file.
    "vllm_llama_405b": {
        "model": "hosted_vllm/meta-llama/Llama-3.1-405B-FP8",
        "api_base": "http://localhost:8405/v1",
        "api_key": None,
        "port": 8405,
        "cuda_devices": "0,1,2,3,4,5,6,7",
        "tensor_parallel": 8,
        "gpu_util": 0.80,
        "chat_template": "./llama3_405b_chat_template.jinja",
    },
    # Llama 3.1 8B on a single GPU.
    "vllm_llama_8b": {
        "model": "hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",
        "api_base": "http://localhost:8008/v1",
        "api_key": None,
        "port": 8008,
        "cuda_devices": "0",
        "tensor_parallel": 1,
        "gpu_util": 0.95,
        "chat_template": None,
    },
    # OpenRouter-hosted models: these need only a model id, the OpenRouter
    # endpoint, and an API key taken from the environment.
    "openrouter_gpt4o": {
        "model": "openrouter/openai/gpt-4o",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
    "openrouter_gpt4o_mini": {
        "model": "openrouter/openai/gpt-4o-mini",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
    "openrouter_llama_70b": {
        "model": "openrouter/meta-llama/llama-3.3-70b-instruct",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
}
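

# --- Illustrative usage (not part of the original file) ---------------------
# A minimal sketch of how the vLLM entries above could be turned into a
# server launch command. It relies on standard flags of vLLM's
# OpenAI-compatible server (--model, --port, --tensor-parallel-size,
# --gpu-memory-utilization, --chat-template); the helper name and the
# prefix-stripping are this sketch's own conventions, not something the
# config file defines.

def vllm_launch_command(config_name: str) -> str:
    """Build a shell command that serves the given vLLM config entry."""
    cfg = MODEL_CONFIGS[config_name]
    # Strip LiteLLM's "hosted_vllm/" routing prefix to recover the
    # Hugging Face model id that vLLM expects.
    model_id = cfg["model"].removeprefix("hosted_vllm/")
    parts = [
        f"CUDA_VISIBLE_DEVICES={cfg['cuda_devices']}",
        "python -m vllm.entrypoints.openai.api_server",
        f"--model {model_id}",
        f"--port {cfg['port']}",
        f"--tensor-parallel-size {cfg['tensor_parallel']}",
        f"--gpu-memory-utilization {cfg['gpu_util']}",
    ]
    if cfg["chat_template"]:
        parts.append(f"--chat-template {cfg['chat_template']}")
    return " ".join(parts)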
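

# A similarly hedged sketch of the client side: routing a chat request
# through LiteLLM with one of the entries above. It assumes the `litellm`
# package is installed and, for the vLLM entries, that the server is
# already running on the configured port. vLLM's OpenAI-compatible server
# ignores the API key by default, so a placeholder is sent when the config
# has none; the function name is this sketch's own.

def complete(config_name: str, prompt: str) -> str:
    """Send a single-turn chat request to the named model config."""
    import litellm

    cfg = MODEL_CONFIGS[config_name]
    response = litellm.completion(
        model=cfg["model"],
        api_base=cfg["api_base"],
        api_key=cfg.get("api_key") or "EMPTY",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content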