import os

# Registry of model endpoint settings, keyed by a short alias.
#
# Every entry carries "model", "api_base" and "api_key".  The "vllm_*" entries
# point at a localhost endpoint and additionally carry launch-related fields
# ("port", "cuda_devices", "tensor_parallel", "gpu_util", "chat_template").
# The "openrouter_*" entries carry only the three common fields.
#
# NOTE(review): the "hosted_vllm/" and "openrouter/" model prefixes look like
# provider-routing prefixes consumed by the client library -- confirm against
# the code that reads this table.
MODEL_CONFIGS = {
    # --- Locally served models (api_base is on localhost) -------------------
    # For each of these the port embedded in "api_base" matches "port"; keep
    # the two in sync when editing.
    "vllm_llama_70b": {
        "model": "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
        "api_base": "http://localhost:8001/v1",
        "api_key": None,
        "port": 8001,
        "cuda_devices": "4,5,6,7",
        "tensor_parallel": 4,
        "gpu_util": 0.90,
        "chat_template": None,
    },
    # NOTE(review): lists the same cuda_devices as "vllm_llama_70b";
    # presumably the two are not meant to be served concurrently -- confirm.
    "vllm_llama_90b": {
        "model": "hosted_vllm/meta-llama/Llama-3.2-90B-Vision-Instruct",
        "api_base": "http://localhost:8090/v1",
        "api_key": None,
        "port": 8090,
        "cuda_devices": "4,5,6,7",
        "tensor_parallel": 4,
        "gpu_util": 0.70,
        "chat_template": None,
    },
    # The only entry that sets an explicit chat template path.
    "vllm_llama_405b": {
        "model": "hosted_vllm/meta-llama/Llama-3.1-405B-FP8",
        "api_base": "http://localhost:8405/v1",
        "api_key": None,
        "port": 8405,
        "cuda_devices": "0,1,2,3,4,5,6,7",
        "tensor_parallel": 8,
        "gpu_util": 0.80,
        "chat_template": "./llama3_405b_chat_template.jinja",
    },
    "vllm_llama_8b": {
        "model": "hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",
        "api_base": "http://localhost:8008/v1",
        "api_key": None,
        "port": 8008,
        "cuda_devices": "0",
        "tensor_parallel": 1,
        "gpu_util": 0.95,
        "chat_template": None,
    },
    # --- OpenRouter-hosted models -------------------------------------------
    # "api_key" is read from the OPENROUTER_API_KEY environment variable when
    # this module is imported; it is None if the variable is unset.
    "openrouter_gpt4o": {
        "model": "openrouter/openai/gpt-4o",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
    "openrouter_gpt4o_mini": {
        "model": "openrouter/openai/gpt-4o-mini",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
    "openrouter_llama_70b": {
        "model": "openrouter/meta-llama/llama-3.3-70b-instruct",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.getenv("OPENROUTER_API_KEY"),
    },
}