# ## Serving Llama 3.2 3B Instruct Model With vLLM
#
# This app runs a vLLM server on an A100 GPU.
#
# Run it with:
#
#     modal deploy inference
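#
# Once deployed, the serve() function below exposes an OpenAI-compatible API on a
# `*.modal.run` web endpoint (the exact URL is printed by `modal deploy`); see the
# client sketch at the bottom of this file for one way to call it.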

import modal

# This defines the image to use for the vLLM server container
vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    "vllm==0.5.3post1"
)

MODELS_DIR = "/llamas"
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Ensure the model is downloaded and the volume exists
try:
    volume = modal.Volume.lookup("llamas", create_if_missing=False)
except modal.exception.NotFoundError:
    raise Exception("Download models first with modal run download")

app = modal.App("many-llamas-human-eval")

N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count

TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)

MINUTES = 60  # seconds
HOURS = 60 * MINUTES


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
    container_idle_timeout=5 * MINUTES,
    timeout=24 * HOURS,
    allow_concurrent_inputs=20,  # vLLM will batch requests so many can be received at once
    volumes={MODELS_DIR: volume},
    concurrency_limit=10,  # max 10 GPUs
)
@modal.asgi_app()
def serve():
    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we have the latest version of the weights

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}
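
    # clients authenticate by sending an `Authorization: Bearer <TOKEN>` header,
    # which is what the OpenAI SDK does with its `api_key` argument (see the
    # client sketch at the bottom of this file)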

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=2048,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s vs. 20s)
    )
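    # note: `max_model_len` caps prompt + generation length (and with it the KV
    # cache size); raise it if 2048 tokens of context is not enough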

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
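    # vLLM's route handlers read these module-level attributes when serving
    # /v1/chat/completions and /v1/completions, so the assignments above connect
    # our engine to the router mounted on `web_app`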

    return web_app


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # If the current process is instanced by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config
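

# ## Calling the server
#
# The endpoint speaks the OpenAI API, so any OpenAI-compatible client can talk to
# it. A minimal sketch (the URL below is a placeholder; use the `*.modal.run` URL
# printed by `modal deploy`, and the token must match `TOKEN` above):
#
#     from openai import OpenAI
#
#     client = OpenAI(
#         base_url="https://your-workspace--many-llamas-human-eval-serve.modal.run/v1",
#         api_key="super-secret-token",
#     )
#     completion = client.chat.completions.create(
#         model="meta-llama/Llama-3.2-3B-Instruct",
#         messages=[{"role": "user", "content": "Say hello in one sentence."}],
#     )
#     print(completion.choices[0].message.content)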