chat_utils.py

import asyncio
import json
import logging
from abc import ABC, abstractmethod
from functools import partial

from octoai.client import OctoAI
from openai import OpenAI

# Configure logging to include the timestamp, log level, and message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# OctoAI uses different names for the Llama models, so map each OctoAI model name
# to the official Hugging Face model name.
MODEL_NAME_MAPPING = {
    "meta-llama-3-70b-instruct": "meta-llama/Meta-Llama-3-70B-Instruct",
    "meta-llama-3-8b-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama-2-7b-chat": "meta-llama/Llama-2-7b-chat-hf",
    "llama-2-70b-chat": "meta-llama/Llama-2-70b-chat-hf",
}

# Manage rate limits with throttling
rate_limit_threshold = 2000
allowed_concurrent_requests = int(rate_limit_threshold * 0.75)
request_limiter = asyncio.Semaphore(allowed_concurrent_requests)


class ChatService(ABC):
    @abstractmethod
    async def execute_chat_request_async(self, api_context: dict, chat_request):
        pass


# Please implement your own chat service class here.
# The class should inherit from the ChatService class and implement the
# execute_chat_request_async method.
# The following two example chat service classes can be used as a reference.
class OctoAIChatService(ChatService):
    async def execute_chat_request_async(self, api_context: dict, chat_request):
        async with request_limiter:
            try:
                event_loop = asyncio.get_running_loop()
                client = OctoAI(api_context['api_key'])
                # Wrap the blocking client call so it can run in the default executor.
                api_chat_call = partial(
                    client.chat.completions.create,
                    model=api_context['model'],
                    messages=chat_request,
                    temperature=0.0,
                )
                response = await event_loop.run_in_executor(None, api_chat_call)
                assistant_response = next(
                    (choice.message.content for choice in response.choices if choice.message.role == 'assistant'),
                    "",
                )
                return assistant_response
            except Exception as error:
                logging.error(f"Error during chat request execution: {error}", exc_info=True)
                return ""


# Use the local vLLM OpenAI-compatible server for generating question/answer pairs
# so the API call syntax stays consistent. For more detail, please read:
# https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html.
class VllmChatService(ChatService):
    async def execute_chat_request_async(self, api_context: dict, chat_request):
        try:
            event_loop = asyncio.get_running_loop()
            # Resolve OctoAI-style model names to Hugging Face names when possible.
            if api_context["model"] in MODEL_NAME_MAPPING:
                model_name = MODEL_NAME_MAPPING[api_context["model"]]
            else:
                model_name = api_context["model"]
            client = OpenAI(
                api_key=api_context['api_key'],
                base_url="http://localhost:" + str(api_context['endpoint']) + "/v1",
            )
            api_chat_call = partial(
                client.chat.completions.create,
                model=model_name,
                messages=chat_request,
                temperature=0.0,
            )
            response = await event_loop.run_in_executor(None, api_chat_call)
            assistant_response = next(
                (choice.message.content for choice in response.choices if choice.message.role == 'assistant'),
                "",
            )
            return assistant_response
        except Exception as error:
            logging.error(f"Error during chat request execution: {error}", exc_info=True)
            return ""
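

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how one of these services might be driven, assuming a
# vLLM OpenAI-compatible server is already running on a local port and that
# api_context carries the 'api_key', 'model', and 'endpoint' keys read by the
# classes above. The key value, port number, and prompt text are placeholders.
if __name__ == "__main__":

    async def _demo():
        service = VllmChatService()
        api_context = {
            "api_key": "EMPTY",                    # placeholder; local vLLM typically does not check it
            "model": "meta-llama-3-8b-instruct",   # resolved via MODEL_NAME_MAPPING
            "endpoint": 8000,                      # assumed local vLLM server port
        }
        chat_request = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Generate one question/answer pair about Llama 3."},
        ]
        answer = await service.execute_chat_request_async(api_context, chat_request)
        print(answer)

    asyncio.run(_demo())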