# helper.py
import re
import io
import codecs
import random
from openai import OpenAI
from config import DEEPINFRA_API_KEY

# DeepInfra exposes an OpenAI-compatible endpoint, so the stock OpenAI client
# is pointed at it; the name `openai` here is this module's DeepInfra client.
openai = OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
#client = OpenAI(api_key=OPENAI_API_KEY)
  9. def file_put_contents(filename, st):
  10. file = codecs.open(filename, "w", "utf-8")
  11. file.write(st)
  12. file.close()
  13. def file_get_contents(name):
  14. f = io.open(name, mode="r", encoding="utf-8") #utf-8 | Windows-1252
  15. return f.read()
def openai_run(system_prompt, user_message):
    # Run a single system+user exchange against an OpenAI chat model and
    # return the assistant's reply text.
    # NOTE(review): `client` is not defined anywhere in this module — the
    # `client = OpenAI(api_key=OPENAI_API_KEY)` line at the top of the file is
    # commented out, so calling this function raises NameError. Confirm whether
    # it should use a restored OpenAI client or the module-level DeepInfra
    # client (`openai`) before re-enabling callers.
    messages = [{"role":"system", "content":system_prompt}, {"role":"user", "content":user_message}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini", #"gpt-4o-2024-05-13",
        temperature=0,
        max_tokens=2000,
        messages=messages
    )
    message = completion.choices[0].message
    return message.content
  26. def deepinfra_run(system_prompt, user_message):
  27. chat_completion = openai.chat.completions.create(
  28. model="meta-llama/Meta-Llama-3.1-405B-Instruct",
  29. messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
  30. max_tokens=4096
  31. )
  32. return chat_completion.choices[0].message.content
  33. def get_llm_answer(chunks_content, user_message): #keywords + content
  34. gp = "Is answer is not given below, say that you don't know it. Make sure to copy answers from documents without changing them."+chunks_content
  35. answer = deepinfra_run(gp, user_message)
  36. return answer
  37. def parse_keywords(content):
  38. result = []
  39. lines = content.strip().split('\n')
  40. current_chunk = None
  41. inline_pattern = re.compile(r'^\s*[^#:]+\s*:\s*(.+)$') # Matches lines like "Chunk1: word1, word2"
  42. #section_pattern = re.compile(r'^###\s*[^#]+\s*###$') #v1
  43. section_pattern = re.compile(r'[#\*]*\s*Chunk\s*\d+\s*[#\*]*') #v2
  44. for line in lines:
  45. line = line.strip()
  46. if not line: continue
  47. inline_match = inline_pattern.match(line)
  48. if inline_pattern.match(line) and "Chunk" in line:
  49. words_str = inline_match.group(1)
  50. words = [word.strip() for word in words_str.split(',') if word.strip()]
  51. result.append(words)
  52. elif section_pattern.match(line):
  53. if current_chunk: result.append(current_chunk)
  54. current_chunk = []
  55. elif current_chunk is not None: #section_pattern continuation
  56. words = [word.strip() for word in line.split(',') if word.strip()]
  57. current_chunk.extend(words)
  58. if current_chunk: result.append(current_chunk)
  59. return result
  60. def generate_contextual_keywords(chunked_content):
  61. system_prompt = '''
  62. Each chunk is separated as ### Chunk [id] ###. For each chunk generate keywords required to fully understand the chunk without any need for looking at the previous chunks.
  63. Don't just say "List of services", because its unclear what services are you referring to. Make sure to cover all chunks.
  64. Sample output:
  65. Chunk 1: BMW X5, pricings in France
  66. Chunk 2: BMW X5, discounts
  67. '''
  68. keywords_st = deepinfra_run(system_prompt, chunked_content)
  69. print("Keywords_st:\n", keywords_st, "\n")
  70. keywords = parse_keywords(keywords_st)
  71. return keywords
  72. def generate_questions_bychunk(chunks):
  73. system_prompt = '''
  74. Given a chunk from document. Generate 1-3 questions related to the chunk. Each question must be full and not require additional context.
  75. Example output:
  76. 1. How to open new account?
  77. 2. How much BMW X5 costs?
  78. '''
  79. n = len(chunks)
  80. indexes = [i for i in range(n)]
  81. random.shuffle(indexes)
  82. for idx in indexes[: min(n//5, 60)]:
  83. chunk = chunks[idx]
  84. text = "#"+(", ".join(chunk["keywords"]))+"\n"+chunk["content"]
  85. out = deepinfra_run(system_prompt, text) #anthropic_run(system_prompt, text)
  86. question_pattern = re.compile(r'^\s*\d+\.\s+(.*)', re.MULTILINE)
  87. questions = question_pattern.findall(out)
  88. chunk["questions"] = questions
  89. chunk["idx"] = idx
  90. return chunks
  91. def temp():
  92. st = '''
  93. Here are the keywords for each chunk:
  94. **Chunk 1**
  95. 3M, industrial and consumer products, electrical power transmission, renewable energy, infrastructure, Communication Markets Division, Germany
  96. ### Chunk 2 ###
  97. 3M, consumer retail, office supply products, home improvement products, Scotch brand, Post-it Products, Filtrete Filters, Thinsulate Insulation
  98. ** Chunk 3 **
  99. 3M, patents, trademarks, research and development, inventions, intellectual property, legal protection
  100. '''
  101. print( parse_keywords(st) )
# Run the parse_keywords smoke test when executed directly.
if __name__=="__main__":
    temp()