123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- import re
- import io
- import codecs
- import random
- from openai import OpenAI
- from config import DEEPINFRA_API_KEY
# OpenAI-compatible client pointed at the DeepInfra endpoint; deepinfra_run() sends its requests through it.
openai = OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
# NOTE(review): openai_run() references `client`, whose only visible definition is the
# commented-out line below (OPENAI_API_KEY is not imported either) — confirm before calling it.
#client = OpenAI(api_key=OPENAI_API_KEY)
def file_put_contents(filename, st):
    """Write the string *st* to *filename* as UTF-8, replacing existing content.

    Args:
        filename: Path of the file to create or overwrite.
        st: Text to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even if write() raises, which the
    # previous manual open/write/close sequence did not guarantee.
    # codecs.open writes through a binary stream, so "\n" is not translated.
    with codecs.open(filename, "w", "utf-8") as file:
        file.write(st)
def file_get_contents(name):
    """Return the whole content of the file *name*, decoded as UTF-8.

    Args:
        name: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # Encoding is fixed to utf-8 (the original author noted Windows-1252 as
    # an alternative). The context manager closes the handle, which the
    # previous version never did.
    with io.open(name, mode="r", encoding="utf-8") as f:
        return f.read()
def openai_run(system_prompt, user_message):
    """Run one system+user exchange against "gpt-4o-mini" and return the reply text.

    The request uses temperature 0 and at most 2000 completion tokens.

    Args:
        system_prompt: Content of the system message.
        user_message: Content of the user message.

    Returns:
        The content of the first choice's message.
    """
    # NOTE(review): `client` is not defined in the visible part of this file
    # (its only assignment is commented out) — confirm it exists before use.
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # previously "gpt-4o-2024-05-13"
        temperature=0,
        max_tokens=2000,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
    )
    return response.choices[0].message.content
def deepinfra_run(system_prompt, user_message):
    """Run one system+user exchange through the module-level `openai` client.

    The request targets "meta-llama/Meta-Llama-3.1-405B-Instruct" with at
    most 4096 completion tokens.

    Args:
        system_prompt: Content of the system message.
        user_message: Content of the user message.

    Returns:
        The content of the first choice's message.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    response = openai.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-405B-Instruct",
        messages=conversation,
        max_tokens=4096,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def get_llm_answer(chunks_content, user_message):  # keywords + content
    """Answer *user_message* using only the supplied document chunks.

    Builds a system prompt that tells the model to admit when the answer is
    missing and to quote the documents verbatim, appends *chunks_content*,
    and sends it with the user message through deepinfra_run().

    Args:
        chunks_content: Text of the retrieved chunks (keywords + content).
        user_message: The user's question.

    Returns:
        Whatever deepinfra_run() returns for the request.
    """
    # Fixes the broken opening clause ("Is answer is not given below") and
    # separates the instructions from the documents, which were previously
    # glued directly onto the last instruction sentence.
    system_prompt = (
        "If the answer is not given below, say that you don't know it. "
        "Make sure to copy answers from documents without changing them."
        "\n\n" + chunks_content
    )
    return deepinfra_run(system_prompt, user_message)
# Matches "label: word1, word2" lines whose label contains neither '#' nor ':';
# group 1 is the comma-separated keyword list.
_INLINE_KEYWORDS_RE = re.compile(r'^\s*[^#:]+\s*:\s*(.+)$')
# Matches section headers such as "### Chunk 2 ###", "**Chunk 1**" or "Chunk 3".
_SECTION_HEADER_RE = re.compile(r'[#\*]*\s*Chunk\s*\d+\s*[#\*]*')


def _split_keywords(text):
    """Split a comma-separated string into stripped, non-empty keywords."""
    return [word.strip() for word in text.split(',') if word.strip()]


def parse_keywords(content):
    """Parse an LLM keyword listing into one keyword list per chunk.

    Two layouts are recognised, and may be mixed in one reply:

    * inline:  ``Chunk 1: word1, word2`` (the line must contain "Chunk");
    * section: a header line such as ``### Chunk 1 ###`` or ``**Chunk 1**``
      followed by one or more comma-separated keyword lines.

    Blank lines, and lines that appear before any section header and are not
    inline entries, are ignored. A section with no keywords is dropped.

    Args:
        content: Raw text returned by the model.

    Returns:
        A list of keyword lists, in the order the chunks appear in *content*.
    """
    result = []
    current_chunk = None  # keywords of the currently open section, if any

    for raw_line in content.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        inline_match = _INLINE_KEYWORDS_RE.match(line)
        if inline_match and "Chunk" in line:
            # Emit the pending section first so the output keeps document
            # order; previously the inline entry jumped ahead of it and
            # later lines kept extending the stale section.
            if current_chunk:
                result.append(current_chunk)
            current_chunk = None
            result.append(_split_keywords(inline_match.group(1)))
        elif _SECTION_HEADER_RE.match(line):
            if current_chunk:
                result.append(current_chunk)
            current_chunk = []
        elif current_chunk is not None:
            # Continuation line of the open section.
            current_chunk.extend(_split_keywords(line))

    if current_chunk:
        result.append(current_chunk)
    return result
def generate_contextual_keywords(chunked_content):
    """Ask the model for per-chunk keywords and parse its reply.

    Sends *chunked_content* to deepinfra_run() with a prompt requesting
    self-contained keywords for every chunk, prints the raw reply, and
    returns it parsed by parse_keywords().

    Args:
        chunked_content: Document text; the prompt tells the model that
            chunks are separated as ``### Chunk [id] ###``.

    Returns:
        The result of parse_keywords() on the model reply.
    """
    instructions = '''
Each chunk is separated as ### Chunk [id] ###. For each chunk generate keywords required to fully understand the chunk without any need for looking at the previous chunks.
Don't just say "List of services", because its unclear what services are you referring to. Make sure to cover all chunks.
Sample output:
Chunk 1: BMW X5, pricings in France
Chunk 2: BMW X5, discounts
'''
    raw_reply = deepinfra_run(instructions, chunked_content)
    # Echo the unparsed reply for inspection.
    print("Keywords_st:\n", raw_reply, "\n")
    return parse_keywords(raw_reply)
def generate_questions_bychunk(chunks):
    """Attach model-generated questions to a random sample of chunks.

    A random ``min(len(chunks) // 5, 60)`` chunks are selected, so inputs
    with fewer than five chunks get no questions. Each selected chunk dict
    is updated in place with:

    * ``"questions"``: the numbered lines extracted from the model reply;
    * ``"idx"``: the chunk's position in *chunks*.

    Args:
        chunks: List of dicts with ``"keywords"`` (iterable of strings) and
            ``"content"`` (string) entries.

    Returns:
        The same *chunks* list that was passed in.
    """
    system_prompt = '''
Given a chunk from document. Generate 1-3 questions related to the chunk. Each question must be full and not require additional context.
Example output:
1. How to open new account?
2. How much BMW X5 costs?
'''
    # Captures the text of lines shaped like "1. question".
    numbered_line = re.compile(r'^\s*\d+\.\s+(.*)', re.MULTILINE)

    total = len(chunks)
    order = list(range(total))
    random.shuffle(order)
    sample_size = min(total // 5, 60)

    for position in order[:sample_size]:
        selected = chunks[position]
        # The keywords go first as a "#"-prefixed header line.
        header = "#" + ", ".join(selected["keywords"])
        reply = deepinfra_run(system_prompt, header + "\n" + selected["content"])
        selected["questions"] = numbered_line.findall(reply)
        selected["idx"] = position
    return chunks
-
def temp():
    """Print what parse_keywords() returns for a sample reply.

    The sample uses three differently decorated section headers.
    """
    sample_reply = '''
Here are the keywords for each chunk:
**Chunk 1**
3M, industrial and consumer products, electrical power transmission, renewable energy, infrastructure, Communication Markets Division, Germany
### Chunk 2 ###
3M, consumer retail, office supply products, home improvement products, Scotch brand, Post-it Products, Filtrete Filters, Thinsulate Insulation
** Chunk 3 **
3M, patents, trademarks, research and development, inventions, intellectual property, legal protection
'''
    parsed = parse_keywords(sample_reply)
    print(parsed)
# Run the parse_keywords() demo when this file is executed as a script.
if __name__=="__main__":
    temp()
|