| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 | 
							- import re
 
- import io
 
- import codecs
 
- import random
 
- from openai import OpenAI
 
- from config import DEEPINFRA_API_KEY
 
- openai = OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
 
- #client = OpenAI(api_key=OPENAI_API_KEY)
 
def file_put_contents(filename, st):
	"""Write string `st` to `filename` as UTF-8, replacing any existing content."""
	# `with` guarantees the handle is closed even if write() raises; the original
	# codecs.open + manual close() leaked the handle on error.
	with open(filename, "w", encoding="utf-8") as f:
		f.write(st)
 
def file_get_contents(name):
	"""Return the full contents of file `name` decoded as UTF-8."""
	# Original opened the file and never closed it (leak); `with` fixes that.
	with open(name, mode="r", encoding="utf-8") as f:  # utf-8 | Windows-1252
		return f.read()
 
def openai_run(system_prompt, user_message):
	"""Run a chat completion against the OpenAI client and return the reply text.

	NOTE(review): the module-level `client = OpenAI(api_key=OPENAI_API_KEY)`
	assignment is commented out at the top of the file, so the original code
	always died here with a bare NameError. Fail fast with an actionable
	message instead.
	"""
	if "client" not in globals():
		raise RuntimeError(
			"OpenAI client is not configured; uncomment the "
			"`client = OpenAI(api_key=OPENAI_API_KEY)` assignment at the top of the module."
		)
	messages = [
		{"role": "system", "content": system_prompt},
		{"role": "user", "content": user_message},
	]
	completion = client.chat.completions.create(
		model="gpt-4o-mini",  # "gpt-4o-2024-05-13"
		temperature=0,
		max_tokens=2000,
		messages=messages,
	)
	return completion.choices[0].message.content
 
def deepinfra_run(system_prompt, user_message):
	"""Send a system/user prompt pair to the DeepInfra-hosted Llama model.

	Uses the module-level `openai` client (configured with the DeepInfra
	base URL) and returns the assistant's reply text.
	"""
	conversation = [
		{"role": "system", "content": system_prompt},
		{"role": "user", "content": user_message},
	]
	response = openai.chat.completions.create(
		model="meta-llama/Meta-Llama-3.1-405B-Instruct",
		messages=conversation,
		max_tokens=4096,
	)
	return response.choices[0].message.content
 
def get_llm_answer(chunks_content, user_message):  # keywords + content
	"""Answer `user_message` strictly from `chunks_content` via the DeepInfra LLM.

	Fixes the garbled prompt wording ("Is answer is not given below" ->
	"If the answer is not given below") and separates the instruction from
	the pasted documents with a newline so they don't run together.
	"""
	gp = (
		"If the answer is not given below, say that you don't know it. "
		"Make sure to copy answers from documents without changing them.\n"
		+ chunks_content
	)
	answer = deepinfra_run(gp, user_message)
	return answer
 
def parse_keywords(content):
	"""Parse LLM keyword output into a list of keyword lists, one per chunk.

	Two formats are accepted, and may be mixed in one reply:
	  * inline   — "Chunk 1: word1, word2" (the words follow the colon), and
	  * section  — a "### Chunk 1 ###" / "** Chunk 1 **" header line followed
	    by one or more comma-separated keyword lines.

	Returns a list of lists of stripped keyword strings, in input order.
	"""
	result = []
	current_chunk = None  # open section chunk being accumulated, or None
	inline_pattern = re.compile(r'^\s*[^#:]+\s*:\s*(.+)$')  # e.g. "Chunk1: word1, word2"
	section_pattern = re.compile(r'[#\*]*\s*Chunk\s*\d+\s*[#\*]*')  # e.g. "### Chunk 2 ###"

	for line in content.strip().split('\n'):
		line = line.strip()
		if not line:
			continue
		# Reuse one match instead of running the regex twice (original matched twice).
		inline_match = inline_pattern.match(line)
		if inline_match and "Chunk" in line:
			# Flush any open section chunk first so output order matches input
			# order (the original appended the inline chunk ahead of it).
			if current_chunk:
				result.append(current_chunk)
				current_chunk = None
			words_str = inline_match.group(1)
			result.append([w.strip() for w in words_str.split(',') if w.strip()])
		elif section_pattern.match(line):
			if current_chunk:
				result.append(current_chunk)
			current_chunk = []
		elif current_chunk is not None:
			# Continuation line of a section chunk: comma-separated keywords.
			current_chunk.extend(w.strip() for w in line.split(',') if w.strip())
	if current_chunk:
		result.append(current_chunk)
	return result
 
def generate_contextual_keywords(chunked_content):
	"""Ask the LLM for per-chunk contextual keywords and parse its reply.

	`chunked_content` is the document with chunks delimited as
	"### Chunk [id] ###"; returns one keyword list per chunk.
	"""
	system_prompt = '''
	Each chunk is separated as ### Chunk [id] ###. For each chunk generate keywords required to fully understand the chunk without any need for looking at the previous chunks.
	Don't just say "List of services", because its unclear what services are you referring to. Make sure to cover all chunks.
	Sample output:
	Chunk 1: BMW X5, pricings in France
	Chunk 2: BMW X5, discounts
	'''
	llm_reply = deepinfra_run(system_prompt, chunked_content)
	print("Keywords_st:\n", llm_reply, "\n")
	return parse_keywords(llm_reply)
 
def generate_questions_bychunk(chunks):
	"""Generate 1-3 standalone questions for a random sample of chunks.

	Mutates the sampled chunk dicts in place, adding "questions" (list of
	question strings) and "idx" (the chunk's index), and returns `chunks`.
	At most min(len(chunks) // 5, 60) chunks are processed, chosen at random,
	so fewer than 5 chunks means no LLM calls at all.
	"""
	system_prompt = '''
 Given a chunk from document. Generate 1-3 questions related to the chunk. Each question must be full and not require additional context. 
 Example output:
 1. How to open new account?
 2. How much BMW X5 costs? 
	'''
	n = len(chunks)
	indexes = list(range(n))
	random.shuffle(indexes)
	# Compile once, outside the loop (the original recompiled per iteration).
	question_pattern = re.compile(r'^\s*\d+\.\s+(.*)', re.MULTILINE)
	for idx in indexes[: min(n // 5, 60)]:
		chunk = chunks[idx]
		text = "#" + (", ".join(chunk["keywords"])) + "\n" + chunk["content"]
		out = deepinfra_run(system_prompt, text)  # anthropic_run(system_prompt, text)
		# Pull out numbered lines like "1. How to open new account?".
		chunk["questions"] = question_pattern.findall(out)
		chunk["idx"] = idx
	return chunks
 
- 	
 
def temp():
	"""Smoke-test parse_keywords() against a mixed-format sample reply."""
	sample = '''
Here are the keywords for each chunk:
**Chunk 1**
3M, industrial and consumer products, electrical power transmission, renewable energy, infrastructure, Communication Markets Division, Germany
### Chunk 2 ###
3M, consumer retail, office supply products, home improvement products, Scotch brand, Post-it Products, Filtrete Filters, Thinsulate Insulation
** Chunk 3 **
3M, patents, trademarks, research and development, inventions, intellectual property, legal protection
'''
	parsed = parse_keywords(sample)
	print(parsed)
 
if __name__ == "__main__":
	temp()
 
 
  |