| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 | 
							- import re
 
- import io
 
- import codecs
 
- import random
 
- from openai import OpenAI
 
- from config import DEEPINFRA_API_KEY
 
- openai = OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
 
- #client = OpenAI(api_key=OPENAI_API_KEY)
 
def file_put_contents(filename, st):
	"""Write string `st` to `filename` as UTF-8, replacing any existing content."""
	# `with` guarantees the handle is closed even if write() raises; the original
	# codecs.open + manual close() leaked the handle on error.
	with open(filename, "w", encoding="utf-8") as f:
		f.write(st)
 
def file_get_contents(name):
	"""Return the full contents of file `name` decoded as UTF-8."""
	# Original opened the file and never closed it (leak); `with` fixes that.
	with open(name, mode="r", encoding="utf-8") as f:  # utf-8 | Windows-1252
		return f.read()
 
def openai_run(system_prompt, user_message):
	"""Run a chat completion against the OpenAI client and return the reply text.

	NOTE(review): the module-level `client = OpenAI(api_key=OPENAI_API_KEY)`
	assignment is commented out at the top of the file, so the original code
	always died here with a bare NameError. Fail fast with an actionable
	message instead.
	"""
	if "client" not in globals():
		raise RuntimeError(
			"OpenAI client is not configured; uncomment the "
			"`client = OpenAI(api_key=OPENAI_API_KEY)` assignment at the top of the module."
		)
	messages = [
		{"role": "system", "content": system_prompt},
		{"role": "user", "content": user_message},
	]
	completion = client.chat.completions.create(
		model="gpt-4o-mini",  # "gpt-4o-2024-05-13"
		temperature=0,
		max_tokens=2000,
		messages=messages,
	)
	return completion.choices[0].message.content
 
def deepinfra_run(system_prompt, user_message):
	"""Send a system/user prompt pair to the DeepInfra-hosted Llama model.

	Uses the module-level `openai` client (configured with the DeepInfra
	base URL) and returns the assistant's reply text.
	"""
	conversation = [
		{"role": "system", "content": system_prompt},
		{"role": "user", "content": user_message},
	]
	response = openai.chat.completions.create(
		model="meta-llama/Meta-Llama-3.1-405B-Instruct",
		messages=conversation,
		max_tokens=4096,
	)
	return response.choices[0].message.content
 
def get_llm_answer(chunks_content, user_message):  # keywords + content
	"""Answer `user_message` strictly from `chunks_content` via the DeepInfra LLM.

	Fixes the garbled prompt wording ("Is answer is not given below" ->
	"If the answer is not given below") and separates the instruction from
	the pasted documents with a newline so they don't run together.
	"""
	gp = (
		"If the answer is not given below, say that you don't know it. "
		"Make sure to copy answers from documents without changing them.\n"
		+ chunks_content
	)
	answer = deepinfra_run(gp, user_message)
	return answer
 
def parse_keywords(content):
	"""Parse LLM keyword output into a list of keyword lists, one per chunk.

	Two formats are accepted, and may be mixed in one reply:
	  * inline   — "Chunk 1: word1, word2" (the words follow the colon), and
	  * section  — a "### Chunk 1 ###" / "** Chunk 1 **" header line followed
	    by one or more comma-separated keyword lines.

	Returns a list of lists of stripped keyword strings, in input order.
	"""
	result = []
	current_chunk = None  # open section chunk being accumulated, or None
	inline_pattern = re.compile(r'^\s*[^#:]+\s*:\s*(.+)$')  # e.g. "Chunk1: word1, word2"
	section_pattern = re.compile(r'[#\*]*\s*Chunk\s*\d+\s*[#\*]*')  # e.g. "### Chunk 2 ###"

	for line in content.strip().split('\n'):
		line = line.strip()
		if not line:
			continue
		# Reuse one match instead of running the regex twice (original matched twice).
		inline_match = inline_pattern.match(line)
		if inline_match and "Chunk" in line:
			# Flush any open section chunk first so output order matches input
			# order (the original appended the inline chunk ahead of it).
			if current_chunk:
				result.append(current_chunk)
				current_chunk = None
			words_str = inline_match.group(1)
			result.append([w.strip() for w in words_str.split(',') if w.strip()])
		elif section_pattern.match(line):
			if current_chunk:
				result.append(current_chunk)
			current_chunk = []
		elif current_chunk is not None:
			# Continuation line of a section chunk: comma-separated keywords.
			current_chunk.extend(w.strip() for w in line.split(',') if w.strip())
	if current_chunk:
		result.append(current_chunk)
	return result
 
def generate_contextual_keywords(chunked_content):
	"""Ask the LLM for per-chunk contextual keywords and parse its reply.

	`chunked_content` is the document with chunks delimited as
	"### Chunk [id] ###"; returns one keyword list per chunk.
	"""
	system_prompt = '''
	Each chunk is separated as ### Chunk [id] ###. For each chunk generate keywords required to fully understand the chunk without any need for looking at the previous chunks.
	Don't just say "List of services", because its unclear what services are you referring to. Make sure to cover all chunks.
	Sample output:
	Chunk 1: BMW X5, pricings in France
	Chunk 2: BMW X5, discounts
	'''
	llm_reply = deepinfra_run(system_prompt, chunked_content)
	print("Keywords_st:\n", llm_reply, "\n")
	return parse_keywords(llm_reply)
 
def generate_questions_bychunk(chunks):
	"""Generate 1-3 standalone questions for a random sample of chunks.

	Mutates the sampled chunk dicts in place, adding "questions" (list of
	question strings) and "idx" (the chunk's index), and returns `chunks`.
	At most min(len(chunks) // 5, 60) chunks are processed, chosen at random,
	so fewer than 5 chunks means no LLM calls at all.
	"""
	system_prompt = '''
 Given a chunk from document. Generate 1-3 questions related to the chunk. Each question must be full and not require additional context. 
 Example output:
 1. How to open new account?
 2. How much BMW X5 costs? 
	'''
	n = len(chunks)
	indexes = list(range(n))
	random.shuffle(indexes)
	# Compile once, outside the loop (the original recompiled per iteration).
	question_pattern = re.compile(r'^\s*\d+\.\s+(.*)', re.MULTILINE)
	for idx in indexes[: min(n // 5, 60)]:
		chunk = chunks[idx]
		text = "#" + (", ".join(chunk["keywords"])) + "\n" + chunk["content"]
		out = deepinfra_run(system_prompt, text)  # anthropic_run(system_prompt, text)
		# Pull out numbered lines like "1. How to open new account?".
		chunk["questions"] = question_pattern.findall(out)
		chunk["idx"] = idx
	return chunks
 
- 	
 
def temp():
	"""Smoke-test parse_keywords() against a mixed-format sample reply."""
	sample = '''
Here are the keywords for each chunk:
**Chunk 1**
3M, industrial and consumer products, electrical power transmission, renewable energy, infrastructure, Communication Markets Division, Germany
### Chunk 2 ###
3M, consumer retail, office supply products, home improvement products, Scotch brand, Post-it Products, Filtrete Filters, Thinsulate Insulation
** Chunk 3 **
3M, patents, trademarks, research and development, inventions, intellectual property, legal protection
'''
	parsed = parse_keywords(sample)
	print(parsed)
 
if __name__ == "__main__":
	temp()
 
 
  |