# Contextual keywords generation for Financial report document
In this Example we will use a Financial report document taken from https://github.com/patronus-ai/financebench/tree/main/pdfs

**Steps**:
1) Parse financial report document (pdf file) using LlamaParse.
2) Split document into chunks. Here we use basic tokens-based chunking with constant chunk_size, but you can use any other method for chunking.
3) Generate contextual keywords for each chunk.
4) Create questions related to randomly selected chunks since a predefined test set is unavailable.
5) Evaluate the method by retrieving the top five most relevant chunks based on cosine similarity between chunk and question embeddings, checking if the correct chunk is included.
6) To compare results with raw content (without keywords), create a new index in the "Create index" section and modify the document structure by replacing Document(text='#'+x['keywords']+'\n'+x['content'], .. ) with Document(text=x['content'], .. ).


In [None]:
# Install dependencies
!pip install tiktoken
!pip install torch
!pip install transformers
!pip install llama_parse
!pip install llama_index

import random
import os
import sys
import json
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from llama_index.core.schema import Document
from config import LLAMAPARSE_API_KEY

# Enable nested async loops
import nest_asyncio
nest_asyncio.apply()


In [None]:
# download the pdf file into ./data folder 
!wget -P data/ https://github.com/patronus-ai/financebench/raw/main/pdfs/AMAZON_2015_10K.pdf
!mkdir temp

# Parse pdf file
parser = LlamaParse(
 api_key=LLAMAPARSE_API_KEY,
 result_type="markdown" # "markdown" and "text" are available
)
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['./data/AMAZON_2015_10K.pdf'], file_extractor=file_extractor).load_data()
print("pdf file pages:", len(documents))

In [12]:
# Split into chunks (by tokens)
from helper import file_get_contents, file_put_contents, generate_contextual_keywords, get_llm_answer, generate_questions_bychunk
import tiktoken
enc = tiktoken.get_encoding("o200k_base")

def split_into_chunks(content, chunk_size):
	a = enc.encode(content)
	left, chunks = 0, []
	while left < len(a):
		arr = a[left : left+chunk_size]
		chunks.append(enc.decode(arr))
		left+=chunk_size
	return chunks
 
def generate_chunked_content(chunks):
 chunked_content = ""
 for idx, text in enumerate(chunks):
 chunked_content+=f"### Chunk {idx+1} ###\n{text}\n\n"
 return chunked_content
 

# Generate contextual keywords
path = "./temp/chunks2.json"
if not os.path.exists(path):
 print("Generating keywords..")
 document_content, chunks, chunks2 = "", [], []
 for doc in documents: document_content+=doc.text+"\n"
 chunks1 = split_into_chunks(document_content, 400) #400 -- defaulf value
 for i, chunk in enumerate(chunks1):
 chunks.append(chunk)
 if (len(chunks) > 10 or (i==len(chunks1)-1) and len(chunks)>2):
 chunked_content = generate_chunked_content(chunks)
 keywords = generate_contextual_keywords(chunked_content) 
 print("page_end:", i+1, keywords, len(keywords), len(chunks)) 
 assert len(keywords) >= len(chunks)
 for j in range(len(chunks)): chunks2.append( {"idx":j, "keywords":keywords[j], "content":chunks[j]} )
 chunks = []
 file_put_contents(path, json.dumps(chunks2))
else:
 chunks2 = json.loads(file_get_contents(path)) #it has content, keywords, idx


# Generate questions
path = "./temp/chunks3.json"
if not os.path.exists(path):
 print("Generating questions..")
 chunks3 = generate_questions_bychunk(chunks2) 
 file_put_contents(path, json.dumps(chunks3))
else:
 chunks3 = json.loads(file_get_contents(path)) #it has content, keywords, questions, idx now

In [9]:
# Create Index 
from llama_index.core import GPTVectorStoreIndex, StorageContext, load_index_from_storage, Settings
from embedding import LocalJinaEmbedding #locally run the jinai embedding model

INDEX_DIR = "./temp/local_index_cache"
if not os.path.exists(INDEX_DIR):
 print("Creating new index ...") 
 Settings.embed_model = LocalJinaEmbedding()
 Settings.llm = None
 documents2 = [Document(text='#'+",".join(x['keywords'])+'\n'+x['content'], metadata={"id": str(x["idx"])}) for x in chunks3] 
 index = GPTVectorStoreIndex.from_documents(documents2)
 index.storage_context.persist(persist_dir=INDEX_DIR)
else:
 storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
 index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(similarity_top_k=5)

# Run tests
count, correct = 0, 0
for test in chunks3[:]:
 if not "questions" in test: continue
 idx = test["idx"]
 for question in test["questions"]:
 count+=1
 response = query_engine.query(question)
 print("\n\n--- Test:", question, "idx:", idx)
 for result in response.source_nodes[:]:
 print(result.node.metadata) #prompt+=f"\n\n\n 
 if result.node.metadata['id'] == str(idx): correct+=1 

print("Test correct, all:", correct, count)



--- Test: What is the fiscal year end date for Adobe Systems Incorporated's annual report filed with the SEC in 2015? idx: 0
{'id': '7'}
{'id': '0'}
{'id': '0'}
{'id': '4'}
{'id': '2'}


--- Test: What is the Commission File Number for Adobe Systems Incorporated? idx: 0
{'id': '1'}
{'id': '0'}
{'id': '10'}
{'id': '468'}
{'id': '397'}


--- Test: In which state is Adobe Systems Incorporated incorporated or organized? idx: 0
{'id': '10'}
{'id': '280'}
{'id': '9'}
{'id': '1'}
{'id': '8'}


--- Test: How is Adobe's business organized? idx: 13
{'id': '10'}
{'id': '280'}
{'id': '10'}
{'id': '9'}
{'id': '5'}


--- Test: What are Adobe's two strategic growth opportunities? idx: 13
{'id': '0'}
{'id': '3'}
{'id': '13'}
{'id': '1'}
{'id': '6'}


--- Test: What are the three reportable segments of Adobe's business? idx: 13
{'id': '7'}
{'id': '10'}
{'id': '184'}
{'id': '13'}
{'id': '9'}


--- Test: What is Adobe Digital Publishing Solution used for? idx: 16
{'id': '3'}
{'id': '9'}
{'id': '3'}
{'i