```python
import os
import requests
import json
import time
import io
import re
import gradio as gr
import PyPDF2
from together import Together
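# NOTE: the Together(...) calls below use a placeholder API key. Replace
# "Your API key here" with a real key; the Together SDK can also read the
# TOGETHER_API_KEY environment variable if the client is built as Together().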
 
def download_pdf(url, save_path=None):
    # Fetch a PDF from arXiv; optionally persist it to save_path.
    if url is None or 'arxiv.org' not in url:
        return None
    response = requests.get(url)
    response.raise_for_status()  # surface HTTP errors instead of saving an error page
    if save_path:
        with open(save_path, 'wb') as f:
            f.write(response.content)
    return response.content
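# Example (hypothetical filename, paper ID taken from the sample URL below):
#   pdf_bytes = download_pdf("https://arxiv.org/pdf/1706.03762.pdf", "attention.pdf")
# returns the raw PDF bytes and also writes them to attention.pdf.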
 
def extract_arxiv_pdf_url(arxiv_url):
    # Check if the URL is already in PDF format
    if 'arxiv.org/pdf/' in arxiv_url:
        return arxiv_url

    # Extract the arXiv ID from the other common URL formats
    arxiv_id = None
    if 'arxiv.org/abs/' in arxiv_url:
        arxiv_id = arxiv_url.split('arxiv.org/abs/')[1].split()[0]
    elif 'arxiv.org/html/' in arxiv_url:
        arxiv_id = arxiv_url.split('arxiv.org/html/')[1].split()[0]

    if arxiv_id:
        return f"https://arxiv.org/pdf/{arxiv_id}.pdf"

    return None  # No valid arXiv ID found
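# e.g. extract_arxiv_pdf_url("https://arxiv.org/abs/1706.03762v7")
#   -> "https://arxiv.org/pdf/1706.03762v7.pdf"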
 
def extract_text_from_pdf(pdf_content):
    pdf_file = io.BytesIO(pdf_content)
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n"
    return text
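# Note: PyPDF2's text extraction is layout-based, so multi-column papers may
# come out with interleaved lines; in practice that is usually good enough for
# the LLM reference-extraction pass below.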
 
def extract_references_with_llm(pdf_content):
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_content)

    # Truncate if too long
    max_length = 50000
    if len(text) > max_length:
        text = text[:max_length] + "..."

    client = Together(api_key="Your API key here")

    # First pass: pull the raw citation list out of the References section
    citations_response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=[
            {
                "role": "user",
                "content": f"Extract all the arXiv citations from the References section of the paper, including their title, authors, and origins. Paper: {text}"
            }
        ],
        temperature=0.3,
    )
    citations = citations_response.choices[0].message.content

    # Second pass: prepare the prompt for Llama 4 to normalize the list into JSON
    prompt = f"""
                Extract the arXiv ID from each citation in the list provided, including preprint arXiv IDs. If a citation has no arXiv ID, skip it.

                Here are some examples of the arXiv ID format:
                1. arXiv preprint arXiv:1607.06450, where 1607.06450 is the arXiv ID.
                2. CoRR, abs/1409.0473, where 1409.0473 is the arXiv ID.
                Then, return a JSON array of objects with 'title' and 'ID' fields strictly in the following format, and only return a paper title if its arXiv ID was extracted:
                Output format: [{{"title": "Paper Title", "ID": "arXiv ID"}}]
                DO NOT return any other text.
                List of citations:
                {citations}
                """

    response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.3,
    )
    response_json = response.choices[0].message.content

    # Convert the JSON string to a Python object
    references = []
    try:
        references = json.loads(response_json)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    return references
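# Design note: extract_references_with_llm is a two-pass prompt chain. The
# first call isolates the citation list from up to 50k characters of paper
# text; the second reduces that list to strict JSON. A low temperature (0.3)
# keeps the output format stable across runs.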
 
# Check whether ref_id looks like a valid arXiv ID
def is_valid_arxiv_id(ref_id):
    # New-style IDs look like "1234.56789"; old-style numeric parts are 7 digits
    return bool(re.match(r'^\d{4}\.\d{4,5}$', ref_id) or re.match(r'^\d{7}$', ref_id))
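# e.g. is_valid_arxiv_id("1607.06450") -> True; is_valid_arxiv_id("abs/1409") -> False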
 
def download_arxiv_paper_and_citations(arxiv_url, download_dir, progress=None):
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    if progress:
        progress("Downloading main paper PDF...")

    # Download the main paper PDF
    pdf_url = extract_arxiv_pdf_url(arxiv_url)
    main_pdf_path = os.path.join(download_dir, 'main_paper.pdf')
    main_pdf_content = download_pdf(pdf_url, main_pdf_path)

    if main_pdf_content is None:
        if progress:
            progress("Invalid URL. Valid example: https://arxiv.org/abs/1706.03762v7")
        return None, 0

    if progress:
        progress("Main paper downloaded. Extracting references...")

    # Extract references using the LLM
    references = extract_references_with_llm(main_pdf_content)
    if progress:
        progress(f"Found {len(references)} references. Downloading...")
        time.sleep(1)

    # Download reference PDFs
    all_pdf_paths = [main_pdf_path]
    for i, reference in enumerate(references):
        ref_title = reference.get("title")
        ref_id = reference.get("ID")
        if ref_id and is_valid_arxiv_id(ref_id):
            ref_url = f'https://arxiv.org/pdf/{ref_id}'
            # Strip characters that are invalid in filenames from the title
            safe_title = re.sub(r'[\\/*?:"<>|]', '_', ref_title or ref_id)
            ref_pdf_path = os.path.join(download_dir, f'{safe_title}.pdf')
            if progress:
                progress(f"Downloading reference {i+1}/{len(references)}... {ref_title}")
                time.sleep(0.2)
            try:
                download_pdf(ref_url, ref_pdf_path)
                all_pdf_paths.append(ref_pdf_path)
            except Exception as e:
                if progress:
                    progress(f"Error downloading {ref_url}: {str(e)}")
                    time.sleep(0.2)

    # Write out the list of all PDF paths
    paths_file = os.path.join(download_dir, 'pdf_paths.txt')
    with open(paths_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_pdf_paths))

    if progress:
        progress(f"All papers downloaded. Total references: {len(references)}")
        time.sleep(1)
    return paths_file, len(references)
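# The pdf_paths.txt manifest decouples downloading from ingestion, so the
# ingest step below can be rerun without re-downloading anything.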
 
def ingest_paper_with_llama(paths_file, progress=None):
    total_text = ""
    total_word_count = 0
    if progress:
        progress("Ingesting paper content...")
    with open(paths_file, 'r', encoding='utf-8') as f:
        pdf_paths = f.read().splitlines()

    for i, pdf_path in enumerate(pdf_paths):
        if progress:
            progress(f"Ingesting PDF {i+1}/{len(pdf_paths)}...")
        with open(pdf_path, 'rb') as pdf_file:
            pdf_content = pdf_file.read()
            text = extract_text_from_pdf(pdf_content)
            total_text += text + "\n\n"
            total_word_count += len(text.split())
    if progress:
        progress("Paper ingestion complete!")
    return total_text, total_word_count
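# Caveat: total_text (the main paper plus every reference) is later placed in
# a single system prompt, so very citation-heavy papers may exceed the model's
# context window; total_word_count gives a rough sense of that size.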
 
def gradio_interface():
    paper_content = {"text": ""}

    def process(arxiv_url, progress=gr.Progress()):
        download_dir = 'downloads'
        progress(0, "Starting download...")
        paper_path, num_references = download_arxiv_paper_and_citations(
            arxiv_url, download_dir, lambda msg: progress(0.3, msg))
        if paper_path is None:
            return "Invalid URL. Valid example: https://arxiv.org/abs/1706.03762v7"

        paper_content["text"], total_word_count = ingest_paper_with_llama(
            paper_path, lambda msg: progress(0.7, msg))
        progress(1.0, "Ready for chat!")
        return f"Total {total_word_count} words and {num_references} references ingested. You can now chat about the paper and its citations."
 
    def respond(message, history):
        user_message = message
        if not user_message:
            yield history, ""
            return

        # Append the user message immediately so it renders before streaming starts
        history.append([user_message, ""])
        yield history, ""

        client = Together(api_key="Your API key here")
        # Prepare the system prompt with the ingested paper text
        system_prompt = f"""
                    You are a research assistant that has access to the paper references below.
                    Answer questions based on your knowledge of these references.
                    If you do not know the answer, say you don't know.
                    Paper references: {paper_content["text"]}
                    """

        stream = client.chat.completions.create(
            model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=0.3,
            stream=True  # Enable streaming
        )

        # Accumulate and stream the response chunks
        full_response = ""
        for chunk in stream:
            if len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None:
                full_response += chunk.choices[0].delta.content
                # Update the last message in history with the partial response
                history[-1][1] = full_response
                yield history, ""
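    # respond() is a generator: Gradio re-renders the Chatbot on each yield,
    # so the assistant's reply appears incrementally as chunks arrive.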
 
    def clear_chat_history():
        # Helper to reset the chat (not yet wired to a button)
        return [], ""

    with gr.Blocks(css=".orange-button {background-color: #FF7C00 !important; color: white;}") as demo:
        gr.Markdown("# Research Analyzer")
        with gr.Column():
            input_text = gr.Textbox(label="ArXiv URL")
            status_text = gr.Textbox(label="Status", interactive=False)
            submit_btn = gr.Button("Ingest", elem_classes="orange-button")
            submit_btn.click(fn=process, inputs=input_text, outputs=status_text)

            gr.Markdown("## Chat with Llama")
            chatbot = gr.Chatbot()
        with gr.Row():
            msg = gr.Textbox(label="Ask about the paper", scale=5)
            submit_chat_btn = gr.Button("➤", elem_classes="orange-button", scale=1)

        submit_chat_btn.click(respond, [msg, chatbot], [chatbot, msg])
        msg.submit(respond, [msg, chatbot], [chatbot, msg])

        def copy_last_response(history):
            # Helper for copying the assistant's last reply (not yet wired to a button)
            if history and len(history) > 0:
                last_response = history[-1][1]
                return gr.update(value=last_response)
            return gr.update(value="No response to copy")

    demo.launch()

if __name__ == "__main__":
    gradio_interface()
```
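To try the script locally, install the dependencies (`pip install requests PyPDF2 gradio together`), supply a Together API key as described above, and run it (e.g. `python app.py`, assuming that filename); Gradio will print a local URL for the interface.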
 
 