
Merge branch 'main' of github.com:meta-llama/llama-recipes into main

Conflicts:
	.github/scripts/spellcheck_conf/wordlist.txt

 Changes to be committed:
	modified:   .github/scripts/spellcheck_conf/wordlist.txt
	modified:   README.md
	modified:   pyproject.toml
	modified:   recipes/quickstart/finetuning/datasets/custom_dataset.py
	modified:   recipes/quickstart/inference/local_inference/inference.py
	modified:   requirements.txt
	modified:   src/tests/conftest.py
	modified:   src/tests/datasets/test_custom_dataset.py
Beto, 6 months ago (commit 0b1228fb83)

+ 1 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1454,3 +1454,4 @@ acc
 OCRVQA
 OCRVQADataCollator
 ocrvqa
+langchain

+ 4 - 0
README.md

@@ -76,6 +76,10 @@ To use the sensitive topics safety checker install with:
 ```
 pip install llama-recipes[auditnlg]
 ```
+Some recipes require langchain. To install the required packages, follow the recipe description or install them with:
+```
+pip install llama-recipes[langchain]
+```
 Optional dependencies can also be combined with [option1,option2].
 
 #### Install from source
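Since extras combine, a single command can pull in multiple optional dependency groups, e.g.:

```
pip install llama-recipes[langchain,auditnlg]
```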

+ 1 - 0
pyproject.toml

@@ -24,6 +24,7 @@ dynamic = ["dependencies"]
 vllm = ["vllm"]
 tests = ["pytest-mock"]
 auditnlg = ["auditnlg"]
+langchain = ["langchain_openai", "langchain", "langchain_community"]
 
 [project.urls]
 "Homepage" = "https://github.com/facebookresearch/llama-recipes/"

+ 17 - 6
recipes/quickstart/finetuning/datasets/custom_dataset.py

@@ -9,19 +9,30 @@ import itertools
 
 
 B_INST, E_INST = "[INST]", "[/INST]"
+EOT_ID = 128009  # <|eot_id|>
+
+def mask_target(target, seq):
+    # Replace every occurrence of `target` in `seq` with -100 so those
+    # positions are ignored by the loss function; the +1 below lets a
+    # match at the very end of `seq` be found as well.
+    for i in range(len(seq) - len(target) + 1):
+        if seq[i:i + len(target)] == target:
+            seq[i:i + len(target)] = [-100] * len(target)
+    return seq
 
 def tokenize_dialog(dialog, tokenizer):
     if tokenizer.vocab_size >= 128000:
         dialog_tokens = tokenizer.apply_chat_template(dialog)
-        dialog_tokens = dialog_tokens[:-4] # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n
-        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == 128009]
+        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == EOT_ID]
         labels = copy.copy(dialog_tokens)
-        last_idx = 0
+        # determine the token IDs marking the "system" and "user" role headers
+        system_or_user = (tokenizer.encode("system")[-1], tokenizer.encode("user")[-1])
+        labels[0] = -100 # bos token
+        last_idx = 1
         for n, idx in enumerate(eot_indices):
-            if n % 2 == 1:
-                last_idx = idx
-            else:
+            role_token = labels[last_idx+1]
+            if role_token in system_or_user:
+                # Set labels to -100 for system and user tokens to ignore in loss function
                 labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+            last_idx = idx + 1
+        mask_target(tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False), labels)
 
         dialog_tokens = [dialog_tokens]
         labels_tokens = [labels]
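The new scheme stops assuming a strict user/assistant alternation: it walks the `<|eot_id|>` boundaries, masks any turn whose role header token is `system` or `user`, and finally masks the assistant header itself via `mask_target`. A toy illustration of `mask_target` (the token IDs and `header` below are made up for clarity, not the real Llama 3 vocabulary):

```python
# Every occurrence of `header` in `labels` is overwritten with -100,
# the label value the loss function is told to ignore.
def mask_target(target, seq):
    for i in range(len(seq) - len(target) + 1):
        if seq[i:i + len(target)] == target:
            seq[i:i + len(target)] = [-100] * len(target)
    return seq

labels = [1, 7, 8, 9, 5, 6, 2, 7, 8, 9, 4, 2]
header = [7, 8, 9]  # stands in for the tokenized assistant header
print(mask_target(header, labels))
# -> [1, -100, -100, -100, 5, 6, 2, -100, -100, -100, 4, 2]
```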

+ 5 - 1
recipes/quickstart/inference/local_inference/inference.py

@@ -6,7 +6,6 @@ import sys
 import time
 
 import fire
-import gradio as gr
 
 import torch
 
@@ -146,6 +145,11 @@ def main(
         user_prompt = "\n".join(sys.stdin.readlines())
         inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
     else:
+        try:
+            import gradio as gr
+        except ImportError:
+            raise ImportError("This part of the recipe requires gradio. Please run `pip install gradio`")
+
         gr.Interface(
             fn=inference,
             inputs=[
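Deferring the `gradio` import into the UI branch turns it into a soft dependency: plain CLI inference now works without gradio installed, and only the UI path raises. The same guarded-import pattern as a standalone sketch (the function name is illustrative):

```python
def launch_ui(inference_fn):
    # Import the optional dependency only on the code path that needs it,
    # and fail with an actionable message if it is missing.
    try:
        import gradio as gr
    except ImportError as e:
        raise ImportError(
            "The UI requires gradio. Please run `pip install gradio`"
        ) from e
    return gr.Interface(fn=inference_fn, inputs="text", outputs="text")
```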

+ 0 - 4
requirements.txt

@@ -14,7 +14,6 @@ py7zr
 scipy
 optimum
 matplotlib
-gradio
 chardet
 openai
 typing-extensions==4.8.0
@@ -24,8 +23,5 @@ rouge_score
 pyyaml==6.0.1
 faiss-gpu; python_version < '3.11'
 unstructured[pdf]
-langchain_openai
-langchain
-langchain_community
 sentence_transformers
 codeshield

+ 1 - 1
src/tests/conftest.py

@@ -6,7 +6,7 @@ import pytest
 from transformers import AutoTokenizer
 
 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into the huggingface hub and provide the correct token?"
-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B-Instruct"]
 
 @pytest.fixture(params=LLAMA_VERSIONS)
 def llama_version(request):

+ 28 - 1
src/tests/datasets/test_custom_dataset.py

@@ -11,7 +11,7 @@ EXPECTED_RESULTS={
         "example_1": "[INST] Who made Berlin [/INST] dunno",
         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
     },
-    "meta-llama/Meta-Llama-3.1-8B":{
+    "meta-llama/Meta-Llama-3.1-8B-Instruct":{
         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
     },
@@ -114,3 +114,30 @@ def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train,
         }
     with pytest.raises(AttributeError):
         main(**kwargs)
+
+@pytest.mark.skip_missing_tokenizer
+@patch('llama_recipes.finetuning.AutoTokenizer')
+def test_tokenize_dialog(tokenizer, monkeypatch, setup_tokenizer, llama_version):
+    monkeypatch.syspath_prepend("recipes/quickstart/finetuning/datasets/")
+    from custom_dataset import tokenize_dialog
+
+    setup_tokenizer(tokenizer)
+    tokenizer = tokenizer.from_pretrained()
+
+    dialog = [
+        {"role":"user", "content":"Who made Berlin?"},
+        {"role":"assistant", "content":"dunno"},
+        {"role":"user", "content":"And Rome?"},
+        {"role":"assistant", "content":"Romans"},
+    ]
+
+    result = tokenize_dialog(dialog, tokenizer)
+
+    if "Llama-2" in llama_version:
+        assert result["labels"][:12] == [-100] * 12
+        assert result["labels"][17:28] == [-100] * 11
+        assert result["labels"].count(-100) == 11 + 12
+    else:
+        assert result["labels"][:38] == [-100] * 38
+        assert result["labels"][43:54] == [-100] * 11
+        assert result["labels"].count(-100) == 38 + 11
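
The new test can be run on its own (it needs huggingface hub access, since the `skip_missing_tokenizer` fixtures pull the real Llama tokenizers):

```
pytest src/tests/datasets/test_custom_dataset.py -k test_tokenize_dialog
```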