# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import json
from typing import List, Literal, TypedDict

Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    role: Role
    content: str


Dialog = List[Message]

# Llama 2 chat prompt delimiters: instruction markers and system-prompt markers.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


def format_tokens(dialogs, tokenizer):
    """Encode a batch of dialogs into Llama 2 chat prompt token lists."""
    prompt_tokens = []
    for dialog in dialogs:
        # Fold an optional leading system message into the first user turn.
        if dialog[0]["role"] == "system":
            dialog = [
                {
                    "role": dialog[1]["role"],
                    "content": B_SYS
                    + dialog[0]["content"]
                    + E_SYS
                    + dialog[1]["content"],
                }
            ] + dialog[2:]
        assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
            [msg["role"] == "assistant" for msg in dialog[1::2]]
        ), (
            "model only supports 'system', 'user' and 'assistant' roles, "
            "starting with user and alternating (u/a/u/a/u...)"
        )
        # Please verify that your tokenizer supports adding "[INST]" and "[/INST]"
        # to your inputs; here we are adding them manually.
        dialog_tokens: List[int] = sum(
            [
                tokenizer.encode(
                    f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ",
                )
                + [tokenizer.eos_token_id]
                for prompt, answer in zip(dialog[::2], dialog[1::2])
            ],
            [],
        )
        assert (
            dialog[-1]["role"] == "user"
        ), f"Last message must be from user, got {dialog[-1]['role']}"
        # Encode the final (unanswered) user turn without an EOS token so the
        # model continues with the assistant reply.
        dialog_tokens += tokenizer.encode(
            f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}",
        )
        prompt_tokens.append(dialog_tokens)
    return prompt_tokens


def read_dialogs_from_file(file_path):
    """Load a list of dialogs from a JSON file."""
    with open(file_path, "r") as file:
        dialogs = json.load(file)
    return dialogs
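

# A minimal usage sketch (not part of the original file), assuming a Hugging Face
# Llama 2 chat tokenizer and a JSON file of dialogs. The model name
# "meta-llama/Llama-2-7b-chat-hf" and the path "chats.json" are illustrative
# placeholders, not values taken from this repo.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # chats.json is expected to contain a list of dialogs, each a list of
    # {"role": ..., "content": ...} messages, e.g.
    # [[{"role": "user", "content": "What is the capital of France?"}]]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
    dialogs = read_dialogs_from_file("chats.json")
    batch = format_tokens(dialogs, tokenizer)
    print(f"Encoded {len(batch)} dialogs; first prompt has {len(batch[0])} tokens")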
 |