In [None]:
# Report the GPU, driver version and memory usage of the current runtime.
!nvidia-smi

Sat May 6 10:14:14 2023 
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 58C P0 29W / 70W | 13785MiB / 15360MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
 
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
+-----------------------------------------------------------------------------+


In [None]:
# Download the OpenLLaMA 7B preview checkpoint into the current working directory.
# NOTE(review): MODEL_NAME further down hard-codes /content/open_llama_7b_preview_300bt,
# so this presumably has to run with /content as the working directory — confirm.
!git clone https://huggingface.co/openlm-research/open_llama_7b_preview_300bt

In [None]:
!pip install -qqq transformers==4.28.1 --progress-bar off
!pip install -qqq bitsandbytes==0.38.1 --progress-bar off
!pip install -qqq accelerate==0.18.0 --progress-bar off
!pip install -qqq sentencepiece==0.1.99 --progress-bar off

In [None]:
import textwrap

import torch
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

In [None]:
def print_response(response: str, width: int = 110) -> None:
    """Print ``response`` re-flowed so that no line is longer than ``width``.

    The text is wrapped with ``textwrap.fill``, which treats the input as a
    single paragraph: runs of whitespace, including newlines, are collapsed
    to single spaces before wrapping.

    Args:
        response: Text to display.
        width: Maximum line length in characters. Defaults to 110, the value
            that used to be hard-coded.
    """
    print(textwrap.fill(response, width=width))

In [None]:
# Device on which process_chat creates its input tensors.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Token id assigned to tokenizer.bos_token_id once the tokenizer is loaded.
BOS_TOKEN_ID = 1
# Token id that makes process_chat stop sampling.
EOS_TOKEN_ID = 2
# Overall token budget: process_chat trims the prompt to
# MAX_TOKENS - max_new_tokens - 8 tokens before running the model.
MAX_TOKENS = 1024

In [None]:
# Local directory with the transformers-format weights from the cloned repo.
MODEL_NAME = "/content/open_llama_7b_preview_300bt/open_llama_7b_preview_300bt_transformers_weights"

# Every prompt encoded with this tokenizer is fed to the model to be continued,
# so the encoding must not end with an end-of-sequence token: with
# add_eos_token=True the prompt ids carried a trailing special token and the
# completions started with a stray "." right after the prompt.
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME, add_eos_token=False)

# Half-precision weights; device_map="auto" lets accelerate place the layers.
model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME, local_files_only=True, torch_dtype=torch.float16, device_map="auto"
)

In [None]:
# Force the tokenizer's beginning-of-sequence id to BOS_TOKEN_ID.
# NOTE(review): presumably the checkpoint's tokenizer config does not resolve
# BOS to this id on its own — confirm against the model repository.
tokenizer.bos_token_id = BOS_TOKEN_ID

## Single prompt

In [None]:
prompt = "The world's highest building is"

In [None]:
# Greedy decoding, stated explicitly. The previous `temperature=0.5` had no
# effect: temperature only rescales probabilities when sampling is enabled
# (do_sample=True), and sampling is demonstrated separately further down.
generation_config = GenerationConfig(max_new_tokens=256, do_sample=False)

In [None]:
# Encode the prompt as PyTorch tensors and move them to the device the model is on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [None]:
inputs

{'input_ids': tensor([[ 1, 347, 925, 31889, 31842, 4454, 2203, 322, 0]],
 device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [None]:
%%time

# Generation needs no gradients, so run it under inference_mode.
with torch.inference_mode():
 # `tokens` holds the prompt ids followed by the newly generated ids.
 tokens = model.generate(**inputs, generation_config=generation_config)

CPU times: user 18.2 s, sys: 306 ms, total: 18.5 s
Wall time: 21.5 s


In [None]:
tokens

tensor([[ 1, 347, 925, 31889, 31842, 4454, 2203, 322, 0, 31856,
 347, 925, 31889, 31842, 8522, 384, 2203, 322, 266, 4646,
 31892, 21527, 21880, 31856, 347, 925, 31889, 31842, 8522, 384,
 2203, 322, 266, 4646, 31892, 21527, 21880, 31856, 347, 925,
 31889, 31842, 8522, 384, 2203, 322, 266, 4646, 31892, 21527,
 21880, 31856, 347, 925, 31889, 31842, 8522, 384, 2203, 322,
 266, 4646, 31892, 21527, 21880, 31856, 347, 925, 31889, 31842,
 8522, 384, 2203, 322, 266, 4646, 31892, 21527, 21880, 31856,
 347, 925, 31889, 31842, 8522, 384, 2203, 322, 266, 4646,
 31892, 21527, 21880, 31856, 347, 925, 31889, 31842, 8522, 384,
 2203, 322, 266, 4646, 31892, 21527, 21880, 31856, 347, 925,
 31889, 31842, 8522, 384, 2203, 322, 266, 4646, 31892, 21527,
 21880, 31856, 347, 925, 31889, 31842, 8522, 384, 2203, 322,
 266, 4646, 31892, 21527, 21880, 31856, 347, 925, 31889, 31842,
 8522, 384, 2203, 322, 266, 4646, 31892, 21527, 21880, 31856,
 347, 925, 31889, 31842, 8522, 384, 2203, 322, 266, 4646,
 31892, 21527,

In [None]:
# Decode the first sequence of the batch back to text, dropping special tokens,
# and print it wrapped to the display width.
completion = tokenizer.decode(tokens[0], skip_special_tokens=True)
print_response(completion)

The world's highest building is. The world's tallest building is the Burj Khalifa. The world's tallest
building is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The world's tallest building
is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The world's tallest building is the
Burj Khalifa. The world's tallest building is the Burj Khalifa. The world's tallest building is the Burj
Khalifa. The world's tallest building is the Burj Khalifa. The world's tallest building is the Burj Khalifa.
The world's tallest building is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The
world's tallest building is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The world's
tallest building is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The world's tallest
building is the Burj Khalifa. The world's tallest building is the Burj Khalifa. The world'


## Prompting with sampling

In [None]:
def top_k_sampling(logits, k=10):
    """Sample one token id from the ``k`` highest-scoring entries of ``logits``.

    The ``k`` largest logits are renormalised with a softmax and a single
    index is drawn from that distribution.

    Args:
        logits: 1-D tensor of unnormalised scores, one per vocabulary entry.
        k: Number of top candidates to sample from. Values larger than the
            number of logits are capped, since ``torch.topk`` would otherwise
            raise.

    Returns:
        The sampled position in ``logits`` as a plain ``int``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    k = min(k, logits.size(-1))

    # Upcast so the softmax is computed in full precision; half-precision
    # logits are what the fp16 model produces.
    top_k = torch.topk(logits.float(), k)
    probabilities = torch.softmax(top_k.values, dim=-1)

    # `choice` indexes into the top-k candidates, not into the vocabulary.
    choice = torch.multinomial(probabilities, num_samples=1)
    return int(top_k.indices[choice])


def process_chat(
    model: LlamaForCausalLM,
    tokenizer: LlamaTokenizer,
    prompt: str,
    max_new_tokens: int = 256,
) -> str:
    """Continue ``prompt`` by sampling up to ``max_new_tokens`` tokens one by one.

    The prompt is tokenized and, if necessary, truncated from the left to its
    last ``MAX_TOKENS - max_new_tokens - 8`` tokens. The model is then run
    once on the prompt and once per generated token, reusing the key/value
    cache, with each next token drawn by ``top_k_sampling``. Generation stops
    early when ``EOS_TOKEN_ID`` is sampled.

    Args:
        model: Causal language model used for the forward passes.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded text of the full (untruncated) prompt followed by the
        generated tokens, with special tokens removed.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for any prompt token
            within ``MAX_TOKENS``.
    """
    prompt_ids = tokenizer(prompt).input_ids

    # The returned text always starts with the complete prompt, even when the
    # model itself only sees its tail.
    output_token_ids = list(prompt_ids)

    max_src_len = MAX_TOKENS - max_new_tokens - 8
    if max_src_len < 1:
        # A zero budget would make the slice below a no-op ([-0:] keeps
        # everything) and a negative one would drop tokens from the wrong end.
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for the prompt "
            f"within MAX_TOKENS={MAX_TOKENS}"
        )
    context_ids = prompt_ids[-max_src_len:]

    # Place inputs on the model's own device rather than on a global guess.
    device = model.device

    with torch.inference_mode():
        # First step feeds the whole context; later steps feed only the token
        # just sampled and rely on the cached keys/values.
        next_input = torch.as_tensor([context_ids], device=device)
        past_key_values = None

        for _ in range(max_new_tokens):
            out = model(
                input_ids=next_input,
                use_cache=True,
                past_key_values=past_key_values,
            )
            past_key_values = out.past_key_values

            # Logits of the last position of the single sequence in the batch.
            token_id = top_k_sampling(out.logits[0][-1])
            output_token_ids.append(token_id)

            if token_id == EOS_TOKEN_ID:
                break

            next_input = torch.as_tensor([[token_id]], device=device)

    return tokenizer.decode(output_token_ids, skip_special_tokens=True)

In [None]:
%%time
# process_chat draws every token with torch.multinomial and no seed is set,
# so the printed text differs from run to run.
prompt = "You're Michael G Scott from the office. What is your favorite phrase?"
response = process_chat(model, tokenizer, prompt)
print_response(response)

You're Michael G Scott from the office. What is your favorite phrase?. I'm a bit too old to be doing this!
What do you think of your character in the Office? Who is your favorite person you know who is also an actor?
How do you get your inspiration? How do you feel about working with John Krasinski? What is the hardest thing
about being in the show? What kind of a fan are you? What's your favorite thing about being an actor?
CPU times: user 5.44 s, sys: 0 ns, total: 5.44 s
Wall time: 5.46 s


In [None]:
prompt = "The world's highest building is"
response = process_chat(model, tokenizer, prompt)
print_response(response)

The world's highest building is tower, which is 1,336.8 meters from the ground in 2007. A building 1,336.8
meters high is called tallest building in history. The tallest building in the world is currently Burj Khalifa
(Dubai). Burj Khalifa is 829.8 meters tall. Burj Khalifa is a 77-story skyscraper. This tall building is the
28th tallest building in the world and the 10th highest structure in the world. Burj Khalifa is a mixed use
building. In addition to office space, there are apartments, hotels, shopping malls in the building. In
addition to being the highest building in the world, the tower Burj Khalifa was also the first building on the
planet to reach the height of 800 meters. This is a record that was achieved in 2010. Burj Khalifa is owned by
the real estate developer Emaar Properties. Burj Khalifa was developed in a record time, the building was
opened at 2010, but was first opened at the end of December 2010 and officially opened on January 4


In [None]:
prompt = "The best way to invest $10,000 is"
response = process_chat(model, tokenizer, prompt)
print_response(response)

The best way to invest $10,000 is The best way to invest $10,000 is to start by saving up to $5000 and then
buy stock that’s already cheap. This is a more conservative way of investing and you can buy a good stock for
a cheap price. I think you’re right, but the way you’re describing it is more like buying a stock for 50 cents
a share. If you look at the way that I describe it, it’s not a stock that’s a penny a share. It’s a stock
that’s a penny a share, so it’s a penny on a dollar. If you look at the way that I describe it, it’s a stock
that’s a penny on a dollar. It’s a penny on a dollar, so it’s a penny on a dollar. But if you look at it, it’s
a penny on a penny. I think it’s a penny on a penny. But the way you describe it is a penny on a penny. But if
you look at it, it’s a penny on a penny. I think it’s a penny on the dollar. But the way you describe it is a
penny on the penny. But


In [None]:
prompt = "The best make and model v8 manual gearbox car is"
response = process_chat(model, tokenizer, prompt)
print_response(response)

The best make and model v8 manual gearbox car is 1968-1969 gmc sierra with 4spd manual or gm overdrive auto.
If you want to know how to install a manual gearbox car, please do not hesitate to contact us. If you are
wondering how to install a manual gearbox car, please contact us at:. If you want to know how to install a
manual transaxle gearbox car, please read the following.


## References

- https://github.com/riversun/open_llama_7b_hands_on
- https://news.ycombinator.com/item?id=35798888
- https://github.com/openlm-research/open_llama
- https://huggingface.co/openlm-research/open_llama_7b_preview_300bt