Comparing generation speed with and without KV caching. With KV caching enabled, the keys and values computed for earlier tokens are reused at each decoding step instead of being recomputed, which speeds up autoregressive generation.
import time

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

# Tokenize once outside the timed loop so only generation is measured.
inputs = tokenizer("What is KV caching?", return_tensors="pt").to(device)

for use_cache in (True, False):
    times = []
    for _ in range(10):  # measure 10 generations per setting
        start = time.time()
        model.generate(**inputs, use_cache=use_cache, max_new_tokens=1000)
        times.append(time.time() - start)
    print(f"{'with' if use_cache else 'without'} KV caching: "
          f"{round(np.mean(times), 3)} +- {round(np.std(times), 3)} seconds")