```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("tomg-group-umd/step-00047360-recurrence_full_512_0", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/step-00047360-recurrence_full_512_0")
device = torch.device("cuda:0")
# encode the prompt, then slice off the trailing special token so the model continues the prompt
input_ids = tokenizer.encode("The capital of Westphalia is", return_tensors="pt", add_special_tokens=True).to(device)[:, :-1]
model.eval()
model.to(device)
model(input_ids)
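# optional sanity check: greedily decode the model's next-token prediction
# (assumes the remote-code output follows the usual CausalLMOutput layout with .logits)
logits = model(input_ids).logits
print(tokenizer.decode(logits[0, -1].argmax().item()))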
# or, more efficiently:
amp_settings = {"device_type": "cuda", "enabled": True, "dtype": torch.bfloat16}
if not amp_settings["enabled"]:
    torch.backends.cuda.enable_math_sdp(True)  # fall back to the math SDP kernel when autocast is off
with torch.autocast(**amp_settings), torch.no_grad():
    model(input_ids=input_ids)
###### Caching:
# first step:
past_key_values = None
outputs = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
past_key_values = outputs.past_key_values
# next step:
outputs = model(input_ids=input_ids, use_cache=True, past_key_values=past_key_values)
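
# end-to-end decoding should also work through the standard generate API,
# since the checkpoint loads as an AutoModelForCausalLM; a sketch under that
# assumption (remote-code models may expose additional, checkpoint-specific controls):
with torch.autocast(**amp_settings), torch.no_grad():
    generated = model.generate(input_ids, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))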