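# Compare the pytorch_model.bin and model.safetensors weights published for roberta-base
# on the Hugging Face Hub: load each format separately, check that the state dicts match,
# and compare logits and fill-mask predictions. Assumes torch, transformers,
# huggingface_hub, and safetensors are installed.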
import torch
from os import path

from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoTokenizer, RobertaForMaskedLM
from transformers.pipelines.base import infer_framework_load_model
def compare_models(pt_mdl, sf_mdl):
    # A blend of convert.py's generalized check_final_model with a concrete usage example.
    # First check that every tensor in the PyTorch state dict matches its safetensors counterpart.
    sf_dict = sf_mdl.state_dict()
    print(
        "Tensors the same for pt and sf hub models:",
        all([torch.allclose(v, sf_dict[k]) for k, v in pt_mdl.state_dict().items()]),
    )

    # Compare raw logits for a dummy input of token ids 0..9.
    kwargs = dict()
    kwargs["input_ids"] = torch.arange(10).unsqueeze(0)
    pt_logits = pt_mdl(**kwargs)[0]
    sf_logits = sf_mdl(**kwargs)[0]
    try:
        torch.testing.assert_close(sf_logits, pt_logits)
        print("Model outputs match!")
    except AssertionError as e:
        print(e)

    # Run a fill-mask example (using the module-level tokenizer) and compare the top predictions.
    sequence = f"To be, or not to be, that is the {tokenizer.mask_token}"
    input_seq = tokenizer.encode(sequence, return_tensors="pt")
    mask_token_index = torch.where(input_seq == tokenizer.mask_token_id)[1]  # we only want the 2nd dimension
    pt_token_logits = pt_mdl(input_seq).logits
    sf_token_logits = sf_mdl(input_seq).logits
    pt_masked_token_logits = pt_token_logits[0, mask_token_index, :]
    sf_masked_token_logits = sf_token_logits[0, mask_token_index, :]
    pt_top_tokens = torch.topk(pt_masked_token_logits, 4, dim=1).indices[0].tolist()
    sf_top_tokens = torch.topk(sf_masked_token_logits, 4, dim=1).indices[0].tolist()
    print(
        f"Pytorch masked language model output for '{sequence}' with top predicted <mask> tokens: \n"
        f"{', '.join([tokenizer.decode([token]) for token in pt_top_tokens])}"
    )
    print(
        f"Safetensors masked language model output for '{sequence}' with top predicted <mask> tokens: \n"
        f"{', '.join([tokenizer.decode([token]) for token in sf_top_tokens])}"
    )
repo_id = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Download the config plus the pytorch_model.bin and model.safetensors weights into
# separate local directories, so each format is loaded in isolation.
MODELS_DIR = "tmp/"
pt_dir = path.join(MODELS_DIR, f"{repo_id}_pt")
sf_dir = path.join(MODELS_DIR, f"{repo_id}_sf")
pt_config_path = hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=pt_dir)
sf_config_path = hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=sf_dir)
pt_model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", local_dir=pt_dir)
sf_model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors", local_dir=sf_dir)

# Load each directory through the pipeline helper, which picks the framework and weight file.
config = AutoConfig.from_pretrained(sf_config_path)
pt_framework, pt_model_infer = infer_framework_load_model(pt_dir, config)
sf_framework, sf_model_infer = infer_framework_load_model(sf_dir, config)
compare_models(pt_model_infer, sf_model_infer)
# Load again via the class infer_framework_load_model resolves to, i.e., model.__class__,
# forcing the weight format explicitly with use_safetensors.
pt_model = RobertaForMaskedLM.from_pretrained(pt_dir, use_safetensors=False)
sf_model = RobertaForMaskedLM.from_pretrained(sf_dir, use_safetensors=True)
compare_models(pt_model, sf_model)

# Same comparison with output_loading_info=True, where from_pretrained returns (model, loading_info).
pt_model, _ = RobertaForMaskedLM.from_pretrained(pt_dir, use_safetensors=False, output_loading_info=True)
sf_model, _ = RobertaForMaskedLM.from_pretrained(sf_dir, use_safetensors=True, output_loading_info=True)
compare_models(pt_model, sf_model)

# print("\nThe pytorch and safetensors models on the hub for roberta-base produce different results")