Debugging a CodeLlama fine-tuning
# %%
from datetime import datetime
import os
import sys

import gpustat

gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset

dataset = load_dataset("b-mc2/sql-create-context", split="train")
# Split once (seeded for reproducibility) so the train and eval sets cannot overlap,
# which two independent calls to train_test_split could allow.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
# %%
base_model_name = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
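# %% [markdown]
# A quick memory sanity check that pairs with the gpustat printouts (illustrative):
# `get_memory_footprint` reports the model's parameter memory, which should come out
# to roughly 7 GB for a 7B model loaded in 8-bit.

# %%
print(f"model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")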
# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""

# a sample data point from the dataset, for reference:
# {'question': 'Name the comptroller for office of prohibition', 'context': 'CREATE TABLE table_22607062_1 (comptroller VARCHAR, ticket___office VARCHAR)', 'answer': 'SELECT comptroller FROM table_22607062_1 WHERE ticket___office = "Prohibition"'}

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

# %% [markdown]
# I get the output:
# ```
# SELECT * FROM table_name_12 WHERE class > 91.5 AND city_of_license = 'hyannis, nebraska'
# ```
# which is clearly wrong if the input is asking for just the class!

# %% [markdown]
# ### 4. Tokenization
# Set up some tokenization settings, like left padding, because it makes [training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa):

# %%
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
# %% [markdown]
# Set up the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

# %%
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )
    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result
# %% [markdown]
# Then convert each data_point into a prompt, using a template I found online that works quite well:

# %%
def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
{data_point["question"]}
### Context:
{data_point["context"]}
### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)
# %% [markdown]
# Reformat to prompt and tokenize each sample:

# %%
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
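# %% [markdown]
# A quick sanity check on one mapped sample (illustrative): decode it back and confirm
# the prompt, the answer, and the EOS token all survived tokenization.

# %%
sample = tokenized_train_dataset[0]
print(len(sample["input_ids"]), "tokens")
print(tokenizer.decode(sample["input_ids"]))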
# %% [markdown]
# ### 5. Setup LoRA

# %%
model.train()  # put model back into training mode
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
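# %% [markdown]
# Worth checking that LoRA is wired up as expected: only the adapter weights should be
# trainable, which should come out to well under 1% of the 7B base parameters.

# %%
model.print_trainable_parameters()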
# %% [markdown]
# To resume from a checkpoint, set resume_from_checkpoint to the path of the adapter_model.bin you want to resume from. This code will replace the LoRA adapter attached to the model:

# %%
resume_from_checkpoint = ""  # e.g. "./sql-code-llama/checkpoint-380/adapter_model.bin"
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")
# %%
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 GPU is available
    print("total devices: ", torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True
else:
    print("only 1 gpu available")
# %% [markdown]
# ### 6. Training arguments
# If you run out of GPU memory, lower per_device_train_batch_size.
# Because gradient_accumulation_steps is computed from it, the effective
# batch size stays the same, so the change shouldn't affect batch dynamics
# during the training run. All the other variables are standard stuff that
# I wouldn't recommend messing with:

# %%
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "../checkpoints"

training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=400,
    learning_rate=3e-4,
    fp16=True,
    bf16=False,  # bf16 can only be True if fp16 is False; you cannot enable both
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",  # if val_set_size > 0 else "no"
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    # save_total_limit=3,
    load_best_model_at_end=False,
    # ddp_find_unused_parameters=False if ddp else None,
    group_by_length=True,  # group sequences of roughly the same length together to speed up training
    report_to="tensorboard",  # if use_wandb else "none"
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # if use_wandb else None
    logging_dir="../logs/runs",
)
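# %% [markdown]
# A quick check on the numbers above (illustrative): the effective batch size per optimizer
# step is per_device_train_batch_size * gradient_accumulation_steps (times the number of GPUs),
# so it stays at 128 as long as gradient_accumulation_steps is recomputed as above.

# %%
print("effective batch size per optimizer step:",
      per_device_train_batch_size * gradient_accumulation_steps)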
# %%
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# %% [markdown]
# Then patch state_dict so that saving the model stores only the LoRA adapter weights,
# and (on PyTorch 2+) compile the model, which speeds up training without affecting accuracy:

# %%
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
# %%
gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)

trainer.train()

# Save the trained adapter
new_model = "../models/sql-code-llama"
trainer.model.save_pretrained(new_model)

gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)
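# %% [markdown]
# Optionally, save the tokenizer alongside the adapter so the output directory can be
# loaded on its own later (note it carries the padding/EOS settings set above):

# %%
tokenizer.save_pretrained(new_model)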
# %% [markdown]
# ### Load the final checkpoint
# Now for the moment of truth! Has our work paid off...?

# %%
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel

base_model_name = "codellama/CodeLlama-7b-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, new_model)
# model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# %% [markdown]
# To load a fine-tuned LoRA/QLoRA adapter, use PeftModel.from_pretrained.
# `output_dir` should be a directory containing an adapter_config.json and adapter_model.bin.
# The run took forever, so all I have is this checkpoint:
# output_dir = "./sql-code-llama/checkpoint-360"
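# %% [markdown]
# A minimal sketch of that checkpoint load (illustrative; the path is just the example above,
# substitute whichever checkpoint directory you actually have):

# %%
import os

checkpoint_dir = "./sql-code-llama/checkpoint-360"  # must contain adapter_config.json and adapter_model.bin
if os.path.exists(checkpoint_dir):
    model = PeftModel.from_pretrained(base_model, checkpoint_dir)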
# %% [markdown]
# Try the same prompt as before:

# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# And the model outputs:
# ```
# SELECT class FROM table_name_12 WHERE frequency_mhz > 91.5 AND city_of_license = "hyannis, nebraska"
# ```
# So it works! If you want to convert this adapter to a Llama.cpp model to run locally, follow my other [guide](https://ragntune.com/blog/A-guide-to-running-Llama-2-qlora-loras-on-Llama.cpp). If you have any questions, shoot me a message on [Elon Musk's website](https://twitter.com/samlhuillier_).
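# %% [markdown]
# For the Llama.cpp route, the usual first step is merging the adapter into the base weights
# and saving a plain Hugging Face model directory for the conversion script to read. A minimal
# sketch, assuming you reload the base model in fp16 (merging into the 8-bit model above is not
# supported), have enough memory for the full 7B model, and are happy with the example output path:

# %%
base_fp16 = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
merged = PeftModel.from_pretrained(base_fp16, new_model).merge_and_unload()
merged.save_pretrained("../models/sql-code-llama-merged")  # example output path
tokenizer.save_pretrained("../models/sql-code-llama-merged")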
requirements-debug.txt:

git+https://github.com/huggingface/transformers.git@main  # we need latest transformers for this
bitsandbytes
git+https://github.com/huggingface/peft.git@4c611f4
datasets==2.10.1
wandb
scipy
gpustat
pytz
tensorboardX
Hey, try these:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate==0.20.3 # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip install datasets==2.10.1
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb
!pip install scipy
I suspect they may work...
Thanks @samlhuillier!
The issue is that if you install everything using the requirements file (file-requirements-debug.txt, attached to this gist), the patched peft does not work. peft has to be installed outside the requirements file for it to really work as it should. Very weird...
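One thing worth checking after the install is which peft/transformers actually ended up in the environment, in case pip resolved the git pins differently via the requirements file, for example:
import peft, transformers, bitsandbytes
print("peft:", peft.__version__)
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)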
@samlhuillier
Sorry to keep bothering you about this, but I am still having issues.
I just tried running this code and using the versions:
and I still cannot get it to save the updated weights with the LoRA adapters. I did the check using the code:
Are you getting the same results?
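For reference, a minimal sketch of the kind of check I mean (the path is assumed to be the save directory from the script above): LoRA's lora_B matrices start out at zero, so if every lora_B tensor in the saved adapter_model.bin is still all zeros (or missing entirely), the adapter weights were never updated or never saved:
import torch
sd = torch.load("../models/sql-code-llama/adapter_model.bin", map_location="cpu")
lora_b = {k: v for k, v in sd.items() if "lora_B" in k}
print("lora_B tensors found:", len(lora_b))
print("any non-zero:", any(v.abs().sum().item() > 0 for v in lora_b.values()))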