Debugging a CodeLlama fine-tuning
# %%
from datetime import datetime
import os
import sys

import gpustat

gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset

dataset = load_dataset("b-mc2/sql-create-context", split="train")
# Split once (seeded for reproducibility) so the train and eval sets cannot overlap,
# which two independent calls to train_test_split could allow.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
# %%
base_model_name = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
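# %% [markdown]
# A quick memory sanity check that pairs with the gpustat printouts (illustrative):
# `get_memory_footprint` reports the model's parameter memory, which should come out
# to roughly 7 GB for a 7B model loaded in 8-bit.

# %%
print(f"model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")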
# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""

# a sample data point from the dataset, for reference:
# {'question': 'Name the comptroller for office of prohibition', 'context': 'CREATE TABLE table_22607062_1 (comptroller VARCHAR, ticket___office VARCHAR)', 'answer': 'SELECT comptroller FROM table_22607062_1 WHERE ticket___office = "Prohibition"'}

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

# %% [markdown]
# I get the output:
# ```
# SELECT * FROM table_name_12 WHERE class > 91.5 AND city_of_license = 'hyannis, nebraska'
# ```
# which is clearly wrong if the input is asking for just the class!

# %% [markdown]
# ### 4. Tokenization
# Set up some tokenization settings, like left padding, because it makes [training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa):

# %%
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
# %% [markdown]
# Set up the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

# %%
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )
    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result
# %% [markdown]
# Then convert each data_point into a prompt, using a template I found online that works quite well:

# %%
def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
{data_point["question"]}
### Context:
{data_point["context"]}
### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)
# %% [markdown]
# Reformat to prompt and tokenize each sample:

# %%
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
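# %% [markdown]
# A quick sanity check on one mapped sample (illustrative): decode it back and confirm
# the prompt, the answer, and the EOS token all survived tokenization.

# %%
sample = tokenized_train_dataset[0]
print(len(sample["input_ids"]), "tokens")
print(tokenizer.decode(sample["input_ids"]))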
# %% [markdown]
# ### 5. Setup LoRA

# %%
model.train()  # put model back into training mode
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
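# %% [markdown]
# Worth checking that LoRA is wired up as expected: only the adapter weights should be
# trainable, which should come out to well under 1% of the 7B base parameters.

# %%
model.print_trainable_parameters()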
# %% [markdown]
# To resume from a checkpoint, set resume_from_checkpoint to the path of the adapter_model.bin you want to resume from. This code will replace the LoRA adapter attached to the model:

# %%
resume_from_checkpoint = ""  # e.g. "./sql-code-llama/checkpoint-380/adapter_model.bin"
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")
# %%
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 GPU is available
    print("total devices: ", torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True
else:
    print("only 1 gpu available")
# %% [markdown]
# ### 6. Training arguments
# If you run out of GPU memory, lower per_device_train_batch_size.
# Because gradient_accumulation_steps is computed from it, the effective
# batch size stays the same, so the change shouldn't affect batch dynamics
# during the training run. All the other variables are standard stuff that
# I wouldn't recommend messing with:

# %%
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "../checkpoints"

training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    max_steps=400,
    learning_rate=3e-4,
    fp16=True,
    bf16=False,  # bf16 can only be True if fp16 is False; you cannot enable both
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",  # if val_set_size > 0 else "no"
    save_strategy="steps",
    eval_steps=20,
    save_steps=20,
    output_dir=output_dir,
    # save_total_limit=3,
    load_best_model_at_end=False,
    # ddp_find_unused_parameters=False if ddp else None,
    group_by_length=True,  # group sequences of roughly the same length together to speed up training
    report_to="tensorboard",  # if use_wandb else "none"
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # if use_wandb else None
    logging_dir="../logs/runs",
)
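# %% [markdown]
# A quick check on the numbers above (illustrative): the effective batch size per optimizer
# step is per_device_train_batch_size * gradient_accumulation_steps (times the number of GPUs),
# so it stays at 128 as long as gradient_accumulation_steps is recomputed as above.

# %%
print("effective batch size per optimizer step:",
      per_device_train_batch_size * gradient_accumulation_steps)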
# %%
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# %% [markdown]
# Then patch state_dict so that saving the model stores only the LoRA adapter weights,
# and (on PyTorch 2+) compile the model, which speeds up training without affecting accuracy:

# %%
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
# %%
gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)

trainer.train()

# Save the trained adapter
new_model = "../models/sql-code-llama"
trainer.model.save_pretrained(new_model)

gpus = gpustat.new_query()
for gpu in gpus:
    print(gpu)
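# %% [markdown]
# Optionally, save the tokenizer alongside the adapter so the output directory can be
# loaded on its own later (note it carries the padding/EOS settings set above):

# %%
tokenizer.save_pretrained(new_model)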
# %% [markdown]
# ### Load the final checkpoint
# Now for the moment of truth! Has our work paid off...?

# %%
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel

base_model_name = "codellama/CodeLlama-7b-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, new_model)
# model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# %% [markdown]
# To load a fine-tuned LoRA/QLoRA adapter, use PeftModel.from_pretrained.
# `output_dir` should be a directory containing an adapter_config.json and adapter_model.bin.
# The run took forever, so all I have is this checkpoint:
# output_dir = "./sql-code-llama/checkpoint-360"
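# %% [markdown]
# A minimal sketch of that checkpoint load (illustrative; the path is just the example above,
# substitute whichever checkpoint directory you actually have):

# %%
import os

checkpoint_dir = "./sql-code-llama/checkpoint-360"  # must contain adapter_config.json and adapter_model.bin
if os.path.exists(checkpoint_dir):
    model = PeftModel.from_pretrained(base_model, checkpoint_dir)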
# %% [markdown]
# Try the same prompt as before:

# %%
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?
### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)
### Response:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
# %% [markdown]
# And the model outputs:
# ```
# SELECT class FROM table_name_12 WHERE frequency_mhz > 91.5 AND city_of_license = "hyannis, nebraska"
# ```
# So it works! If you want to convert this adapter to a Llama.cpp model to run locally, follow my other [guide](https://ragntune.com/blog/A-guide-to-running-Llama-2-qlora-loras-on-Llama.cpp). If you have any questions, shoot me a message on [Elon Musk's website](https://twitter.com/samlhuillier_).
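# %% [markdown]
# For the Llama.cpp route, the usual first step is merging the adapter into the base weights
# and saving a plain Hugging Face model directory for the conversion script to read. A minimal
# sketch, assuming you reload the base model in fp16 (merging into the 8-bit model above is not
# supported), have enough memory for the full 7B model, and are happy with the example output path:

# %%
base_fp16 = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
merged = PeftModel.from_pretrained(base_fp16, new_model).merge_and_unload()
merged.save_pretrained("../models/sql-code-llama-merged")  # example output path
tokenizer.save_pretrained("../models/sql-code-llama-merged")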
requirements-debug.txt:

git+https://github.com/huggingface/transformers.git@main  # we need latest transformers for this
bitsandbytes
git+https://github.com/huggingface/peft.git@4c611f4
datasets==2.10.1
wandb
scipy
gpustat
pytz
tensorboardX
Hey, try these:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate==0.20.3 # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip install datasets==2.10.1
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb
!pip install scipy
I suspect they may work...
Thanks @samlhuillier!
The issue is that if you install everything using the requirements file (file-requirements-debug.txt, attached to this gist), the patched peft does not work. peft has to be installed outside the requirements file for it to really work as it should. Very weird...
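One thing worth checking after the install is which peft/transformers actually ended up in the environment, in case pip resolved the git pins differently via the requirements file, for example:
import peft, transformers, bitsandbytes
print("peft:", peft.__version__)
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)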
@samlhuillier
Sorry to keep bothering you about this, but I am still having issues.
I just tried running this code and using the versions:
and I still cannot get it to save the updated weights with the LoRA adapters. I did the check using the code:
Are you getting the same results?
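For reference, a minimal sketch of the kind of check I mean (the path is assumed to be the save directory from the script above): LoRA's lora_B matrices start out at zero, so if every lora_B tensor in the saved adapter_model.bin is still all zeros (or missing entirely), the adapter weights were never updated or never saved:
import torch
sd = torch.load("../models/sql-code-llama/adapter_model.bin", map_location="cpu")
lora_b = {k: v for k, v in sd.items() if "lora_B" in k}
print("lora_B tensors found:", len(lora_b))
print("any non-zero:", any(v.abs().sum().item() > 0 for v in lora_b.values()))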