Fine Tune LLAMA 2

Step 1: Install All the Required Packages

!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl
Step 2: Import All the Required Libraries
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
Step 3: Load & Train model
To fine-tune the Llama 2 model, we need the dataset in a specific prompt format, shown below.
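Here is a minimal sketch of that format (it assumes the single-turn <s>[INST] ... [/INST] ... </s> layout used by the guanaco-llama2 datasets linked below; the question and answer are only placeholders):

# One training row: a single "text" field holding the prompt and the answer,
# wrapped in the Llama 2 chat markers (placeholder content).
sample = {
    "text": "<s>[INST] What is a large language model? [/INST] "
            "A large language model is a neural network trained on huge amounts of text ... </s>"
}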
If we have a dataset in this prompt format, we can fine-tune the model however we want.
So the next step is to find a suitable dataset and see how to convert it into this format.
A suitable dataset I found: https://huggingface.co/datasets/timdettmers/openassistant-guanaco
Reformat Dataset following the Llama 2 template with 1k sample: https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k
Complete Reformat Dataset following the Llama 2 template: https://huggingface.co/datasets/mlabonne/guanaco-llama2
To know how this dataset was created, you can check this notebook: https://colab.research.google.com/drive/1Ad7a9zMmkxuXTOh1Z7-rNSICA4dybpM2?usp=sharing
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "Llama-2-7b-chat-finetune"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (a key hyperparameter to tune)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT (supervised fine-tuning) parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}
Step 4: Load everything and start the fine-tuning process
First of all, we want to load the dataset we defined. Here, our dataset is already preprocessed but, usually, this is where you would reformat the prompt, filter out bad text, combine multiple datasets, etc.
Then, we’re configuring bitsandbytes for 4-bit quantization.
Next, we’re loading the Llama 2 model in 4-bit precision on a GPU with the corresponding tokenizer.
Finally, we’re loading configurations for QLoRA, regular training parameters, and passing everything to the SFTTrainer. The training can finally start!
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()
# Save trained model
trainer.model.save_pretrained(new_model)
Step 5: Check the plots on TensorBoard, as follows
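A minimal way to open TensorBoard inside the notebook (assuming the default results/runs logging directory that the TrainingArguments above write to):

%load_ext tensorboard
%tensorboard --logdir results/runs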
Step 6: Use the text generation pipeline to ask questions like “What is a large language model?”
Note that I’m formatting the input to match the Llama 2 prompt template.
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)
# Run text generation pipeline with our new model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()
You can train a Llama 2 model on the entire dataset using mlabonne/guanaco-llama2.
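To do that, a one-line change is enough; this sketch simply swaps the dataset name before the load_dataset call above, with everything else left unchanged:

# Use the full reformatted dataset instead of the 1k-sample subset
dataset_name = "mlabonne/guanaco-llama2"
dataset = load_dataset(dataset_name, split="train")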
Step 7: Store New Llama2 Model (Llama-2-7b-chat-finetune)
How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
Step 8: Pushing Model to Hugging Face Hub
Our weights are merged and we reloaded the tokenizer. We can now push everything to the Hugging Face Hub to save our model.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!huggingface-cli login

model.push_to_hub("entbappy/Llama-2-7b-chat-finetune", check_pr=True)
tokenizer.push_to_hub("entbappy/Llama-2-7b-chat-finetune", check_pr=True)
You can now use this model for inference by loading it like any other Llama 2 model from the Hub.
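For example, here is a minimal sketch of reloading the pushed model for inference (it assumes the same Hub repo name used above; replace it with your own username/repo):

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Repo name pushed above; adjust to your own account if needed
repo_id = "entbappy/Llama-2-7b-chat-finetune"
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(repo_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=200)
print(pipe("<s>[INST] What is a large language model? [/INST]")[0]["generated_text"])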