
Commit ee385e5

jingxu10 and ZhaoqiongZ authored
bug fix in llm env activate scripts (#5396)
* bug fix in llm env activate scripts
* remove llm training and revert back example training
* update env activate script for bitsandbytes example
* remove llama7b/13b in run_accuracy scripts
* add README for bitsandbytes
* specific the client gpu validated

Co-authored-by: Zheng, Zhaoqiong <zhaoqiong.zheng@intel.com>
1 parent b5f20d5 · commit ee385e5

21 files changed: +92 −1118 lines

dependency_version.json

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@
     "commit": "v0.21.0"
   },
   "torch-ccl": {
-    "version": "2.5.0+xpu",
-    "commit": "v2.5.0+xpu"
+    "version": "2.6.0+xpu",
+    "commit": "v2.6.0+xpu"
   },
   "basekit": {
     "dpcpp-cpp-rt": {

examples/gpu/llm/README.md

Lines changed: 7 additions & 5 deletions
@@ -2,7 +2,7 @@

 Here you can find benchmarking scripts for large language models (LLM) text generation. These scripts:

-- Support Llama, GPT-J, Qwen, OPT, Bloom model families and some other models such as ChatGLMv3-6B, Baichuan2-13B and Phi3-mini.
+- Support Llama, GPT-J, Qwen, OPT, Bloom model families and some other models such as Baichuan2-13B and Phi3-mini.
 - Include both single instance and distributed (DeepSpeed) use cases for FP16 optimization.
 - Cover model generation inference with low precision cases for different models with best performance and accuracy (fp16 AMP and weight only quantization)

@@ -28,7 +28,7 @@ docker run -it --rm --privileged -v /dev/dri/by-path:/dev/dri/by-path ipex-llm:2
 cd llm

 # Activate environment variables
-source ./tools/env_activate.sh [inference|fine-tuning]
+source ./tools/env_activate.sh [inference|fine-tuning|bitsandbytes]
 ```

 ### Conda-based environment setup with prebuilt wheel files
@@ -54,7 +54,7 @@ cd examples/gpu/llm
 bash ./tools/env_setup.sh 0x07
 conda deactivate
 conda activate llm
-source ./tools/env_activate.sh [inference|fine-tuning]
+source ./tools/env_activate.sh [inference|fine-tuning|bitsandbytes]
 ```

 ### Docker-based environment setup with compilation from source
@@ -77,7 +77,7 @@ docker run -it --rm --privileged -v /dev/dri/by-path:/dev/dri/by-path ipex-llm:2
 cd llm

 # Activate environment variables
-source ./tools/env_activate.sh [inference|fine-tuning]
+source ./tools/env_activate.sh [inference|fine-tuning|bitsandbytes]
 ```

 ### Conda-based environment setup with compilation from source
@@ -106,7 +106,7 @@ bash ./tools/env_setup.sh 3 <ONEAPI_ROOT_DIR> <AOT>

 conda deactivate
 conda activate llm
-source ./tools/env_activate.sh [inference|fine-tuning]
+source ./tools/env_activate.sh [inference|fine-tuning|bitsandbytes]
 ```

 where <br />
@@ -122,3 +122,5 @@ Inference and fine-tuning are supported in individual directories.
 For inference example scripts, visit the [inference](./inference/) directory.

 For fine-tuning example scripts, visit the [fine-tuning](./fine-tuning/) directory.
+
+For fine-tuning with quantized model, visit the [bitsandbytes](./bitsandbytes/) directory.
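
For orientation, the bracketed argument in the lines above selects which environment the activate script prepares. A minimal usage sketch of the newly added option, assuming one of the setup flows above has already been completed:

```bash
# Sketch: activate the environment for the new bitsandbytes example,
# then change into the directory the README now links to.
cd examples/gpu/llm          # or `cd llm` inside the docker container
source ./tools/env_activate.sh bitsandbytes
cd bitsandbytes
```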
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+# LLM Quantized Model Lora-Finetuning Overview
+
+Here you can find the quantized model lora-finetuning scripts for Llama3.
+
+
+
+## Supported Platforms
+
+\* Intel® Data Center GPU Max Series (1550/1100) : support Llama3.1-8B.<br />
+\* Intel® Core™ Ultra Processors with Intel® Arc™ B Series Graphics : support Llama3.2-3B.<br />
+
+## Run Models
+
+**Note**: During the execution, you may need to log in your Hugging Face account to access model files. Refer to [HuggingFace Login](https://huggingface.co/docs/huggingface_hub/quick-start#login)
+
+```
+huggingface-cli login --token <your_token_here>
+```
+
+### Environment Set Up
+Set up environment by following [LLM Environment Set Up](../README.md).
+
+
+### Run Qlora finetuning with quantized model using Bash Script
+
+The related code and run script are prepared in the folder. Run all with the one-click bash script `run_qlora_pvc.sh` or `run_qlora_client.sh`:
+
+
+If you are running on a Data Center Max Series GPU:
+
+```
+bash run_qlora_pvc.sh
+```
+
+If you are running on a Intel Client GPU:
+
+```
+bash run_qlora_client.sh
+```
+
+
+### Run inference with quantized model
+
+```
+# set quant_type and max_new_tokens according to your needs
+python bnb_inf_xpu.py --model_name ${model} --quant_type nf4 --max_new_tokens 64 --device xpu
+```
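
The new README leaves `${model}` to the reader. A hedged example invocation follows; the Hugging Face model ids are assumptions inferred from the "Supported Platforms" list above, not part of the commit:

```bash
# Assumed model ids, matching the platform list in the README above:
#   Data Center Max Series GPUs -> meta-llama/Llama-3.1-8B
#   Arc B Series client GPU     -> meta-llama/Llama-3.2-3B
export model="meta-llama/Llama-3.2-3B"
python bnb_inf_xpu.py --model_name ${model} --quant_type nf4 --max_new_tokens 64 --device xpu
```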
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+transformers==v4.49.0
+tf-keras
+accelerate==1.1.1
+peft==0.14.0
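
These pins can be installed into the activated environment before running the example. A minimal sketch; the file's path is not shown in this diff view, so the working directory below is an assumption:

```bash
# Run from the bitsandbytes example directory (assumed location of the new requirements file).
pip install -r requirements.txt
```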

examples/gpu/llm/fine-tuning/Llama3/README.md

Lines changed: 0 additions & 28 deletions
@@ -18,34 +18,6 @@ huggingface-cli login --token <your_token_here>
 wandb login
 ```

-### Fine-tuning on single card
-
-**Note**:
-Full-finetuning on single card will cause OOM.
-
-Example: Llama 3 8B LoRA fine-tuning on single card. The default dataset `financial_phrasebank` is loaded in `llama3_ft.py`.
-
-```bash
-export TORCH_LLM_ALLREDUCE=1
-
-export model="meta-llama/Meta-Llama-3-8B"
-
-python llama3_ft.py \
-    --model_name_or_path ${model} \
-    --use_flashattn True \
-    --custom_mp True \
-    --use_peft True \
-    --max_seq_length 128 \
-    --output_dir="output" \
-    --evaluation_strategy="epoch" \
-    --learning_rate=1e-3 \
-    --auto_find_batch_size=True \
-    --num_train_epochs=1 \
-    --save_steps=500 \
-    --logging_steps=1 \
-    --save_total_limit=8
-```
-
 ### Fine-tuning on multi-GPU

 **Note**:

examples/gpu/llm/fine-tuning/Phi3/README.md

Lines changed: 0 additions & 78 deletions
@@ -43,29 +43,6 @@ python phi3_ft.py \

 #### Fine-tuning on single card

-Example: Phi-3 Mini 4k full fine-tuning on single card. The default dataset `financial_phrasebank` is loaded in `phi3_ft.py`.
-
-```bash
-export TORCH_LLM_ALLREDUCE=1
-
-export model="microsoft/Phi-3-mini-4k-instruct"
-
-python phi3_ft.py \
-    --model_name_or_path ${model} \
-    --use_flashattn False \
-    --custom_mp True \
-    --max_seq_length 128 \
-    --output_dir="output" \
-    --evaluation_strategy="epoch" \
-    --learning_rate=1e-3 \
-    --auto_find_batch_size=True \
-    --num_train_epochs=1 \
-    --save_steps=500 \
-    --logging_steps=1 \
-    --save_total_limit=8
-```
-
-
 Example: Phi-3 Mini 4k LoRA fine-tuning on single card. The default dataset `financial_phrasebank` is loaded in `phi3_ft.py`.

 ```bash
@@ -95,61 +72,6 @@ python phi3_ft.py \
 The default `fsdp_config.yml` is set with 1 machine with 4 cards 8 tiles, If you are using different setting, please change the `num_processes: 8` accordingly. For example, to use 8 cards 16 tiles, the line in `fsdp_config.yml` should be changed to `num_processes: 16`.


-Example: Phi-3 Mini 4k full fine-tuning.
-
-
-```bash
-export CCL_PROCESS_LAUNCHER=none
-export TORCH_LLM_ALLREDUCE=1
-
-export model="microsoft/Phi-3-mini-4k-instruct"
-
-accelerate launch --config_file "fsdp_config.yaml" phi3_ft.py \
-    --model_name_or_path ${model} \
-    --use_flashattn False \
-    --bf16 True \
-    --max_seq_length 128 \
-    --output_dir="output" \
-    --evaluation_strategy="epoch" \
-    --learning_rate=1e-3 \
-    --gradient_accumulation_steps=1 \
-    --per_device_train_batch_size=8 \
-    --per_device_eval_batch_size=8 \
-    --num_train_epochs=1 \
-    --save_steps=500 \
-    --logging_steps=1 \
-    --save_total_limit=8 2>&1 | tee phi3-mini_ft_fsdp_converge.log
-```
-
-
-Example: Phi-3 Mini 4k LoRA fine-tuning.
-
-
-```bash
-export CCL_PROCESS_LAUNCHER=none
-export TORCH_LLM_ALLREDUCE=1
-
-export model="microsoft/Phi-3-mini-4k-instruct"
-
-accelerate launch --config_file "fsdp_config.yaml" phi3_ft.py \
-    --model_name_or_path ${model} \
-    --use_flashattn False \
-    --bf16 True \
-    --use_peft True \
-    --max_seq_length 128 \
-    --output_dir="output" \
-    --evaluation_strategy="epoch" \
-    --learning_rate=1e-3 \
-    --gradient_accumulation_steps=1 \
-    --per_device_train_batch_size=8 \
-    --per_device_eval_batch_size=8 \
-    --num_train_epochs=1 \
-    --save_steps=500 \
-    --logging_steps=1 \
-    --save_total_limit=8 2>&1 | tee phi3-mini_ft_fsdp_converge.log
-```
-
-
 Example: Phi3-Mini 4k LoRA fine-tuning.


examples/gpu/llm/fine-tuning/Qwen/run_qwen2_fsdp.sh

Lines changed: 0 additions & 26 deletions
@@ -55,31 +55,6 @@ Run_fsdp_dummy_dataset_lora_sequence_length_256() {
     #--optim "adamw_torch_fused"
 }

-Run_fsdp_dummy_dataset_sequence_length_2048() {
-    accelerate launch --config_file "fsdp_config.yaml" qwen2_ft.py \
-        --model_name_or_path $model \
-        --data_path $data \
-        --bf16 True \
-        --output_dir output_qwen \
-        --num_train_epochs 1 \
-        --per_device_train_batch_size 1 \
-        --per_device_eval_batch_size 1 \
-        --gradient_accumulation_steps 1 \
-        --evaluation_strategy "no" \
-        --save_strategy "steps" \
-        --save_steps 2000 \
-        --save_total_limit 10 \
-        --learning_rate 3e-4 \
-        --weight_decay 0.01 \
-        --adam_beta2 0.95 \
-        --warmup_ratio 0.01 \
-        --lr_scheduler_type "cosine" \
-        --logging_steps 1 \
-        --report_to "none" \
-        --model_max_length 2048
-    #--optim "adamw_torch_fused"
-}
-
 Run_fsdp_dummy_dataset_lora_sequence_length_2048() {
     accelerate launch --config_file "fsdp_config.yaml" qwen2_ft.py \
         --model_name_or_path $model \
@@ -108,5 +83,4 @@ Run_fsdp_dummy_dataset_lora_sequence_length_2048() {

 Run_fsdp_dummy_dataset_sequence_length_256
 #Run_fsdp_dummy_dataset_lora_sequence_length_256
-#Run_fsdp_dummy_dataset_sequence_length_2048
 #Run_fsdp_dummy_dataset_lora_sequence_length_2048

examples/gpu/llm/fine-tuning/README.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ Here we mainly focus on the memory-constrained fine-tuning on single GPU, and pr

 ### Profile the finetuning

-For profiling the process of finetuning, Apply the `patches/transformers.patch` to transformers v4.41.2 and set the following VARIABLE before finetuning.
+For profiling the process of finetuning, Apply the `patches/transformers.patch` to transformers v4.44.2 and set the following VARIABLE before finetuning.

 ```bash
 export PROFILE=1
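
One plausible way to apply the patch against the newly required transformers v4.44.2 is sketched below; this workflow and the relative patch path are assumptions, not steps documented in this commit:

```bash
# Sketch only: check out transformers v4.44.2 from source and apply the repo's patch.
git clone https://github.com/huggingface/transformers.git
cd transformers
git checkout v4.44.2
git apply ../patches/transformers.patch   # patch path assumed relative to the fine-tuning directory
pip install -e .
export PROFILE=1                          # enable profiling as described above
```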

examples/gpu/llm/inference/run_accuracy.sh

Lines changed: 0 additions & 25 deletions
@@ -12,29 +12,6 @@ Accuracy_lmeval_gpt-j-6b() {
     mv log_acc ${dir}
 }

-
-## Llama-7b
-Accuracy_lmeval_llama-7b() {
-    model=decapoda-research/llama-7b-hf
-    sub_model_name=llama-7b
-    dir=accuracy/${model}/task${task}
-    mkdir -p ${dir}
-    LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1 | tee log_acc
-    mv log_acc ${dir}
-}
-
-
-## Llama-13b
-Accuracy_lmeval_llama-13b() {
-    model=decapoda-research/llama-13b-hf
-    sub_model_name=llama-13b
-    dir=accuracy/${model}/task${task}
-    mkdir -p ${dir}
-    LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1 | tee log_acc
-    mv log_acc ${dir}
-}
-
-
 ## Llama2-7b
 Accuracy_lmeval_llama2-7b() {
     model=meta-llama/Llama-2-7b-hf
@@ -84,8 +61,6 @@ main() {
     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2

     Accuracy_lmeval_gpt-j-6b
-    Accuracy_lmeval_llama-7b
-    Accuracy_lmeval_llama-13b
     Accuracy_lmeval_llama2-7b
     Accuracy_lmeval_llama2-13b
     Accuracy_lmeval_opt-6.7b
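
Each per-model helper in this script follows the same shape as the removed Llama-7b/13b ones. For reference, the surviving Llama2-7b entry expands roughly as below; the `sub_model_name` value and the `${task}` variable are assumptions inferred from the pattern, since the diff shows only part of that function:

```bash
# Hedged illustration of the remaining per-model pattern (Llama2-7b).
model=meta-llama/Llama-2-7b-hf
sub_model_name=llama2-7b          # assumed value; only the model id appears in the diff context
dir=accuracy/${model}/task${task} # ${task} is set by the surrounding script, not shown here
mkdir -p ${dir}
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} \
    --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1 | tee log_acc
mv log_acc ${dir}
```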

examples/gpu/llm/inference/run_accuracy_ds.sh

Lines changed: 0 additions & 24 deletions
@@ -14,28 +14,6 @@ Accuracy_lmeval_gpt-j-6b() {
 }


-## Llama-7b
-Accuracy_lmeval_llama-7b() {
-    model=decapoda-research/llama-7b-hf
-    sub_model_name=llama-7b
-    dir=accuracy/${model}/task${task}_ranknum2
-    mkdir -p ${dir}
-    LLM_ACC_TEST=1 mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1 | tee log_acc_ds
-    mv log_acc_ds ${dir}
-}
-
-
-## Llama-13b
-Accuracy_lmeval_llama-13b() {
-    model=decapoda-research/llama-13b-hf
-    sub_model_name=llama-13b
-    dir=accuracy/${model}/task${task}_ranknum2
-    mkdir -p ${dir}
-    LLM_ACC_TEST=1 mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1 | tee log_acc_ds
-    mv log_acc_ds ${dir}
-}
-
-
 ## Llama2-7b
 Accuracy_lmeval_llama2-7b() {
     model=meta-llama/Llama-2-7b-hf
@@ -140,8 +118,6 @@ main() {
     export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2

     Accuracy_lmeval_gpt-j-6b
-    Accuracy_lmeval_llama-7b
-    Accuracy_lmeval_llama-13b
     Accuracy_lmeval_llama2-7b
     Accuracy_lmeval_llama2-13b
     Accuracy_lmeval_llama2-34b
