気温が下がってきたので、久しぶりに生成AI(とLinux)に触ったところ、sd-scriptsの環境整備に梃子摺ったのでメモ
6.2.4にアップデートしましたが問題ありませんでした
same for ROCm ver. 6.2.4.
Install sd-scripts
ROCm Install:
https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/index.html
https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.2/index.html
environment variables
export HSA_OVERRIDE_GFX_VERSION=10.3.0
export PYTORCH_ROCM_ARCH=gfx1030
sd-scripts installation
git clone https://github.com/kohya-ss/sd-scripts.git
cd sd-scripts
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip wheel
pip install -r requirements.txt
pip uninstall bitsandbytes torch triton -y
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
# pip install came-pytorch dadaptation lycoris-lora schedulefree prodigy-plus-schedule-free wandb
accelerate config
This machine, No distributed training, NO, NO, NO, all, fp16
bitsandbytes installation
v0.45.0 https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
deactivate # deactivate sd-scripts/venv
git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git
cd bitsandbytes
python3 -m venv venv
source venv/bin/activate
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=hip -S .
make
deactivate # deactivate sd-scripts/bitsandbytes/venv
source ../venv/bin/activate # activate sd-scripts/venv
pip install .
cd .. # return to sd-scripts directory
v0.43.3
deactivate # deactivate sd-scripts/venv
git clone https://github.com/ROCm/bitsandbytes.git
cd bitsandbytes
git checkout enable_6.2_packaging
python3 -m venv venv
source venv/bin/activate
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=hip -S .
make
deactivate # deactivate sd-scripts/bitsandbytes/venv
source ../venv/bin/activate # activate sd-scripts/venv
pip install .
cd .. # return to sd-scripts directory
train SDXL LoRA for 8GB VRAM sample
accelerate launch sdxl_train_network.py \
--network_module=networks.lora \
--max_data_loader_n_workers=1 --persistent_data_loader_workers \
--save_model_as=safetensors \
--mixed_precision=fp16 --save_precision=fp16 \
--seed=42 \
--train_data_dir=train --output_dir=output --logging_dir=log \
--max_train_epochs=5 --train_batch_size=1 \
--resolution=1024 \
--fp8_base \
--network_dim=8 --network_alpha=4 \
--learning_rate=1e-4 --optimizer_type=AdamW8bit \
--lr_scheduler=cosine_with_restarts --lr_scheduler_num_cycles=2 \
--network_args "loraplus_lr_ratio=4" \
--scale_weight_norms=2 \
--noise_offset=0.1 \
--network_dropout=0.1 \
--cache_latents --cache_latents_to_disk \
--caption_extension=".txt" \
--keep_tokens_separator "|||" --shuffle_caption --max_token_length=225 \
--gradient_checkpointing --sdpa \
--no_half_vae \
--vae="models/VAE/sdxl_vae.safetensors" \
--pretrained_model_name_or_path="models/Stable-diffusion/Illustrious-XL-v0.1.safetensors" \
--output_name=jefuty_illustrious01_00
31images x 10 x 5epochs = 1550steps
1550/1550 [1:58:33<00:00, 4.59s/it, Average key norm=0.135, Keys Scaled=0, avr_loss=0.103]
If you can afford the VRAM capacity, you can add:
--max_bucket_reso=1536 \
# If NaN occurs during LoRA training, try switching from --sdpa to --mem_eff_attn
If OOM, you can use block weights / train unet only
--network_args \
"down_lr_weight=0,0,0,0,0,0,0,0,1" \
"mid_lr_weight=1,1,1" \
"up_lr_weight=1,1,1,1,0,0,0,0,0" \
--network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk \
Environment
OS
os = Ubuntu 22.04.5 LTS #24.04.1でも動くと思うがpython3.10が必要
kernel = ubuntu 6.5.0-18-generic
# linux-image-6.8.0が入っているとROCm6.1.2のdkmsコンパイルに失敗したため削除したまま
# ROCm6.2.2はkernel6.8.0でも問題ない(はず)
ROCm
APT-Sources: https://repo.radeon.com/rocm/apt/6.2.2 jammy/main amd64 Packages
GPU
Name: gfx1030 # 環境変数で書き換えなければ gfx1032
Uuid: GPU-XX
Marketing Name: AMD Radeon RX 6650 XT
python version
$ python -V
Python 3.10.12
sd-scripts version
$ git rev-parse HEAD
2a61fc07846dc919ea64b568f7e18c010e5c8e06
$ git rev-parse --abbrev-ref HEAD
sd3
venv pip list
Package Version Editable project location
------------------------- -------------- ---------------------------
absl-py 2.1.0
accelerate 0.33.0
aiohappyeyeballs 2.4.3
aiohttp 3.10.10
aiosignal 1.3.1
altair 4.2.2
async-timeout 4.0.3
attrs 24.2.0
bitsandbytes 0.45.0.dev0-7e6f865
came-pytorch 0.1.3
certifi 2024.8.30
charset-normalizer 3.4.0
click 8.1.7
dadaptation 3.2
diffusers 0.25.0
docker-pycreds 0.4.0
easygui 0.98.3
einops 0.7.0
entrypoints 0.4
filelock 3.16.1
frozenlist 1.5.0
fsspec 2024.10.0
ftfy 6.1.1
gitdb 4.0.11
GitPython 3.1.43
grpcio 1.67.1
huggingface-hub 0.24.5
idna 3.10
imagesize 1.4.1
importlib_metadata 8.5.0
Jinja2 3.1.4
jsonschema 4.23.0
jsonschema-specifications 2024.10.1
library 0.0.0 /home/USER/SD/sd-scripts
lightning-utilities 0.11.8
lion-pytorch 0.0.6
Markdown 3.7
markdown-it-py 3.0.0
MarkupSafe 3.0.2
mdurl 0.1.2
mpmath 1.3.0
multidict 6.1.0
networkx 3.4.2
numpy 1.26.4
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-nccl-cu12 2.21.5
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
opencv-python 4.8.1.78
packaging 24.1
pandas 2.2.3
pillow 11.0.0
pip 24.3.1
platformdirs 4.3.6
prodigyopt 1.0
propcache 0.2.0
protobuf 5.28.3
psutil 6.1.0
Pygments 2.18.0
python-dateutil 2.9.0.post0
pytorch-lightning 1.9.0
pytorch-triton-rocm 3.1.0
pytz 2024.2
PyYAML 6.0.2
referencing 0.35.1
regex 2024.9.11
requests 2.32.3
rich 13.7.0
rpds-py 0.20.1
safetensors 0.4.2
schedulefree 1.3
scipy 1.14.1
sentry-sdk 2.18.0
setproctitle 1.3.3
setuptools 59.6.0
six 1.16.0
smmap 5.0.1
sympy 1.13.1
tensorboard 2.18.0
tensorboard-data-server 0.7.2
tokenizers 0.19.1
toml 0.10.2
toolz 1.0.0
torch 2.5.1+rocm6.2
torchaudio 2.5.1+rocm6.2
torchmetrics 1.5.1
torchvision 0.20.1+rocm6.2
tqdm 4.66.6
transformers 4.44.0
typing_extensions 4.12.2
tzdata 2024.2
urllib3 2.2.3
voluptuous 0.13.1
wandb 0.18.5
wcwidth 0.2.13
Werkzeug 3.1.2
wheel 0.44.0
yarl 1.17.1
zipp 3.20.2
※memo
changes:
Jan-12-2025
accelerate 0.30.0 -> 0.33.0
bitsandbytes 0.43.3 -> 0.45.0
Dec-01-2024
accelerate 0.25.0 -> 0.30.0
huggingface-hub 0.20.1 -> 0.24.5
tokenizers 0.15.2 -> 0.19.1
sd-scripts branch main -> sd3