
Merge pull request #3533 from vladmandic/dev
merge dev to master
vladmandic authored Oct 29, 2024
2 parents 06dd9e5 + 5d369d2 commit 99623da
Showing 54 changed files with 1,379 additions and 881 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -44,6 +44,9 @@ tunableop_results*.csv
!webui.sh
!package.json

# dynamically generated
/repositories/ip-instruct/

# all dynamic stuff
/extensions/**/*
/outputs/**/*
@@ -59,7 +62,6 @@ tunableop_results*.csv
.vscode/
.idea/
/localizations

.*/

# force included
69 changes: 61 additions & 8 deletions CHANGELOG.md
@@ -1,16 +1,69 @@
# Change Log for SD.Next

## Update for 2024-10-24
## Update for 2024-10-29

Improvements:
- SD3 loader enhancements
### Highlights for 2024-10-29

- Support for **all SD3.x variants**
*SD3.0-Medium, SD3.5-Medium, SD3.5-Large, SD3.5-Large-Turbo*
- Allow on-the-fly quantization using `bitsandbytes` during model load
Load any variant of SD3.x or FLUX.1 and apply quantization during load, without the need for pre-quantized models (see the sketch below)
- Allow for custom model URL in standard model selector
Can be used to specify any model from *HuggingFace* or *CivitAI*
- Full support for `torch==2.5.1`
- New wiki articles: [Gated Access](https://github.com/vladmandic/automatic/wiki/Gated), [Quantization](https://github.com/vladmandic/automatic/wiki/Quantization), [Offloading](https://github.com/vladmandic/automatic/wiki/Offload)
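
The SD3.5 and FLUX.1 repositories on HuggingFace are gated, as the new Gated Access article covers: downloading them requires an authenticated account. A minimal sketch of token-based login via `huggingface_hub` (the token value is a placeholder):

```python
# sketch: authenticate with huggingface before downloading a gated model
from huggingface_hub import login

login(token="hf_your_token_here")  # placeholder; create a real token in your HF account settings
```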

Plus tons of smaller improvements and cumulative fixes reported since the last release

[README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867)
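
As a rough illustration of what on-the-fly quantization during load amounts to, here is a minimal sketch using the `diffusers` quantization API, assuming a recent `diffusers` with `bitsandbytes` installed; the model id and 4-bit settings are illustrative, and SD.Next's internal code path may differ:

```python
# sketch: load the SD3.5 transformer with on-the-fly 4-bit bitsandbytes quantization
import torch
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel, StableDiffusion3Pipeline

quant = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
transformer = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    subfolder="transformer",
    quantization_config=quant,  # weights are quantized as they are loaded
    torch_dtype=torch.bfloat16,
)
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
```

No pre-quantized checkpoint is involved, which is the point of the feature.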

### Details for 2024-10-29

- model selector:
- change in behavior:
- when typing, the model auto-loads as soon as exactly one match is found
- allows entering models that are not on the list, which triggers a huggingface search (see the sketch after this list)
e.g. `stabilityai/stable-diffusion-xl-base-1.0`
partial search hits are displayed in the log
if an exact model is found, it is auto-downloaded and loaded
- allows entering a civitai direct download link, which triggers the model download
e.g. `https://civitai.com/api/download/models/72396?type=Model&format=SafeTensor&size=full&fp=fp16`
- auto-search-and-download can be disabled in *settings -> models -> auto-download*
this also disables reference models, since they are auto-downloaded on first use
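
A rough sketch of an equivalent hub lookup using `huggingface_hub` (for illustration only; not SD.Next's actual code path):

```python
# sketch: search the huggingface hub roughly the way the model selector does
from huggingface_hub import HfApi

hits = HfApi().list_models(search="stable-diffusion-xl-base", limit=10)
for m in hits:
    print(m.id)  # partial hits are logged; an exact match would trigger the download
```
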
- sd3 enhancements:
- allow on-the-fly bnb quantization during load
- report when loading incomplete model
- handle missing model components
- handle missing model components during load
- handle component preloading
- OpenVINO: add accuracy option
- ZLUDA: guess GPU arch

Fixes:
- native lora handler
- support for all sd35 variants: *medium/large/large-turbo*
- gguf transformer loader (prototype)
- flux.1 enhancements:
- allow on-the-fly bnb quantization during load
- samplers:
- support for original k-diffusion samplers
select via *scripts -> k-diffusion -> sampler*
- ipadapter:
- list available adapters based on loaded model type
- add adapter `ostris consistency` for sd15/sdxl
- detailer:
- add `[prompt]` to refine/detailer prompts as a placeholder referencing the original prompt
e.g. a detailer prompt of `[prompt], detailed face` resolves to the original prompt followed by `, detailed face`
- torch
- use `torch==2.5.1` by default on supported platforms
- CUDA set device memory limit
in *settings -> compute settings -> torch memory limit*
default=0 means no limit; if set, torch limits memory usage to the specified fraction of total GPU memory
*note*: this is not a hard limit; torch will try to stay under this value (see the sketch after this list)
- compute backends:
- OpenVINO: add accuracy option
- ZLUDA: guess GPU arch
- major model load refactor
- wiki: new articles
- [Gated Access Wiki](https://github.com/vladmandic/automatic/wiki/Gated)
- [Quantization Wiki](https://github.com/vladmandic/automatic/wiki/Quantization)
- [Offloading Wiki](https://github.com/vladmandic/automatic/wiki/Offload)
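
The torch memory limit maps onto torch's per-process memory fraction; a minimal sketch of the underlying call (the fraction value and device index are illustrative):

```python
# sketch: cap the current process at ~80% of GPU memory on device 0
import torch

if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.8, 0)  # the caching allocator enforces the cap for its own allocations
```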

fixes:
- fix send-to-control
- fix k-diffusion
- fix sd3 img2img and hires
2 changes: 1 addition & 1 deletion extensions-builtin/Lora/network_overrides.py
@@ -26,7 +26,7 @@

force_models = [ # forced always
'sc',
'sd3',
# 'sd3',
'kandinsky',
'hunyuandit',
'auraflow',
2 changes: 2 additions & 0 deletions extensions-builtin/Lora/networks.py
@@ -127,6 +127,8 @@ def load_diffusers(name, network_on_disk, lora_scale=shared.opts.extra_networks_


def load_network(name, network_on_disk) -> network.Network:
if not shared.sd_loaded:
return
t0 = time.time()
cached = lora_cache.get(name, None)
if debug:
12 changes: 10 additions & 2 deletions html/reference.json
@@ -119,19 +119,27 @@
"preview": "stabilityai--stable-diffusion-3.jpg",
"extras": "sampler: Default, cfg_scale: 7.0"
},
"StabilityAI Stable Diffusion 3.5 Medium": {
"path": "stabilityai/stable-diffusion-3.5-medium",
"skip": true,
"variant": "fp16",
"desc": "Stable Diffusion 3.5 Medium is a Multimodal Diffusion Transformer with improvements (MMDiT-X) text-to-image model that features improved performance in image quality, typography, complex prompt understanding, and resource-efficiency.",
"preview": "stabilityai--stable-diffusion-3_5.jpg",
"extras": "sampler: Default, cfg_scale: 7.0"
},
"StabilityAI Stable Diffusion 3.5 Large": {
"path": "stabilityai/stable-diffusion-3.5-large",
"skip": true,
"variant": "fp16",
"desc": "Stable Diffusion 3 Medium is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that features greatly improved performance in image quality, typography, complex prompt understanding, and resource-efficiency",
"desc": "Stable Diffusion 3.5 Large is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that features improved performance in image quality, typography, complex prompt understanding, and resource-efficiency.",
"preview": "stabilityai--stable-diffusion-3_5.jpg",
"extras": "sampler: Default, cfg_scale: 7.0"
},
"StabilityAI Stable Diffusion 3.5 Turbo": {
"path": "stabilityai/stable-diffusion-3.5-large-turbo",
"skip": true,
"variant": "fp16",
"desc": "Stable Diffusion 3 Medium is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that features greatly improved performance in image quality, typography, complex prompt understanding, and resource-efficiency",
"desc": "Stable Diffusion 3.5 Large Turbo is a Multimodal Diffusion Transformer (MMDiT) text-to-image model with Adversarial Diffusion Distillation (ADD) that features improved performance in image quality, typography, complex prompt understanding, and resource-efficiency, with a focus on fewer inference steps.",
"preview": "stabilityai--stable-diffusion-3_5.jpg",
"extras": "sampler: Default, cfg_scale: 7.0"
},
25 changes: 15 additions & 10 deletions installer.py
@@ -227,9 +227,9 @@ def installed(package, friendly: str = None, reload = False, quiet = False):
exact = pkg_version == p[1]
if not exact and not quiet:
if args.experimental:
log.warning(f"Package: {p[0]} {pkg_version} required {p[1]} allowing experimental")
log.warning(f"Package: {p[0]} installed={pkg_version} required={p[1]} allowing experimental")
else:
log.warning(f"Package: {p[0]} {pkg_version} required {p[1]} version mismatch")
log.warning(f"Package: {p[0]} installed={pkg_version} required={p[1]} version mismatch")
ok = ok and (exact or args.experimental)
else:
if not quiet:
@@ -254,11 +254,12 @@ def uninstall(package, quiet = False):
@lru_cache()
def pip(arg: str, ignore: bool = False, quiet: bool = False, uv = True):
originalArg = arg
uv = uv and args.uv
pipCmd = "uv pip" if uv else "pip"
arg = arg.replace('>=', '==')
package = arg.replace("install", "").replace("--upgrade", "").replace("--no-deps", "").replace("--force", "").replace(" ", " ").strip()
uv = uv and args.uv and not package.startswith('git+')
pipCmd = "uv pip" if uv else "pip"
if not quiet and '-r ' not in arg:
log.info(f'Install: package="{arg.replace("install", "").replace("--upgrade", "").replace("--no-deps", "").replace("--force", "").replace(" ", " ").strip()}" mode={"uv" if uv else "pip"}')
log.info(f'Install: package="{package}" mode={"uv" if uv else "pip"}')
env_args = os.environ.get("PIP_EXTRA_ARGS", "")
all_args = f'{pip_log}{arg} {env_args}'.strip()
if not quiet:
@@ -454,7 +455,7 @@ def check_python(supported_minors=[9, 10, 11, 12], reason=None):

# check diffusers version
def check_diffusers():
sha = 'e45c25d03aeb0a967d8aaa0f6a79f280f6838e1f'
sha = '0d1d267b12e47b40b0e8f265339c76e0f45f8c49'
pkg = pkg_resources.working_set.by_key.get('diffusers', None)
minor = int(pkg.version.split('.')[1] if pkg is not None else 0)
cur = opts.get('diffusers_version', '') if minor > 0 else ''
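
Review note: `check_diffusers` pins `diffusers` to an exact git sha, so bumping the sha forces a reinstall from source. In effect it is equivalent to the following, expressed with this file's own `pip()` helper (a sketch; the exact arguments SD.Next passes may differ):

```python
# sketch: what bumping the diffusers pin amounts to; note that git+ urls now bypass uv per the change above
sha = '0d1d267b12e47b40b0e8f265339c76e0f45f8c49'
pip(f'install --upgrade git+https://github.com/huggingface/diffusers@{sha}', ignore=False, quiet=True)
```
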
@@ -489,7 +490,7 @@ def install_cuda():
log.info('CUDA: nVidia toolkit detected')
install('onnxruntime-gpu', 'onnxruntime-gpu', ignore=True, quiet=True)
# return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124')
return os.environ.get('TORCH_COMMAND', 'torch==2.4.1+cu124 torchvision==0.19.1+cu124 --index-url https://download.pytorch.org/whl/cu124')
return os.environ.get('TORCH_COMMAND', 'torch==2.5.1+cu124 torchvision==0.20.1+cu124 --index-url https://download.pytorch.org/whl/cu124')


def install_rocm_zluda():
@@ -549,6 +550,7 @@ def install_rocm_zluda():
log.warning("ZLUDA support: experimental")
error = None
from modules import zluda_installer
zluda_installer.set_default_agent(device)
try:
if args.reinstall_zluda:
zluda_installer.uninstall()
@@ -570,8 +572,10 @@ def install_rocm_zluda():
log.info('Using CPU-only torch')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
else:
if rocm.version is None or float(rocm.version) >= 6.1: # assume the latest if version check fails
#torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1')
if rocm.version is None or float(rocm.version) > 6.1: # assume the latest if version check fails
# torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.5.1+rocm6.2 torchvision==0.20.1+rocm6.2 --index-url https://download.pytorch.org/whl/rocm6.2')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.1 torchvision==0.19.1+rocm6.1 --index-url https://download.pytorch.org/whl/rocm6.1')
elif rocm.version == "6.1": # lock to 2.4.1, older rocm (5.7) uses torch 2.3
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.1 torchvision==0.19.1+rocm6.1 --index-url https://download.pytorch.org/whl/rocm6.1')
elif rocm.version == "6.0": # lock to 2.4.1, older rocm (5.7) uses torch 2.3
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.0 torchvision==0.19.1+rocm6.0 --index-url https://download.pytorch.org/whl/rocm6.0')
@@ -730,7 +734,7 @@ def check_torch():
else:
if args.use_zluda:
log.warning("ZLUDA failed to initialize: no HIP SDK found")
log.info('Using CPU-only Torch')
log.warning('Torch: CPU-only version installed')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
if 'torch' in torch_command and not args.version:
install(torch_command, 'torch torchvision', quiet=True)
@@ -817,6 +821,7 @@ def install_packages():
log.info('Verifying packages')
clip_package = os.environ.get('CLIP_PACKAGE', "git+https://github.com/openai/CLIP.git")
install(clip_package, 'clip', quiet=True)
install('open-clip-torch', no_deps=True, quiet=True)
# tensorflow_package = os.environ.get('TENSORFLOW_PACKAGE', 'tensorflow==2.13.0')
# tensorflow_package = os.environ.get('TENSORFLOW_PACKAGE', None)
# if tensorflow_package is not None:
2 changes: 1 addition & 1 deletion javascript/sdnext.css
@@ -38,7 +38,7 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; }
.gradio-button.secondary-down, .gradio-button.secondary-down:hover { box-shadow: 1px 1px 1px rgba(0,0,0,0.25) inset, 0px 0px 3px rgba(0,0,0,0.15) inset; }
.gradio-button.secondary-down:hover { background: var(--button-secondary-background-fill-hover); color: var(--button-secondary-text-color-hover); }
.gradio-button.tool { max-width: min-content; min-width: min-content !important; font-size: 20px !important; color: var(--body-text-color) !important; align-self: end; margin-bottom: 4px; }
.gradio-checkbox { margin: 0.75em 1.5em 0 0; align-self: center; }
.gradio-checkbox { margin-right: 1em !important; align-self: center; }
.gradio-column { min-width: min(160px, 100%) !important; }
.gradio-container { max-width: unset !important; padding: var(--block-label-padding) !important; }
.gradio-container .prose a, .gradio-container .prose a:visited{ color: unset; text-decoration: none; }
41 changes: 27 additions & 14 deletions modules/devices.py
@@ -4,6 +4,7 @@
import contextlib
from functools import wraps
import torch
from modules import rocm
from modules.errors import log, display, install as install_traceback
from installer import install

@@ -50,8 +51,8 @@ def has_zluda() -> bool:
if not cuda_ok:
return False
try:
device = torch.device("cuda")
return torch.cuda.get_device_name(device).endswith("[ZLUDA]")
dev = torch.device("cuda")
return torch.cuda.get_device_name(dev).endswith("[ZLUDA]")
except Exception:
return False

@@ -206,7 +207,7 @@ def torch_gc(force=False, fast=False):
force = True
if oom > previous_oom:
previous_oom = oom
log.warning(f'GPU out-of-memory error: {mem}')
log.warning(f'Torch GPU out-of-memory error: {mem}')
force = True
if force:
# actual gc
@@ -246,13 +247,26 @@ def set_cuda_sync_mode(mode):
return
try:
import ctypes
log.info(f'Set cuda sync: mode={mode}')
log.info(f'Torch CUDA sync: mode={mode}')
torch.cuda.set_device(torch.device(get_optimal_device_name()))
ctypes.CDLL('libcudart.so').cudaSetDeviceFlags({'auto': 0, 'spin': 1, 'yield': 2, 'block': 4}[mode])
except Exception:
pass


def set_cuda_memory_limit():
if not cuda_ok or opts.cuda_mem_fraction == 0:
return
from modules.shared import cmd_opts
try:
torch_gc(force=True)
mem = torch.cuda.get_device_properties(device).total_memory
torch.cuda.set_per_process_memory_fraction(float(opts.cuda_mem_fraction), cmd_opts.device_id if cmd_opts.device_id is not None else 0)
log.info(f'Torch CUDA memory limit: fraction={opts.cuda_mem_fraction:.2f} limit={round(opts.cuda_mem_fraction * mem / 1024 / 1024)} total={round(mem / 1024 / 1024)}')
except Exception as e:
log.warning(f'Torch CUDA memory limit: fraction={opts.cuda_mem_fraction:.2f} {e}')


def test_fp16():
global fp16_ok # pylint: disable=global-statement
if fp16_ok is not None:
@@ -283,16 +297,14 @@ def test_bf16():
if sys.platform == "darwin" or backend == 'openvino' or backend == 'directml': # override
bf16_ok = False
return bf16_ok
elif backend == 'zluda':
device_name = torch.cuda.get_device_name(device)
if device_name.startswith("AMD Radeon RX "): # only force AMD
device_name = device_name.replace("AMD Radeon RX ", "").split(" ", maxsplit=1)[0]
if len(device_name) == 4 and device_name[0] in {"5", "6"}: # RDNA 1 and 2
bf16_ok = False
return bf16_ok
elif backend == 'rocm':
gcn_arch = getattr(torch.cuda.get_device_properties(device), "gcnArchName", "gfx0000")[3:7]
if len(gcn_arch) == 4 and gcn_arch[0:2] == "10": # RDNA 1 and 2
elif backend == 'rocm' or backend == 'zluda':
agent = None
if backend == 'rocm':
agent = rocm.Agent(getattr(torch.cuda.get_device_properties(device), "gcnArchName", "gfx0000"))
else:
from modules.zluda_installer import default_agent
agent = default_agent
if agent is not None and agent.gfx_version < 0x1100 and agent.arch != rocm.MicroArchitecture.CDNA: # all cards before RDNA 3 except for CDNA cards
bf16_ok = False
return bf16_ok
try:
@@ -450,6 +462,7 @@ def set_dtype():

def set_cuda_params():
override_ipex_math()
set_cuda_memory_limit()
set_cudnn_params()
set_sdpa_params()
set_dtype()
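Review note on the `test_bf16()` change above: stock torch exposes a one-line capability probe, but running a real op exercises the actual kernel path, which is presumably why SD.Next tests with a matmul; on ROCm/ZLUDA the result is now additionally forced to false for all cards before RDNA 3 except CDNA. A minimal sketch contrasting the two checks:

```python
# sketch: generic bf16 capability probe vs. a real op like test_bf16() performs
import torch

print(torch.cuda.is_bf16_supported())  # driver/device capability probe
x = torch.ones((4, 4), dtype=torch.bfloat16, device="cuda")
print((x @ x).dtype)  # exercises an actual bf16 kernel; expect torch.bfloat16
```
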
4 changes: 2 additions & 2 deletions modules/extras.py
@@ -188,7 +188,7 @@ def add_model_metadata(checkpoint_info):
_, extension = os.path.splitext(output_modelname)

if os.path.exists(output_modelname) and not kwargs.get("overwrite", False):
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_tiles()) for _ in range(4)], f"Model alredy exists: {output_modelname}"]
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_titles()) for _ in range(4)], f"Model alredy exists: {output_modelname}"]
if extension.lower() == ".safetensors":
safetensors.torch.save_file(theta_0, output_modelname, metadata=metadata)
else:
@@ -202,7 +202,7 @@ def add_model_metadata(checkpoint_info):
created_model.calculate_shorthash()
devices.torch_gc(force=True)
shared.state.end()
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_tiles()) for _ in range(4)], f"Model saved to {output_modelname}"]
return [*[gr.Dropdown.update(choices=sd_models.checkpoint_titles()) for _ in range(4)], f"Model saved to {output_modelname}"]


def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_name, unet_conv, text_encoder_conv,
7 changes: 4 additions & 3 deletions modules/face/faceid.py
@@ -6,9 +6,10 @@
import diffusers
import huggingface_hub as hf
from PIL import Image
from modules import processing, shared, devices, extra_networks, sd_models, sd_hijack_freeu, script_callbacks, ipadapter
from modules import processing, shared, devices, extra_networks, sd_hijack_freeu, script_callbacks, ipadapter, token_merge
from modules.sd_hijack_hypertile import context_hypertile_vae, context_hypertile_unet


FACEID_MODELS = {
"FaceID Base": "h94/IP-Adapter-FaceID/ip-adapter-faceid_sd15.bin",
"FaceID Plus v1": "h94/IP-Adapter-FaceID/ip-adapter-faceid-plus_sd15.bin",
@@ -69,7 +70,7 @@ def face_id(
shared.prompt_styles.apply_styles_to_extra(p)

if shared.opts.cuda_compile_backend == 'none':
sd_models.apply_token_merging(p.sd_model)
token_merge.apply_token_merging(p.sd_model)
sd_hijack_freeu.apply_freeu(p, not shared.native)

script_callbacks.before_process_callback(p)
@@ -246,7 +247,7 @@ def face_id(
if faceid_model is not None and original_load_ip_adapter is not None:
faceid_model.__class__.load_ip_adapter = original_load_ip_adapter
if shared.opts.cuda_compile_backend == 'none':
sd_models.remove_token_merging(p.sd_model)
token_merge.remove_token_merging(p.sd_model)
script_callbacks.after_process_callback(p)

return processed_images
