diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py index ef9374c441d..127247b2fc0 100755 --- a/.ci/update_windows/update.py +++ b/.ci/update_windows/update.py @@ -1,6 +1,9 @@ import pygit2 from datetime import datetime import sys +import os +import shutil +import filecmp def pull(repo, remote_name='origin', branch='master'): for remote in repo.remotes: @@ -42,7 +45,8 @@ def pull(repo, remote_name='origin', branch='master'): raise AssertionError('Unknown merge analysis result') pygit2.option(pygit2.GIT_OPT_SET_OWNER_VALIDATION, 0) -repo = pygit2.Repository(str(sys.argv[1])) +repo_path = str(sys.argv[1]) +repo = pygit2.Repository(repo_path) ident = pygit2.Signature('comfyui', 'comfy@ui') try: print("stashing current changes") @@ -51,7 +55,10 @@ def pull(repo, remote_name='origin', branch='master'): print("nothing to stash") backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S')) print("creating backup branch: {}".format(backup_branch_name)) -repo.branches.local.create(backup_branch_name, repo.head.peel()) +try: + repo.branches.local.create(backup_branch_name, repo.head.peel()) +except: + pass print("checking out master branch") branch = repo.lookup_branch('master') @@ -63,3 +70,41 @@ def pull(repo, remote_name='origin', branch='master'): print("Done!") +self_update = True +if len(sys.argv) > 2: + self_update = '--skip_self_update' not in sys.argv + +update_py_path = os.path.realpath(__file__) +repo_update_py_path = os.path.join(repo_path, ".ci/update_windows/update.py") + +cur_path = os.path.dirname(update_py_path) + + +req_path = os.path.join(cur_path, "current_requirements.txt") +repo_req_path = os.path.join(repo_path, "requirements.txt") + + +def files_equal(file1, file2): + try: + return filecmp.cmp(file1, file2, shallow=False) + except: + return False + +def file_size(f): + try: + return os.path.getsize(f) + except: + return 0 + + +if self_update and not files_equal(update_py_path, repo_update_py_path) and file_size(repo_update_py_path) > 10: + shutil.copy(repo_update_py_path, os.path.join(cur_path, "update_new.py")) + exit() + +if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path): + import subprocess + try: + subprocess.check_call([sys.executable, '-s', '-m', 'pip', 'install', '-r', repo_req_path]) + shutil.copy(repo_req_path, req_path) + except: + pass diff --git a/.ci/update_windows/update_comfyui.bat b/.ci/update_windows/update_comfyui.bat index 60d1e694fa4..bb08c0de0c7 100755 --- a/.ci/update_windows/update_comfyui.bat +++ b/.ci/update_windows/update_comfyui.bat @@ -1,2 +1,8 @@ +@echo off ..\python_embeded\python.exe .\update.py ..\ComfyUI\ -pause +if exist update_new.py ( + move /y update_new.py update.py + echo Running updater again since it got updated. + ..\python_embeded\python.exe .\update.py ..\ComfyUI\ --skip_self_update +) +if "%~1"=="" pause diff --git a/.ci/update_windows/update_comfyui_and_python_dependencies.bat b/.ci/update_windows/update_comfyui_and_python_dependencies.bat deleted file mode 100755 index b7308550d1d..00000000000 --- a/.ci/update_windows/update_comfyui_and_python_dependencies.bat +++ /dev/null @@ -1,3 +0,0 @@ -..\python_embeded\python.exe .\update.py ..\ComfyUI\ -..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 xformers -r ../ComfyUI/requirements.txt pygit2 -pause diff --git a/.ci/update_windows_cu118/update_comfyui_and_python_dependencies.bat b/.ci/update_windows_cu118/update_comfyui_and_python_dependencies.bat deleted file mode 100755 index c33adc0a7b8..00000000000 --- a/.ci/update_windows_cu118/update_comfyui_and_python_dependencies.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off -..\python_embeded\python.exe .\update.py ..\ComfyUI\ -echo -echo This will try to update pytorch and all python dependencies, if you get an error wait for pytorch/xformers to fix their stuff -echo You should not be running this anyways unless you really have to -echo -echo If you just want to update normally, close this and run update_comfyui.bat instead. -echo -pause -..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 xformers -r ../ComfyUI/requirements.txt pygit2 -pause diff --git a/.github/workflows/windows_release_cu118_dependencies.yml b/.github/workflows/windows_release_cu118_dependencies.yml deleted file mode 100644 index 75c42b624a9..00000000000 --- a/.github/workflows/windows_release_cu118_dependencies.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: "Windows Release cu118 dependencies" - -on: - workflow_dispatch: -# push: -# branches: -# - master - -jobs: - build_dependencies: - env: - # you need at least cuda 5.0 for some of the stuff compiled here. - TORCH_CUDA_ARCH_LIST: "5.0+PTX 6.0 6.1 7.0 7.5 8.0 8.6 8.9" - FORCE_CUDA: 1 - MAX_JOBS: 1 # will crash otherwise - DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc - XFORMERS_BUILD_TYPE: "Release" - runs-on: windows-latest - steps: - - name: Cache Built Dependencies - uses: actions/cache@v3 - id: cache-cu118_python_stuff - with: - path: cu118_python_deps.tar - key: ${{ runner.os }}-build-cu118 - - - if: steps.cache-cu118_python_stuff.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - - - if: steps.cache-cu118_python_stuff.outputs.cache-hit != 'true' - uses: actions/setup-python@v4 - with: - python-version: '3.10.9' - - - if: steps.cache-cu118_python_stuff.outputs.cache-hit != 'true' - uses: comfyanonymous/cuda-toolkit@test - id: cuda-toolkit - with: - cuda: '11.8.0' - # copied from xformers github - - name: Setup MSVC - uses: ilammy/msvc-dev-cmd@v1 - - name: Configure Pagefile - # windows runners will OOM with many CUDA architectures - # we cheat here with a page file - uses: al-cheb/configure-pagefile-action@v1.3 - with: - minimum-size: 2GB - # really unfortunate: https://github.com/ilammy/msvc-dev-cmd#name-conflicts-with-shell-bash - - name: Remove link.exe - shell: bash - run: rm /usr/bin/link - - - if: steps.cache-cu118_python_stuff.outputs.cache-hit != 'true' - shell: bash - run: | - python -m pip wheel --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 -r requirements.txt pygit2 -w ./temp_wheel_dir - python -m pip install --no-cache-dir ./temp_wheel_dir/* - echo installed basic - git clone --recurse-submodules https://github.com/facebookresearch/xformers.git - cd xformers - python -m pip install --no-cache-dir wheel setuptools twine - echo building xformers - python setup.py bdist_wheel -d ../temp_wheel_dir/ - cd .. - rm -rf xformers - ls -lah temp_wheel_dir - mv temp_wheel_dir cu118_python_deps - tar cf cu118_python_deps.tar cu118_python_deps - - diff --git a/.github/workflows/windows_release_cu118_dependencies_2.yml b/.github/workflows/windows_release_cu118_dependencies_2.yml deleted file mode 100644 index a7760b21e15..00000000000 --- a/.github/workflows/windows_release_cu118_dependencies_2.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: "Windows Release cu118 dependencies 2" - -on: - workflow_dispatch: - inputs: - xformers: - description: 'xformers version' - required: true - type: string - default: "xformers" - -# push: -# branches: -# - master - -jobs: - build_dependencies: - runs-on: windows-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: '3.10.9' - - - shell: bash - run: | - python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu118 -r requirements.txt pygit2 -w ./temp_wheel_dir - python -m pip install --no-cache-dir ./temp_wheel_dir/* - echo installed basic - ls -lah temp_wheel_dir - mv temp_wheel_dir cu118_python_deps - tar cf cu118_python_deps.tar cu118_python_deps - - - uses: actions/cache/save@v3 - with: - path: cu118_python_deps.tar - key: ${{ runner.os }}-build-cu118 diff --git a/.github/workflows/windows_release_cu118_package.yml b/.github/workflows/windows_release_cu118_package.yml deleted file mode 100644 index 0f0fbf28039..00000000000 --- a/.github/workflows/windows_release_cu118_package.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: "Windows Release cu118 packaging" - -on: - workflow_dispatch: -# push: -# branches: -# - master - -jobs: - package_comfyui: - permissions: - contents: "write" - packages: "write" - pull-requests: "read" - runs-on: windows-latest - steps: - - uses: actions/cache/restore@v3 - id: cache - with: - path: cu118_python_deps.tar - key: ${{ runner.os }}-build-cu118 - - shell: bash - run: | - mv cu118_python_deps.tar ../ - cd .. - tar xf cu118_python_deps.tar - pwd - ls - - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - persist-credentials: false - - shell: bash - run: | - cd .. - cp -r ComfyUI ComfyUI_copy - curl https://www.python.org/ftp/python/3.10.9/python-3.10.9-embed-amd64.zip -o python_embeded.zip - unzip python_embeded.zip -d python_embeded - cd python_embeded - echo 'import site' >> ./python310._pth - curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py - ./python.exe get-pip.py - ./python.exe -s -m pip install ../cu118_python_deps/* - sed -i '1i../ComfyUI' ./python310._pth - cd .. - - git clone https://github.com/comfyanonymous/taesd - cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/ - - mkdir ComfyUI_windows_portable - mv python_embeded ComfyUI_windows_portable - mv ComfyUI_copy ComfyUI_windows_portable/ComfyUI - - cd ComfyUI_windows_portable - - mkdir update - cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/update_windows_cu118/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ - - cd .. - - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable - mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu118_or_cpu.7z - - cd ComfyUI_windows_portable - python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu - - ls - - - name: Upload binaries to release - uses: svenstaro/upload-release-action@v2 - with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: new_ComfyUI_windows_portable_nvidia_cu118_or_cpu.7z - tag: "latest" - overwrite: true - diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index aafe8a21444..e0841fdf6c3 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -41,10 +41,9 @@ jobs: - shell: bash run: | echo "@echo off - ..\python_embeded\python.exe .\update.py ..\ComfyUI\\ + call update_comfyui.bat nopause echo - - echo This will try to update pytorch and all python dependencies, if you get an error wait for pytorch/xformers to fix their stuff - echo You should not be running this anyways unless you really have to + echo This will try to update pytorch and all python dependencies. echo - echo If you just want to update normally, close this and run update_comfyui.bat instead. echo - diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml index 90e09d27a53..56830935685 100644 --- a/.github/workflows/windows_release_nightly_pytorch.yml +++ b/.github/workflows/windows_release_nightly_pytorch.yml @@ -68,7 +68,7 @@ jobs: cp -r ComfyUI/.ci/update_windows/* ./update/ cp -r ComfyUI/.ci/windows_base_files/* ./ - echo "..\python_embeded\python.exe .\update.py ..\ComfyUI\\ + echo "call update_comfyui.bat nopause ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 pause" > ./update/update_comfyui_and_python_dependencies.bat cd .. diff --git a/comfy/clip_model.py b/comfy/clip_model.py index 9b82a246b2c..14f43c5687c 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -119,6 +119,9 @@ def __init__(self, config_dict, dtype, device, operations): super().__init__() self.num_layers = config_dict["num_hidden_layers"] self.text_model = CLIPTextModel_(config_dict, dtype, device, operations) + embed_dim = config_dict["hidden_size"] + self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) + self.text_projection.weight.copy_(torch.eye(embed_dim)) self.dtype = dtype def get_input_embeddings(self): @@ -128,7 +131,10 @@ def set_input_embeddings(self, embeddings): self.text_model.embeddings.token_embedding = embeddings def forward(self, *args, **kwargs): - return self.text_model(*args, **kwargs) + x = self.text_model(*args, **kwargs) + out = self.text_projection(x[2]) + return (x[0], x[1], out, x[2]) + class CLIPVisionEmbeddings(torch.nn.Module): def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None): diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 416197586a1..f859a50d4c3 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -287,13 +287,13 @@ class control_lora_ops(ControlLoraOps, comfy.ops.manual_cast): for k in sd: weight = sd[k] try: - comfy.utils.set_attr(self.control_model, k, weight) + comfy.utils.set_attr_param(self.control_model, k, weight) except: pass for k in self.control_weights: if k not in {"lora_controlnet"}: - comfy.utils.set_attr(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device())) + comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device())) def copy(self): c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling) diff --git a/comfy/diffusers_convert.py b/comfy/diffusers_convert.py index a9eb9302f14..eb561933aaa 100644 --- a/comfy/diffusers_convert.py +++ b/comfy/diffusers_convert.py @@ -237,8 +237,12 @@ def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""): capture_qkv_bias[k_pre][code2idx[k_code]] = v continue - relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k) - new_state_dict[relabelled_key] = v + text_proj = "transformer.text_projection.weight" + if k.endswith(text_proj): + new_state_dict[k.replace(text_proj, "text_projection")] = v.transpose(0, 1).contiguous() + else: + relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k) + new_state_dict[relabelled_key] = v for k_pre, tensors in capture_qkv_weight.items(): if None in tensors: diff --git a/comfy/extra_samplers/uni_pc.py b/comfy/extra_samplers/uni_pc.py index 08bf0fc9e67..a30d1d03f2e 100644 --- a/comfy/extra_samplers/uni_pc.py +++ b/comfy/extra_samplers/uni_pc.py @@ -358,9 +358,6 @@ def __init__( thresholding=False, max_val=1., variant='bh1', - noise_mask=None, - masked_image=None, - noise=None, ): """Construct a UniPC. @@ -372,9 +369,6 @@ def __init__( self.predict_x0 = predict_x0 self.thresholding = thresholding self.max_val = max_val - self.noise_mask = noise_mask - self.masked_image = masked_image - self.noise = noise def dynamic_thresholding_fn(self, x0, t=None): """ @@ -391,10 +385,7 @@ def noise_prediction_fn(self, x, t): """ Return the noise prediction model. """ - if self.noise_mask is not None: - return self.model(x, t) * self.noise_mask - else: - return self.model(x, t) + return self.model(x, t) def data_prediction_fn(self, x, t): """ @@ -409,8 +400,6 @@ def data_prediction_fn(self, x, t): s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims) x0 = torch.clamp(x0, -s, s) / s - if self.noise_mask is not None: - x0 = x0 * self.noise_mask + (1. - self.noise_mask) * self.masked_image return x0 def model_fn(self, x, t): @@ -723,8 +712,6 @@ def sample(self, x, timesteps, t_start=None, t_end=None, order=3, skip_type='tim assert timesteps.shape[0] - 1 == steps # with torch.no_grad(): for step_index in trange(steps, disable=disable_pbar): - if self.noise_mask is not None: - x = x * self.noise_mask + (1. - self.noise_mask) * (self.masked_image * self.noise_schedule.marginal_alpha(timesteps[step_index]) + self.noise * self.noise_schedule.marginal_std(timesteps[step_index])) if step_index == 0: vec_t = timesteps[0].expand((x.shape[0])) model_prev_list = [self.model_fn(x, vec_t)] @@ -766,7 +753,7 @@ def sample(self, x, timesteps, t_start=None, t_end=None, order=3, skip_type='tim model_x = self.model_fn(x, vec_t) model_prev_list[-1] = model_x if callback is not None: - callback(step_index, model_prev_list[-1], x, steps) + callback({'x': x, 'i': step_index, 'denoised': model_prev_list[-1]}) else: raise NotImplementedError() # if denoise_to_zero: @@ -858,7 +845,7 @@ def predict_eps_sigma(model, input, sigma_in, **kwargs): return (input - model(input, sigma_in, **kwargs)) / sigma -def sample_unipc(model, noise, image, sigmas, max_denoise, extra_args=None, callback=None, disable=False, noise_mask=None, variant='bh1'): +def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'): timesteps = sigmas.clone() if sigmas[-1] == 0: timesteps = sigmas[:] @@ -867,16 +854,7 @@ def sample_unipc(model, noise, image, sigmas, max_denoise, extra_args=None, call timesteps = sigmas.clone() ns = SigmaConvert() - if image is not None: - img = image * ns.marginal_alpha(timesteps[0]) - if max_denoise: - noise_mult = 1.0 - else: - noise_mult = ns.marginal_std(timesteps[0]) - img += noise * noise_mult - else: - img = noise - + noise = noise / torch.sqrt(1.0 + timesteps[0] ** 2.0) model_type = "noise" model_fn = model_wrapper( @@ -888,7 +866,10 @@ def sample_unipc(model, noise, image, sigmas, max_denoise, extra_args=None, call ) order = min(3, len(timesteps) - 2) - uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, noise_mask=noise_mask, masked_image=image, noise=noise, variant=variant) - x = uni_pc.sample(img, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable) + uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, variant=variant) + x = uni_pc.sample(noise, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable) x /= ns.marginal_alpha(timesteps[-1]) return x + +def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False): + return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2') \ No newline at end of file diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 03fd59e3da0..674364e720f 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -1,3 +1,4 @@ +import torch class LatentFormat: scale_factor = 1.0 @@ -34,6 +35,32 @@ def __init__(self): ] self.taesd_decoder_name = "taesdxl_decoder" +class SDXL_Playground_2_5(LatentFormat): + def __init__(self): + self.scale_factor = 0.5 + self.latents_mean = torch.tensor([-1.6574, 1.886, -1.383, 2.5155]).view(1, 4, 1, 1) + self.latents_std = torch.tensor([8.4927, 5.9022, 6.5498, 5.2299]).view(1, 4, 1, 1) + + self.latent_rgb_factors = [ + # R G B + [ 0.3920, 0.4054, 0.4549], + [-0.2634, -0.0196, 0.0653], + [ 0.0568, 0.1687, -0.0755], + [-0.3112, -0.2359, -0.2076] + ] + self.taesd_decoder_name = "taesdxl_decoder" + + def process_in(self, latent): + latents_mean = self.latents_mean.to(latent.device, latent.dtype) + latents_std = self.latents_std.to(latent.device, latent.dtype) + return (latent - latents_mean) * self.scale_factor / latents_std + + def process_out(self, latent): + latents_mean = self.latents_mean.to(latent.device, latent.dtype) + latents_std = self.latents_std.to(latent.device, latent.dtype) + return latent * latents_std / self.scale_factor + latents_mean + + class SD_X4(LatentFormat): def __init__(self): self.scale_factor = 0.08333 diff --git a/comfy/ldm/modules/diffusionmodules/openaimodel.py b/comfy/ldm/modules/diffusionmodules/openaimodel.py index 998afd977ca..cf89ae01782 100644 --- a/comfy/ldm/modules/diffusionmodules/openaimodel.py +++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py @@ -484,7 +484,6 @@ def __init__( self.predict_codebook_ids = n_embed is not None self.default_num_video_frames = None - self.default_image_only_indicator = None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( @@ -708,27 +707,30 @@ def get_resblock( device=device, operations=operations )] - if transformer_depth_middle >= 0: - mid_block += [get_attention_layer( # always uses a self-attn - ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim, - disable_self_attn=disable_middle_self_attn, use_checkpoint=use_checkpoint - ), - get_resblock( - merge_factor=merge_factor, - merge_strategy=merge_strategy, - video_kernel_size=video_kernel_size, - ch=ch, - time_embed_dim=time_embed_dim, - dropout=dropout, - out_channels=None, - dims=dims, - use_checkpoint=use_checkpoint, - use_scale_shift_norm=use_scale_shift_norm, - dtype=self.dtype, - device=device, - operations=operations - )] - self.middle_block = TimestepEmbedSequential(*mid_block) + + self.middle_block = None + if transformer_depth_middle >= -1: + if transformer_depth_middle >= 0: + mid_block += [get_attention_layer( # always uses a self-attn + ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, use_checkpoint=use_checkpoint + ), + get_resblock( + merge_factor=merge_factor, + merge_strategy=merge_strategy, + video_kernel_size=video_kernel_size, + ch=ch, + time_embed_dim=time_embed_dim, + dropout=dropout, + out_channels=None, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + dtype=self.dtype, + device=device, + operations=operations + )] + self.middle_block = TimestepEmbedSequential(*mid_block) self._feature_size += ch self.output_blocks = nn.ModuleList([]) @@ -827,7 +829,7 @@ def forward(self, x, timesteps=None, context=None, y=None, control=None, transfo transformer_patches = transformer_options.get("patches", {}) num_video_frames = kwargs.get("num_video_frames", self.default_num_video_frames) - image_only_indicator = kwargs.get("image_only_indicator", self.default_image_only_indicator) + image_only_indicator = kwargs.get("image_only_indicator", None) time_context = kwargs.get("time_context", None) assert (y is not None) == ( @@ -858,7 +860,8 @@ def forward(self, x, timesteps=None, context=None, y=None, control=None, transfo h = p(h, transformer_options) transformer_options["block"] = ("middle", 0) - h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator) + if self.middle_block is not None: + h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator) h = apply_control(h, control, 'middle') diff --git a/comfy/ldm/modules/diffusionmodules/util.py b/comfy/ldm/modules/diffusionmodules/util.py index 5a6aa7d77d1..ce14ad5e18c 100644 --- a/comfy/ldm/modules/diffusionmodules/util.py +++ b/comfy/ldm/modules/diffusionmodules/util.py @@ -46,23 +46,25 @@ def __init__( else: raise ValueError(f"unknown merge strategy {self.merge_strategy}") - def get_alpha(self, image_only_indicator: torch.Tensor) -> torch.Tensor: + def get_alpha(self, image_only_indicator: torch.Tensor, device) -> torch.Tensor: # skip_time_mix = rearrange(repeat(skip_time_mix, 'b -> (b t) () () ()', t=t), '(b t) 1 ... -> b 1 t ...', t=t) if self.merge_strategy == "fixed": # make shape compatible # alpha = repeat(self.mix_factor, '1 -> b () t () ()', t=t, b=bs) - alpha = self.mix_factor.to(image_only_indicator.device) + alpha = self.mix_factor.to(device) elif self.merge_strategy == "learned": - alpha = torch.sigmoid(self.mix_factor.to(image_only_indicator.device)) + alpha = torch.sigmoid(self.mix_factor.to(device)) # make shape compatible # alpha = repeat(alpha, '1 -> s () ()', s = t * bs) elif self.merge_strategy == "learned_with_images": - assert image_only_indicator is not None, "need image_only_indicator ..." - alpha = torch.where( - image_only_indicator.bool(), - torch.ones(1, 1, device=image_only_indicator.device), - rearrange(torch.sigmoid(self.mix_factor.to(image_only_indicator.device)), "... -> ... 1"), - ) + if image_only_indicator is None: + alpha = rearrange(torch.sigmoid(self.mix_factor.to(device)), "... -> ... 1") + else: + alpha = torch.where( + image_only_indicator.bool(), + torch.ones(1, 1, device=image_only_indicator.device), + rearrange(torch.sigmoid(self.mix_factor.to(image_only_indicator.device)), "... -> ... 1"), + ) alpha = rearrange(alpha, self.rearrange_pattern) # make shape compatible # alpha = repeat(alpha, '1 -> s () ()', s = t * bs) @@ -76,7 +78,7 @@ def forward( x_temporal, image_only_indicator=None, ) -> torch.Tensor: - alpha = self.get_alpha(image_only_indicator) + alpha = self.get_alpha(image_only_indicator, x_spatial.device) x = ( alpha.to(x_spatial.dtype) * x_spatial + (1.0 - alpha).to(x_spatial.dtype) * x_temporal diff --git a/comfy/lora.py b/comfy/lora.py index 5e4009b47f9..21b9897a63a 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -197,6 +197,15 @@ def model_lora_keys_clip(model, key_map={}): key_map[lora_key] = k lora_key = "text_encoder.text_model.encoder.layers.{}.{}".format(b, c) #diffusers lora key_map[lora_key] = k + lora_key = "lora_prior_te_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #cascade lora: TODO put lora key prefix in the model config + key_map[lora_key] = k + + + k = "clip_g.transformer.text_projection.weight" + if k in sdk: + key_map["lora_prior_te_text_projection"] = k #cascade lora? + # key_map["text_encoder.text_projection"] = k #TODO: check if other lora have the text_projection too + # key_map["lora_te_text_projection"] = k return key_map @@ -207,6 +216,7 @@ def model_lora_keys_unet(model, key_map={}): if k.startswith("diffusion_model.") and k.endswith(".weight"): key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_") key_map["lora_unet_{}".format(key_lora)] = k + key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config) for k in diffusers_keys: diff --git a/comfy/model_base.py b/comfy/model_base.py index 421f271b28a..a9de1366724 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -15,9 +15,10 @@ class ModelType(Enum): V_PREDICTION = 2 V_PREDICTION_EDM = 3 STABLE_CASCADE = 4 + EDM = 5 -from comfy.model_sampling import EPS, V_PREDICTION, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling +from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling def model_sampling(model_config, model_type): @@ -33,6 +34,9 @@ def model_sampling(model_config, model_type): elif model_type == ModelType.STABLE_CASCADE: c = EPS s = StableCascadeSampling + elif model_type == ModelType.EDM: + c = EDM + s = ModelSamplingContinuousEDM class ModelSampling(s, c): pass @@ -163,6 +167,10 @@ def blank_inpaint_image_like(latent_image): if cross_attn_cnet is not None: out['crossattn_controlnet'] = comfy.conds.CONDCrossAttn(cross_attn_cnet) + c_concat = kwargs.get("noise_concat", None) + if c_concat is not None: + out['c_concat'] = comfy.conds.CONDNoiseShape(data) + return out def load_model_weights(self, sd, unet_prefix=""): @@ -368,7 +376,6 @@ def extra_conds(self, **kwargs): if "time_conditioning" in kwargs: out["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"]) - out['image_only_indicator'] = comfy.conds.CONDConstant(torch.zeros((1,), device=device)) out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0]) return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8fca6d8c8e4..07ee8570864 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -151,8 +151,10 @@ def detect_unet_config(state_dict, key_prefix): channel_mult.append(last_channel_mult) if "{}middle_block.1.proj_in.weight".format(key_prefix) in state_dict_keys: transformer_depth_middle = count_blocks(state_dict_keys, '{}middle_block.1.transformer_blocks.'.format(key_prefix) + '{}') - else: + elif "{}middle_block.0.in_layers.0.weight".format(key_prefix) in state_dict_keys: transformer_depth_middle = -1 + else: + transformer_depth_middle = -2 unet_config["in_channels"] = in_channels unet_config["out_channels"] = out_channels @@ -242,6 +244,7 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None): down_blocks = count_blocks(state_dict, "down_blocks.{}") for i in range(down_blocks): attn_blocks = count_blocks(state_dict, "down_blocks.{}.attentions.".format(i) + '{}') + res_blocks = count_blocks(state_dict, "down_blocks.{}.resnets.".format(i) + '{}') for ab in range(attn_blocks): transformer_count = count_blocks(state_dict, "down_blocks.{}.attentions.{}.transformer_blocks.".format(i, ab) + '{}') transformer_depth.append(transformer_count) @@ -250,8 +253,8 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None): attn_res *= 2 if attn_blocks == 0: - transformer_depth.append(0) - transformer_depth.append(0) + for i in range(res_blocks): + transformer_depth.append(0) match["transformer_depth"] = transformer_depth @@ -329,7 +332,19 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None): 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -1, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'use_temporal_attention': False, 'use_temporal_resblock': False} - supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega] + KOALA_700M = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, + 'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, + 'num_res_blocks': [1, 1, 1], 'transformer_depth': [0, 2, 5], 'transformer_depth_output': [0, 0, 2, 2, 5, 5], + 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, + 'use_temporal_attention': False, 'use_temporal_resblock': False} + + KOALA_1B = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, + 'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, + 'num_res_blocks': [1, 1, 1], 'transformer_depth': [0, 2, 6], 'transformer_depth_output': [0, 0, 2, 2, 6, 6], + 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 6, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, + 'use_temporal_attention': False, 'use_temporal_resblock': False} + + supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B] for unet_config in supported_models: matches = True diff --git a/comfy/model_management.py b/comfy/model_management.py index d2d8a4d06d8..f0d726fcdae 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -767,7 +767,7 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma #FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled #when the model doesn't actually fit on the card #TODO: actually test if GP106 and others have the same type of behavior - nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050"] + nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"] for x in nvidia_10_series: if x in props.name.lower(): fp16_works = True diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a88b737cca3..4a5d42b035c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -67,6 +67,9 @@ def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_op def set_model_unet_function_wrapper(self, unet_wrapper_function): self.model_options["model_function_wrapper"] = unet_wrapper_function + def set_model_denoise_mask_function(self, denoise_mask_function): + self.model_options["denoise_mask_function"] = denoise_mask_function + def set_model_patch(self, patch, name): to = self.model_options["transformer_options"] if "patches" not in to: @@ -176,10 +179,9 @@ def model_state_dict(self, filter_prefix=None): def patch_model(self, device_to=None, patch_weights=True): for k in self.object_patches: - old = getattr(self.model, k) + old = comfy.utils.set_attr(self.model, k, self.object_patches[k]) if k not in self.object_patches_backup: self.object_patches_backup[k] = old - setattr(self.model, k, self.object_patches[k]) if patch_weights: model_sd = self.model_state_dict() @@ -203,7 +205,7 @@ def patch_model(self, device_to=None, patch_weights=True): if inplace_update: comfy.utils.copy_to_param(self.model, key, out_weight) else: - comfy.utils.set_attr(self.model, key, out_weight) + comfy.utils.set_attr_param(self.model, key, out_weight) del temp_weight if device_to is not None: @@ -342,7 +344,7 @@ def unpatch_model(self, device_to=None): comfy.utils.copy_to_param(self.model, k, self.backup[k]) else: for k in keys: - comfy.utils.set_attr(self.model, k, self.backup[k]) + comfy.utils.set_attr_param(self.model, k, self.backup[k]) self.backup = {} @@ -352,6 +354,6 @@ def unpatch_model(self, device_to=None): keys = list(self.object_patches_backup.keys()) for k in keys: - setattr(self.model, k, self.object_patches_backup[k]) + comfy.utils.set_attr(self.model, k, self.object_patches_backup[k]) self.object_patches_backup = {} diff --git a/comfy/model_sampling.py b/comfy/model_sampling.py index 97e91a01d67..b46202b7803 100644 --- a/comfy/model_sampling.py +++ b/comfy/model_sampling.py @@ -11,12 +11,25 @@ def calculate_denoised(self, sigma, model_output, model_input): sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) return model_input - model_output * sigma + def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): + if max_denoise: + noise = noise * torch.sqrt(1.0 + sigma ** 2.0) + else: + noise = noise * sigma + if latent_image is not None: + noise += latent_image + return noise class V_PREDICTION(EPS): def calculate_denoised(self, sigma, model_output, model_input): sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 +class EDM(V_PREDICTION): + def calculate_denoised(self, sigma, model_output, model_input): + sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + class ModelSamplingDiscrete(torch.nn.Module): def __init__(self, model_config=None): @@ -92,8 +105,6 @@ def percent_to_sigma(self, percent): class ModelSamplingContinuousEDM(torch.nn.Module): def __init__(self, model_config=None): super().__init__() - self.sigma_data = 1.0 - if model_config is not None: sampling_settings = model_config.sampling_settings else: @@ -101,9 +112,11 @@ def __init__(self, model_config=None): sigma_min = sampling_settings.get("sigma_min", 0.002) sigma_max = sampling_settings.get("sigma_max", 120.0) - self.set_sigma_range(sigma_min, sigma_max) + sigma_data = sampling_settings.get("sigma_data", 1.0) + self.set_parameters(sigma_min, sigma_max, sigma_data) - def set_sigma_range(self, sigma_min, sigma_max): + def set_parameters(self, sigma_min, sigma_max, sigma_data): + self.sigma_data = sigma_data sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), 1000).exp() self.register_buffer('sigmas', sigmas) #for compatibility with some schedulers diff --git a/comfy/samplers.py b/comfy/samplers.py index c795f208d80..6863be4eb7b 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -208,6 +208,7 @@ def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): cur_patches[p] = cur_patches[p] + patches[p] else: cur_patches[p] = patches[p] + transformer_options["patches"] = cur_patches else: transformer_options["patches"] = patches @@ -271,13 +272,16 @@ def forward(self, *args, **kwargs): return self.apply_model(*args, **kwargs) class KSamplerX0Inpaint(torch.nn.Module): - def __init__(self, model): + def __init__(self, model, sigmas): super().__init__() self.inner_model = model + self.sigmas = sigmas def forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, model_options={}, seed=None): if denoise_mask is not None: + if "denoise_mask_function" in model_options: + denoise_mask = model_options["denoise_mask_function"](sigma, denoise_mask, extra_options={"model": self.inner_model, "sigmas": self.sigmas}) latent_mask = 1. - denoise_mask - x = x * denoise_mask + (self.latent_image + self.noise * sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1))) * latent_mask + x = x * denoise_mask + self.inner_model.inner_model.model_sampling.noise_scaling(sigma.reshape([sigma.shape[0]] + [1] * (len(self.noise.shape) - 1)), self.noise, self.latent_image) * latent_mask out = self.inner_model(x, sigma, cond=cond, uncond=uncond, cond_scale=cond_scale, model_options=model_options, seed=seed) if denoise_mask is not None: out = out * denoise_mask + self.latent_image * latent_mask @@ -513,14 +517,6 @@ def max_denoise(self, model_wrap, sigmas): sigma = float(sigmas[0]) return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma -class UNIPC(Sampler): - def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): - return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, disable=disable_pbar) - -class UNIPCBH2(Sampler): - def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): - return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, variant='bh2', disable=disable_pbar) - KSAMPLER_NAMES = ["euler", "euler_ancestral", "heun", "heunpp2","dpm_2", "dpm_2_ancestral", "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu", "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm"] @@ -533,7 +529,7 @@ def __init__(self, sampler_function, extra_options={}, inpaint_options={}): def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): extra_args["denoise_mask"] = denoise_mask - model_k = KSamplerX0Inpaint(model_wrap) + model_k = KSamplerX0Inpaint(model_wrap, sigmas) model_k.latent_image = latent_image if self.inpaint_options.get("random", False): #TODO: Should this be the default? generator = torch.manual_seed(extra_args.get("seed", 41) + 1) @@ -541,19 +537,13 @@ def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=N else: model_k.noise = noise - if self.max_denoise(model_wrap, sigmas): - noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0) - else: - noise = noise * sigmas[0] + noise = model_wrap.inner_model.model_sampling.noise_scaling(sigmas[0], noise, latent_image, self.max_denoise(model_wrap, sigmas)) k_callback = None total_steps = len(sigmas) - 1 if callback is not None: k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps) - if latent_image is not None: - noise += latent_image - samples = self.sampler_function(model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar, **self.extra_options) return samples @@ -595,7 +585,7 @@ def sample(model, noise, positive, negative, cfg, device, sampler, sigmas, model calculate_start_end_timesteps(model, negative) calculate_start_end_timesteps(model, positive) - if latent_image is not None: + if latent_image is not None and torch.count_nonzero(latent_image) > 0: #Don't shift the empty latent image. latent_image = model.process_latent_in(latent_image) if hasattr(model, 'extra_conds'): @@ -640,9 +630,9 @@ def calculate_sigmas_scheduler(model, scheduler_name, steps): def sampler_object(name): if name == "uni_pc": - sampler = UNIPC() + sampler = KSAMPLER(uni_pc.sample_unipc) elif name == "uni_pc_bh2": - sampler = UNIPCBH2() + sampler = KSAMPLER(uni_pc.sample_unipc_bh2) elif name == "ddim": sampler = ksampler("euler", inpaint_options={"random": True}) else: diff --git a/comfy/sd.py b/comfy/sd.py index 48d567009a3..bcc900558f9 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -52,7 +52,7 @@ def load_clip_weights(model, sd): if ids.dtype == torch.float32: sd['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round() - sd = comfy.utils.transformers_convert(sd, "cond_stage_model.model.", "cond_stage_model.transformer.text_model.", 24) + sd = comfy.utils.clip_text_transformers_convert(sd, "cond_stage_model.model.", "cond_stage_model.transformer.") return load_model_weights(model, sd) @@ -123,10 +123,13 @@ def tokenize(self, text, return_word_ids=False): return self.tokenizer.tokenize_with_weights(text, return_word_ids) def encode_from_tokens(self, tokens, return_pooled=False): + self.cond_stage_model.reset_clip_options() + if self.layer_idx is not None: - self.cond_stage_model.clip_layer(self.layer_idx) - else: - self.cond_stage_model.reset_clip_layer() + self.cond_stage_model.set_clip_options({"layer": self.layer_idx}) + + if return_pooled == "unprojected": + self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model() cond, pooled = self.cond_stage_model.encode_token_weights(tokens) @@ -391,7 +394,10 @@ class EmptyClass: for i in range(len(clip_data)): if "transformer.resblocks.0.ln_1.weight" in clip_data[i]: - clip_data[i] = comfy.utils.transformers_convert(clip_data[i], "", "text_model.", 32) + clip_data[i] = comfy.utils.clip_text_transformers_convert(clip_data[i], "", "") + else: + if "text_projection" in clip_data[i]: + clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node clip_target = EmptyClass() clip_target.params = {} diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 8287ad2e8b8..87e3eaa4ddb 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -67,7 +67,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): ] def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel, - special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False): # clip-vit-base-patch32 + special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, return_projected_pooled=True): # clip-vit-base-patch32 super().__init__() assert layer in self.LAYERS @@ -86,16 +86,18 @@ def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_le self.layer = layer self.layer_idx = None self.special_tokens = special_tokens - self.text_projection = torch.nn.Parameter(torch.eye(self.transformer.get_input_embeddings().weight.shape[1])) + self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055)) self.enable_attention_masks = enable_attention_masks self.layer_norm_hidden_state = layer_norm_hidden_state + self.return_projected_pooled = return_projected_pooled + if layer == "hidden": assert layer_idx is not None assert abs(layer_idx) < self.num_layers - self.clip_layer(layer_idx) - self.layer_default = (self.layer, self.layer_idx) + self.set_clip_options({"layer": layer_idx}) + self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled) def freeze(self): self.transformer = self.transformer.eval() @@ -103,16 +105,19 @@ def freeze(self): for param in self.parameters(): param.requires_grad = False - def clip_layer(self, layer_idx): - if abs(layer_idx) > self.num_layers: + def set_clip_options(self, options): + layer_idx = options.get("layer", self.layer_idx) + self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) + if layer_idx is None or abs(layer_idx) > self.num_layers: self.layer = "last" else: self.layer = "hidden" self.layer_idx = layer_idx - def reset_clip_layer(self): - self.layer = self.layer_default[0] - self.layer_idx = self.layer_default[1] + def reset_clip_options(self): + self.layer = self.options_default[0] + self.layer_idx = self.options_default[1] + self.return_projected_pooled = self.options_default[2] def set_up_textual_embeddings(self, tokens, current_embeds): out_tokens = [] @@ -177,23 +182,19 @@ def forward(self, tokens): else: z = outputs[1] - if outputs[2] is not None: - pooled_output = outputs[2].float() - else: - pooled_output = None + pooled_output = None + if len(outputs) >= 3: + if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None: + pooled_output = outputs[3].float() + elif outputs[2] is not None: + pooled_output = outputs[2].float() - if self.text_projection is not None and pooled_output is not None: - pooled_output = pooled_output.float().to(self.text_projection.device) @ self.text_projection.float() return z.float(), pooled_output def encode(self, tokens): return self(tokens) def load_sd(self, sd): - if "text_projection" in sd: - self.text_projection[:] = sd.pop("text_projection") - if "text_projection.weight" in sd: - self.text_projection[:] = sd.pop("text_projection.weight").transpose(0, 1) return self.transformer.load_state_dict(sd, strict=False) def parse_parentheses(string): @@ -354,11 +355,12 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No return embed_out class SDTokenizer: - def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True): + def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None): if tokenizer_path is None: tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path) self.max_length = max_length + self.min_length = min_length empty = self.tokenizer('')["input_ids"] if has_start_token: @@ -470,6 +472,8 @@ def tokenize_with_weights(self, text:str, return_word_ids=False): batch.append((self.end_token, 1.0, 0)) if self.pad_to_max_length: batch.extend([(pad_token, 1.0, 0)] * (self.max_length - len(batch))) + if self.min_length is not None and len(batch) < self.min_length: + batch.extend([(pad_token, 1.0, 0)] * (self.min_length - len(batch))) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] @@ -503,11 +507,11 @@ def __init__(self, device="cpu", dtype=None, clip_name="l", clip_model=SDClipMod self.clip = "clip_{}".format(self.clip_name) setattr(self, self.clip, clip_model(device=device, dtype=dtype, **kwargs)) - def clip_layer(self, layer_idx): - getattr(self, self.clip).clip_layer(layer_idx) + def set_clip_options(self, options): + getattr(self, self.clip).set_clip_options(options) - def reset_clip_layer(self): - getattr(self, self.clip).reset_clip_layer() + def reset_clip_options(self): + getattr(self, self.clip).reset_clip_options() def encode_token_weights(self, token_weight_pairs): token_weight_pairs = token_weight_pairs[self.clip_name] diff --git a/comfy/sdxl_clip.py b/comfy/sdxl_clip.py index 3ce5c7e05e6..e62d1ed868c 100644 --- a/comfy/sdxl_clip.py +++ b/comfy/sdxl_clip.py @@ -40,13 +40,13 @@ def __init__(self, device="cpu", dtype=None): self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False) self.clip_g = SDXLClipG(device=device, dtype=dtype) - def clip_layer(self, layer_idx): - self.clip_l.clip_layer(layer_idx) - self.clip_g.clip_layer(layer_idx) + def set_clip_options(self, options): + self.clip_l.set_clip_options(options) + self.clip_g.set_clip_options(options) - def reset_clip_layer(self): - self.clip_g.reset_clip_layer() - self.clip_l.reset_clip_layer() + def reset_clip_options(self): + self.clip_g.reset_clip_options() + self.clip_l.reset_clip_options() def encode_token_weights(self, token_weight_pairs): token_weight_pairs_g = token_weight_pairs["g"] diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 5bb98d88a96..3758210326c 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -75,7 +75,7 @@ def process_clip_state_dict(self, state_dict): replace_prefix["conditioner.embedders.0.model."] = "clip_h." #SD2 in sgm format replace_prefix["cond_stage_model.model."] = "clip_h." state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True) - state_dict = utils.transformers_convert(state_dict, "clip_h.", "clip_h.transformer.text_model.", 24) + state_dict = utils.clip_text_transformers_convert(state_dict, "clip_h.", "clip_h.transformer.") return state_dict def process_clip_state_dict_for_saving(self, state_dict): @@ -134,7 +134,7 @@ def process_clip_state_dict(self, state_dict): replace_prefix["conditioner.embedders.0.model."] = "clip_g." state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True) - state_dict = utils.transformers_convert(state_dict, "clip_g.", "clip_g.transformer.text_model.", 32) + state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.") state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace) return state_dict @@ -163,7 +163,13 @@ class SDXL(supported_models_base.BASE): latent_format = latent_formats.SDXL def model_type(self, state_dict, prefix=""): - if "v_pred" in state_dict: + if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5 + self.latent_format = latent_formats.SDXL_Playground_2_5() + self.sampling_settings["sigma_data"] = 0.5 + self.sampling_settings["sigma_max"] = 80.0 + self.sampling_settings["sigma_min"] = 0.002 + return model_base.ModelType.EDM + elif "v_pred" in state_dict: return model_base.ModelType.V_PREDICTION else: return model_base.ModelType.EPS @@ -182,22 +188,24 @@ def process_clip_state_dict(self, state_dict): replace_prefix["conditioner.embedders.1.model."] = "clip_g." state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True) - state_dict = utils.transformers_convert(state_dict, "clip_g.", "clip_g.transformer.text_model.", 32) - keys_to_replace["clip_g.text_projection.weight"] = "clip_g.text_projection" - state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace) + state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.") return state_dict def process_clip_state_dict_for_saving(self, state_dict): replace_prefix = {} keys_to_replace = {} state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g") - if "clip_g.transformer.text_model.embeddings.position_ids" in state_dict_g: - state_dict_g.pop("clip_g.transformer.text_model.embeddings.position_ids") for k in state_dict: if k.startswith("clip_l"): state_dict_g[k] = state_dict[k] + state_dict_g["clip_l.transformer.text_model.embeddings.position_ids"] = torch.arange(77).expand((1, -1)) + pop_keys = ["clip_l.transformer.text_projection.weight", "clip_l.logit_scale"] + for p in pop_keys: + if p in state_dict_g: + state_dict_g.pop(p) + replace_prefix["clip_g"] = "conditioner.embedders.1.model" replace_prefix["clip_l"] = "conditioner.embedders.0" state_dict_g = utils.state_dict_prefix_replace(state_dict_g, replace_prefix) @@ -226,6 +234,26 @@ class Segmind_Vega(SDXL): "use_temporal_attention": False, } +class KOALA_700M(SDXL): + unet_config = { + "model_channels": 320, + "use_linear_in_transformer": True, + "transformer_depth": [0, 2, 5], + "context_dim": 2048, + "adm_in_channels": 2816, + "use_temporal_attention": False, + } + +class KOALA_1B(SDXL): + unet_config = { + "model_channels": 320, + "use_linear_in_transformer": True, + "transformer_depth": [0, 2, 6], + "context_dim": 2048, + "adm_in_channels": 2816, + "use_temporal_attention": False, + } + class SVD_img2vid(supported_models_base.BASE): unet_config = { "model_channels": 320, @@ -338,6 +366,12 @@ def process_unet_state_dict(self, state_dict): state_dict[k_to] = weights[shape_from*x:shape_from*(x + 1)] return state_dict + def process_clip_state_dict(self, state_dict): + state_dict = utils.state_dict_prefix_replace(state_dict, {k: "" for k in self.text_encoder_key_prefix}, filter_keys=True) + if "clip_g.text_projection" in state_dict: + state_dict["clip_g.transformer.text_projection.weight"] = state_dict.pop("clip_g.text_projection").transpose(0, 1) + return state_dict + def get_model(self, state_dict, prefix="", device=None): out = model_base.StableCascade_C(self, device=device) return out @@ -366,5 +400,5 @@ def get_model(self, state_dict, prefix="", device=None): return out -models = [Stable_Zero123, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXLRefiner, SDXL, SSD1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B] +models = [Stable_Zero123, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B] models += [SVD_img2vid] diff --git a/comfy/utils.py b/comfy/utils.py index 1ccdda638a0..819b92badb5 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -124,8 +124,22 @@ def transformers_convert(sd, prefix_from, prefix_to, number): p = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"] k_to = "{}encoder.layers.{}.{}.{}".format(prefix_to, resblock, p[x], y) sd[k_to] = weights[shape_from*x:shape_from*(x + 1)] + return sd +def clip_text_transformers_convert(sd, prefix_from, prefix_to): + sd = transformers_convert(sd, prefix_from, "{}text_model.".format(prefix_to), 32) + + tp = "{}text_projection.weight".format(prefix_from) + if tp in sd: + sd["{}text_projection.weight".format(prefix_to)] = sd.pop(tp) + + tp = "{}text_projection".format(prefix_from) + if tp in sd: + sd["{}text_projection.weight".format(prefix_to)] = sd.pop(tp).transpose(0, 1).contiguous() + return sd + + UNET_MAP_ATTENTIONS = { "proj_in.weight", "proj_in.bias", @@ -306,8 +320,11 @@ def set_attr(obj, attr, value): for name in attrs[:-1]: obj = getattr(obj, name) prev = getattr(obj, attrs[-1]) - setattr(obj, attrs[-1], torch.nn.Parameter(value, requires_grad=False)) - del prev + setattr(obj, attrs[-1], value) + return prev + +def set_attr_param(obj, attr, value): + return set_attr(obj, attr, torch.nn.Parameter(value, requires_grad=False)) def copy_to_param(obj, attr, value): # inplace update tensor instead of replacing it diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py new file mode 100644 index 00000000000..7e858a71b08 --- /dev/null +++ b/comfy_extras/nodes_differential_diffusion.py @@ -0,0 +1,42 @@ +# code adapted from https://github.com/exx8/differential-diffusion + +import torch + +class DifferentialDiffusion(): + @classmethod + def INPUT_TYPES(s): + return {"required": {"model": ("MODEL", ), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "apply" + CATEGORY = "_for_testing" + INIT = False + + def apply(self, model): + model = model.clone() + model.set_model_denoise_mask_function(self.forward) + return (model,) + + def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict): + model = extra_options["model"] + step_sigmas = extra_options["sigmas"] + sigma_to = model.inner_model.model_sampling.sigma_min + if step_sigmas[-1] > sigma_to: + sigma_to = step_sigmas[-1] + sigma_from = step_sigmas[0] + + ts_from = model.inner_model.model_sampling.timestep(sigma_from) + ts_to = model.inner_model.model_sampling.timestep(sigma_to) + current_ts = model.inner_model.model_sampling.timestep(sigma) + + threshold = (current_ts - ts_to) / (ts_from - ts_to) + + return (denoise_mask >= threshold).to(denoise_mask.dtype) + + +NODE_CLASS_MAPPINGS = { + "DifferentialDiffusion": DifferentialDiffusion, +} +NODE_DISPLAY_NAME_MAPPINGS = { + "DifferentialDiffusion": "Differential Diffusion", +} diff --git a/comfy_extras/nodes_model_advanced.py b/comfy_extras/nodes_model_advanced.py index 1b3f3945e38..21af4b73339 100644 --- a/comfy_extras/nodes_model_advanced.py +++ b/comfy_extras/nodes_model_advanced.py @@ -1,6 +1,7 @@ import folder_paths import comfy.sd import comfy.model_sampling +import comfy.latent_formats import torch class LCM(comfy.model_sampling.EPS): @@ -135,7 +136,7 @@ class ModelSamplingContinuousEDM: @classmethod def INPUT_TYPES(s): return {"required": { "model": ("MODEL",), - "sampling": (["v_prediction", "eps"],), + "sampling": (["v_prediction", "edm_playground_v2.5", "eps"],), "sigma_max": ("FLOAT", {"default": 120.0, "min": 0.0, "max": 1000.0, "step":0.001, "round": False}), "sigma_min": ("FLOAT", {"default": 0.002, "min": 0.0, "max": 1000.0, "step":0.001, "round": False}), }} @@ -148,17 +149,25 @@ def INPUT_TYPES(s): def patch(self, model, sampling, sigma_max, sigma_min): m = model.clone() + latent_format = None + sigma_data = 1.0 if sampling == "eps": sampling_type = comfy.model_sampling.EPS elif sampling == "v_prediction": sampling_type = comfy.model_sampling.V_PREDICTION + elif sampling == "edm_playground_v2.5": + sampling_type = comfy.model_sampling.EDM + sigma_data = 0.5 + latent_format = comfy.latent_formats.SDXL_Playground_2_5() class ModelSamplingAdvanced(comfy.model_sampling.ModelSamplingContinuousEDM, sampling_type): pass model_sampling = ModelSamplingAdvanced(model.model.model_config) - model_sampling.set_sigma_range(sigma_min, sigma_max) + model_sampling.set_parameters(sigma_min, sigma_max, sigma_data) m.add_object_patch("model_sampling", model_sampling) + if latent_format is not None: + m.add_object_patch("latent_format", latent_format) return (m, ) class RescaleCFG: diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py index 45e4d418f4f..64bbc1dcd42 100644 --- a/comfy_extras/nodes_perpneg.py +++ b/comfy_extras/nodes_perpneg.py @@ -35,7 +35,7 @@ def cfg_function(args): pos = noise_pred_pos - noise_pred_nocond neg = noise_pred_neg - noise_pred_nocond - perp = ((torch.mul(pos, neg).sum())/(torch.norm(neg)**2)) * neg + perp = neg - ((torch.mul(neg, pos).sum())/(torch.norm(pos)**2)) * pos perp_neg = perp * neg_scale cfg_result = noise_pred_nocond + cond_scale*(pos - perp_neg) cfg_result = x - cfg_result diff --git a/custom_nodes/example_node.py.example b/custom_nodes/example_node.py.example index 7ce271ec617..f066325930d 100644 --- a/custom_nodes/example_node.py.example +++ b/custom_nodes/example_node.py.example @@ -103,6 +103,9 @@ class Example: #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen): # return "" +# Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension +# WEB_DIRECTORY = "./somejs" + # A dictionary that contains all nodes you want to export with their names # NOTE: names should be globally unique NODE_CLASS_MAPPINGS = { diff --git a/main.py b/main.py index 69d9bce6cb7..5d07ce2d1fb 100644 --- a/main.py +++ b/main.py @@ -193,6 +193,13 @@ def load_extra_path_config(yaml_path): folder_paths.set_temp_directory(temp_dir) cleanup_temp() + if args.windows_standalone_build: + try: + import new_updater + new_updater.update_windows_updater() + except: + pass + loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) server = server.PromptServer(loop) diff --git a/new_updater.py b/new_updater.py new file mode 100644 index 00000000000..a49e0877cb1 --- /dev/null +++ b/new_updater.py @@ -0,0 +1,35 @@ +import os +import shutil + +base_path = os.path.dirname(os.path.realpath(__file__)) + + +def update_windows_updater(): + top_path = os.path.dirname(base_path) + updater_path = os.path.join(base_path, ".ci/update_windows/update.py") + bat_path = os.path.join(base_path, ".ci/update_windows/update_comfyui.bat") + + dest_updater_path = os.path.join(top_path, "update/update.py") + dest_bat_path = os.path.join(top_path, "update/update_comfyui.bat") + dest_bat_deps_path = os.path.join(top_path, "update/update_comfyui_and_python_dependencies.bat") + + try: + with open(dest_bat_path, 'rb') as f: + contents = f.read() + except: + return + + if not contents.startswith(b"..\\python_embeded\\python.exe .\\update.py"): + return + + shutil.copy(updater_path, dest_updater_path) + try: + with open(dest_bat_deps_path, 'rb') as f: + contents = f.read() + contents = contents.replace(b'..\\python_embeded\\python.exe .\\update.py ..\\ComfyUI\\', b'call update_comfyui.bat nopause') + with open(dest_bat_deps_path, 'wb') as f: + f.write(contents) + except: + pass + shutil.copy(bat_path, dest_bat_path) + print("Updated the windows standalone package updater.") diff --git a/nodes.py b/nodes.py index a577c212628..2cfea1a7329 100644 --- a/nodes.py +++ b/nodes.py @@ -1003,7 +1003,7 @@ def INPUT_TYPES(s): def append(self, conditioning_to, clip, gligen_textbox_model, text, width, height, x, y): c = [] - cond, cond_pooled = clip.encode_from_tokens(clip.tokenize(text), return_pooled=True) + cond, cond_pooled = clip.encode_from_tokens(clip.tokenize(text), return_pooled="unprojected") for t in conditioning_to: n = [t[0], t[1].copy()] position_params = [(cond_pooled, height // 8, width // 8, y // 8, x // 8)] @@ -1961,6 +1961,7 @@ def init_custom_nodes(): "nodes_photomaker.py", "nodes_cond.py", "nodes_stable_cascade.py", + "nodes_differential_diffusion.py", ] for node_file in extras_files: diff --git a/server.py b/server.py index dca06f6fc32..c6132cdf91e 100644 --- a/server.py +++ b/server.py @@ -539,11 +539,11 @@ def add_routes(self): for name, dir in nodes.EXTENSION_WEB_DIRS.items(): self.app.add_routes([ - web.static('/extensions/' + urllib.parse.quote(name), dir, follow_symlinks=True), + web.static('/extensions/' + urllib.parse.quote(name), dir), ]) self.app.add_routes([ - web.static('/', self.web_root, follow_symlinks=True), + web.static('/', self.web_root), ]) def get_queue_info(self):