gradient
54rt1n committed Jan 29, 2024
1 parent 9abe438 commit 7c62608
Showing 16 changed files with 3,364 additions and 359 deletions.
30 changes: 24 additions & 6 deletions README.md
@@ -6,14 +6,23 @@ Merge two checkpoint models by dare ties (https://github.com/yule-BUAA/MergeLM).
## U-Net
|category|node name|input type|output type|desc.|
| --- | --- | --- | --- | --- |
|unet|Model Merger (Masked)|`MODEL`, `MODEL`, `MODEL_MASK`|`MODEL`|Performs a masked block merge|
|unet|Model Merger (DARE)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE block merge|
|unet|MBW Merger (DARE)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE block merge, with full layer control (like MBW)|
|unet|Model Merger (Advanced)|`MODEL`, `MODEL`, `LAYER_GRADIENT`, `MODEL_MASK (optional)`|`MODEL`|Performs a model merge, with gradient configuration for layer weights|
|unet|Model Merger (Advanced/DARE)|`MODEL`, `MODEL`, `LAYER_GRADIENT`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE-TIES merge (using layers for targeted control)|
| --- | --- | --- | --- | --- |
|unet|Model Merger (Block)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a block merge|
|unet|Model Merger (Block/DARE)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE block merge|
|unet|Model Merger (MBW/DARE)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE block merge (using MBW)|
|unet|Model Merger (Attention/DARE)|`MODEL`, `MODEL`, `MODEL_MASK (optional)`|`MODEL`|Performs a DARE block merge (targeting attention)|

## CLIP
## Layer Gradient
|category|node name|input type|output type|desc.|
| --- | --- | --- | --- | --- |
|clip|CLIP Merger (DARE)|`CLIP`, `CLIP`|`CLIP`|Performs a DARE merge on two CLIP models|
|grad|Gradient Operations|`LAYER_GRADIENT`, `LAYER_GRADIENT`|`LAYER_GRADIENT`|Performs operations on layer gradients|
|grad|Gradient Edit|`LAYER_GRADIENT`|`LAYER_GRADIENT`|Directly target layers for editing with wildcards|
|grad|Block Gradient|`MODEL`|`LAYER_GRADIENT`|Returns the block gradient for a model|
|grad|Attention Gradient|`MODEL`|`LAYER_GRADIENT`|Returns the attention gradient for a model|
|grad|Shell Gradient|`MODEL`|`LAYER_GRADIENT`|Returns the balanced layers (onion) gradient for a model|
|grad|MBW Gradient|`MODEL`|`LAYER_GRADIENT`|Returns the MBW-style gradient for a model|

## Masking
|category|node name|input type|output type|desc.|
@@ -24,6 +33,11 @@ Merge two checkpoint models by dare ties (https://github.com/yule-BUAA/MergeLM).
|mask|Mask Operations|`MODEL_MASK`, `MODEL_MASK`|`MODEL_MASK`|Allows set operations to be performed on masks|
|mask|Mask Edit|`MODEL_MASK`|`MODEL_MASK`|Allows the direct editing of mask layers|

## CLIP
|category|node name|input type|output type|desc.|
| --- | --- | --- | --- | --- |
|clip|CLIP Merger (DARE)|`CLIP`, `CLIP`|`CLIP`|Performs a DARE merge on two CLIP models|

## LoRA
|category|node name|input type|output type|desc.|
| --- | --- | --- | --- | --- |
@@ -37,9 +51,10 @@ Merge two checkpoint models by dare ties (https://github.com/yule-BUAA/MergeLM).
## Reporting
|category|node name|input type|output type|desc.|
| --- | --- | --- | --- | --- |
|report|Mask Reporting|`MODEL_MASK`|`STRING`|Returns basic layer statistics for the mask|
|report|Mask Reporting|`MODEL_MASK`|`STRING`, `IMAGE`|Returns basic layer statistics for the mask|
|report|Model Reporting|`MODEL`|`STRING`, `IMAGE`|Returns a plot of a model layer|
|report|LoRA Reporting||`STRING`, `IMAGE`|Returns stats and information about a LoRA|
|report|Gradient Reporting|`LAYER_GRADIENT`|`STRING`, `IMAGE`|Returns a report on the layer gradient|


### Merging
@@ -55,6 +70,9 @@ DARE-TIES does a stochastic selection of the parameters to keep, and then only p

*Note that this merge method uses random sampling, so do not assume your first random seed is the best one for your merge; if the seed is not fixed, the merge will change on every run.*

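As a rough sketch of the drop-and-rescale idea behind DARE (not this node's actual implementation; the helper name, `drop_p`, and the seeding behaviour are assumptions for illustration):

```python
from typing import Optional

import torch

def dare_delta(base: torch.Tensor, other: torch.Tensor, drop_p: float = 0.9,
               seed: Optional[int] = None) -> torch.Tensor:
    """Illustrative DARE-style drop-and-rescale on a single parameter tensor."""
    gen = torch.Generator(device=base.device)
    if seed is not None:
        gen.manual_seed(seed)  # fixing the seed makes the sampling reproducible
    delta = other - base
    # Keep each element of the delta with probability (1 - drop_p), drop the rest.
    keep = torch.rand(delta.shape, generator=gen, device=base.device) >= drop_p
    # Rescale the survivors so the expected contribution of the delta is preserved.
    sparse_delta = torch.where(keep, delta / (1.0 - drop_p), torch.zeros_like(delta))
    return base + sparse_delta
```

Because the kept elements are drawn at random, different seeds generally produce different merges, which is why pinning the seed matters.
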
### Layer Gradients
Layer gradients are a general term for the merge ratios used in a model merge. They define the ratio at each layer of the model, turning ratio selection into a fine-grained operation that other operations can act on. I have left some basic components that do not use LAYER_GRADIENT, but there is nothing special about them; internally they just use the advanced components.

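For intuition, a LAYER_GRADIENT can be pictured as a plain mapping from model parameter keys to per-layer ratios; the keys and the small helper below are illustrative assumptions, not the exact data structure used by the nodes:

```python
from typing import Dict

import torch

# Conceptually a layer gradient is just {parameter_key: ratio}; the gradient nodes
# build one from an actual model's state dict. The keys shown here are illustrative.
layer_gradient: Dict[str, float] = {
    "diffusion_model.input_blocks.0.0.weight": 0.25,
    "diffusion_model.middle_block.1.proj_in.weight": 0.50,
    "diffusion_model.output_blocks.11.0.out_layers.3.weight": 0.75,
}

def lerp_layer(a: torch.Tensor, b: torch.Tensor, ratio: float) -> torch.Tensor:
    # The merge nodes describe the ratio as the share of model_a to keep.
    return a * ratio + b * (1.0 - ratio)
```

Because the ratios live in an ordinary per-layer structure, the gradient nodes can combine, scale, or edit them before any merge runs.
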
### Masks
* A mask creates a signature of a model to filter parts of a model merge. Areas that are selected in the mask will be included in the merge, while areas that are not selected will be excluded.
* Using the mask, you can target either the stronger parameters with the 'above' option or the weaker parameters with the 'below' option. The threshold determines the quantile of parameters to target (see the sketch below).
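
A minimal sketch of the quantile idea (assuming a boolean mask per layer; this is not the Magnitude Masker's actual code):

```python
import torch

def magnitude_mask(a: torch.Tensor, b: torch.Tensor, threshold: float = 0.5,
                   select: str = "above") -> torch.Tensor:
    """Sketch: select the stronger ('above') or weaker ('below') parameters by the
    magnitude of their difference; `threshold` is read as a quantile in [0, 1]."""
    delta = (b - a).abs().float()
    # torch.quantile wants a flat view; very large layers may need a sampled quantile instead.
    cutoff = torch.quantile(delta.flatten(), threshold)
    return delta >= cutoff if select == "above" else delta < cutoff
```
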
39 changes: 28 additions & 11 deletions __init__.py
@@ -1,18 +1,20 @@
from .components.clip import DareClipMerger
from .components.dare import DareUnetMerger
from .components.dare import DareUnetMerger, DareUnetMergerGradient
from .components.dare_mbw import DareUnetMergerMBW
from .components.dare_element import DareUnetMergerElement
from .components.block import BlockUnetMerger
from .components.gradients import BlockLayerGradient, ShellLayerGradient, AttentionLayerGradient, MBWLayerGradient, LayerGradientOperations, LayerGradientEdit
from .components.block import BlockUnetMerger, GradientUnetMerger
from .components.normalize import NormalizeUnet
from .components.mask_model import MagnitudeMasker, MaskOperations, MaskEdit, SimpleMasker, QuadMasker
from .components.reports import MaskReporting, ModelReporting, LoRAReporting
from .components.reports import MaskReporting, ModelReporting, LoRAReporting, LayerGradientReporting
from .components.lora import LoraLoaderTags



NODE_CLASS_MAPPINGS = {
"DM_MaskedModelMerger": BlockUnetMerger,
"DM_DareModelMerger": DareUnetMerger,
"DM_AdvancedModelMerger": GradientUnetMerger,
"DM_AdvancedDareModelMerger": DareUnetMergerGradient,
"DM_BlockModelMerger": BlockUnetMerger,
"DM_DareModelMergerBlock": DareUnetMerger,
"DM_DareModelMergerMBW": DareUnetMergerMBW,
"DM_DareModelMergerElement": DareUnetMergerElement,
"DM_DareClipMerger": DareClipMerger,
@@ -21,28 +23,43 @@
"DM_QuadMasker": QuadMasker,
"DM_MaskOperations": MaskOperations,
"DM_MaskEdit": MaskEdit,
"DM_GradientOperations": LayerGradientOperations,
"DM_GradientEdit": LayerGradientEdit,
"DM_BlockGradient": BlockLayerGradient,
"DM_ShellGradient": ShellLayerGradient,
"DM_AttentionGradient": AttentionLayerGradient,
"DM_MBWGradient": MBWLayerGradient,
"DM_ModelReporting": ModelReporting,
"DM_MaskReporting": MaskReporting,
"DM_LoRAReporting": LoRAReporting,
"DM_GradientReporting": LayerGradientReporting,
"DM_NormalizeModel": NormalizeUnet,
"DM_LoRALoaderTags": LoraLoaderTags,

}

NODE_DISPLAY_NAME_MAPPINGS = {
"DM_MaskedModelMerger": "Model Merger (Masked)",
"DM_DareModelMerger": "Model Merger (DARE)",
"DM_DareModelMergerMBW": "MBW Merger (DARE)",
"DM_DareModelMergerElement": "Element Merger (DARE)",
"DM_AdvancedModelMerger": "Model Merger (Advanced)",
"DM_AdvancedDareModelMerger": "Model Merger (Advanced/DARE)",
"DM_BlockModelMerger": "Model Merger (Block)",
"DM_DareModelMergerBlock": "Model Merger (Block/DARE)",
"DM_DareModelMergerMBW": "Model Merger (MBW/DARE)",
"DM_DareModelMergerElement": "Model Merger (Attention/DARE)",
"DM_DareClipMerger": "CLIP Merger (DARE)",
"DM_SimpleMasker": "Simple Masker",
"DM_MagnitudeMasker": "Magnitude Masker",
"DM_QuadMasker": "Quad Masker",
"DM_MaskOperations": "Mask Operations",
"DM_MaskEdit": "Mask Edit",
"DM_GradientOperations": "Gradient Operations",
"DM_GradientEdit": "Gradient Edit",
"DM_BlockGradient": "Block Gradient",
"DM_ShellGradient": "Shell Gradient",
"DM_AttentionGradient": "Attention Gradient",
"DM_MBWGradient": "MBW Gradient",
"DM_ModelReporting": "Model Reporting",
"DM_MaskReporting": "Mask Reporting",
"DM_LoRAReporting": "LoRA Reporting",
"DM_GradientReporting": "Gradient Reporting",
"DM_NormalizeModel": "Normalize Model",
"DM_LoRALoaderTags": "LoRA Loader (Tags)",
}
108 changes: 68 additions & 40 deletions components/block.py
@@ -3,15 +3,17 @@
import torch
from typing import Dict, Tuple, Optional

from ..ddare.const import UNET_CATEGORY
from ..ddare.const import UNET_CATEGORY, LAYER_GRADIENT, MODEL_MASK
from ..ddare.mask import ModelMask
from ..ddare.merge import merge_tensors, METHODS
from ..ddare.util import cuda_memory_profiler, get_device, get_patched_state
from ..ddare.util import cuda_memory_profiler, get_device, get_patched_state, merge_input_types

from .gradients import BlockLayerGradient

class BlockUnetMerger:

class GradientUnetMerger:
"""
A class to merge two diffusion U-Net models using a mask.
A class to merge two diffusion U-Net models using a mask and a gradient.
"""
@classmethod
def INPUT_TYPES(cls) -> Dict[str, tuple]:
@@ -25,38 +27,28 @@ def INPUT_TYPES(cls) -> Dict[str, tuple]:
"required": {
"model_a": ("MODEL",),
"model_b": ("MODEL",),
"model_mask": ("MODEL_MASK",),
"time": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"label": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"input": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"middle": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"output": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"gradient": (LAYER_GRADIENT,),
"method": (["comfy",] + METHODS, {"default": "comfy"} ),
},
"optional": {
"model_mask": (MODEL_MASK,),
}
}

RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = UNET_CATEGORY

def merge(self, model_a: ModelPatcher, model_b: ModelPatcher,
input : float, middle : float, output : float, out : float, time : float, method : str,
clear_cache : bool = True, model_mask: Optional[ModelMask] = None,
def merge(self, model_a: ModelPatcher, model_b: ModelPatcher, gradient: Dict[str, float],
method : str, clear_cache : bool = True, model_mask: Optional[ModelMask] = None,
**kwargs) -> Tuple[ModelPatcher]:
"""
Merges two ModelPatcher instances based on the weighted consensus of their parameters and sparsity.
Args:
model_a (ModelPatcher): The base model to be merged.
model_b (ModelPatcher): The model to merge into the base model.
input (float): The ratio (lambda) of the input layer to keep from model_a.
middle (float): The ratio (lambda) of the middle layers to keep from model_a.
output (float): The ratio (lambda) of the output layer to keep from model_a.
out (float): The ratio (lambda) of the output layer to keep from model_a.
time (float): The ratio (lambda) of the time layers to keep from model_a.
gradient (Dict[str, float]): A mapping from layer keys to merge ratios (lambda) for model_a.
method (str): The method to use for merging, either "comfy", "lerp", "slerp", or "gradient".
clear_cache (bool): Whether to clear the CUDA cache after each chunk. Default is True.
model_mask (ModelMask): A ModelMask instance to use for masking the model. Default is None.
@@ -83,33 +75,23 @@ def merge(self, model_a: ModelPatcher, model_b: ModelPatcher,
print("could not patch. key doesn't exist in model:", k)
continue

k_unet = k[len("diffusion_model."):]

# Get our ratio for this layer
if k_unet.startswith("input"):
ratio = input
elif k_unet.startswith("middle"):
ratio = middle
elif k_unet.startswith("output"):
ratio = output
elif k_unet.startswith("out"):
ratio = out
elif k_unet.startswith("time"):
ratio = time
else:
print(f"Unknown key: {k}, skipping.")
ratio = gradient.get(k, None)
if ratio is None:
print("no gradient for key:", k)
continue

# Apply sparsification by the delta for this layer
# Get our model mask and our two tensors
mask : torch.Tensor = model_mask.get_layer_mask(k) if model_mask is not None else None
a : torch.Tensor = model_a_sd[k]
b : torch.Tensor = model_b_sd[k]
if mask is None:
mask = torch.ones_like(a)

result_tensor = torch.where(mask.to(device), a.to(device), b.to(device))
mask = torch.ones_like(a, dtype=torch.bool)

# Apply our mask
result_tensor = torch.where(mask.to(device), b.to(device), a.to(device))
del mask

# Merge our tensors
if method == "comfy":
strength_patch = 1.0 - ratio
strength_model = ratio
@@ -124,4 +106,50 @@
if clear_cache and torch.cuda.is_available():
torch.cuda.empty_cache()

return (m,)
return (m,)


class BlockUnetMerger:
"""
A class to merge two diffusion U-Net models using a mask.
"""
@classmethod
def INPUT_TYPES(cls) -> Dict[str, tuple]:
"""
Defines the input types for the merging process.
Returns:
Dict[str, tuple]: A dictionary specifying the required model types and parameters.
"""
merged = merge_input_types(GradientUnetMerger.INPUT_TYPES(), BlockLayerGradient.INPUT_TYPES())
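# Drop inputs that this node supplies internally: the gradient is built from the block ratios, and 'model' is superseded by model_a/model_b.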
del merged["required"]["gradient"]
del merged["required"]["model"]
return merged

RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = UNET_CATEGORY

def merge(self, model_a: ModelPatcher, **kwargs) -> Tuple[ModelPatcher]:
"""
Merges two ModelPatcher instances based on the weighted consensus of their parameters and sparsity.
Args:
model_a (ModelPatcher): The base model to be merged.
model_b (ModelPatcher): The model to merge into the base model.
input (float): The ratio (lambda) of the input layer to keep from model_a.
middle (float): The ratio (lambda) of the middle layers to keep from model_a.
output (float): The ratio (lambda) of the output layer to keep from model_a.
out (float): The ratio (lambda) of the final 'out' layers to keep from model_a.
time (float): The ratio (lambda) of the time layers to keep from model_a.
method (str): The method to use for merging, either "comfy", "lerp", "slerp", or "gradient".
clear_cache (bool): Whether to clear the CUDA cache after each chunk. Default is True.
model_mask (ModelMask): A ModelMask instance to use for masking the model. Default is None.
**kwargs: Additional arguments specifying the merge ratios for different layers and sparsity.
Returns:
Tuple[ModelPatcher]: A tuple containing the merged ModelPatcher instance.
"""

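# Build a block-style layer gradient from the block ratios in kwargs, then delegate to the gradient merger.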
gradient = BlockLayerGradient().gradient(model=model_a, **kwargs)[0]
return GradientUnetMerger().merge(model_a=model_a, gradient=gradient, **kwargs)