Merge pull request #1270 from bghira/main

merge

bghira authored Jan 12, 2025
2 parents 1dd4393 + 29e6b9d commit aa59b9a
Showing 24 changed files with 571 additions and 250 deletions.
149 changes: 97 additions & 52 deletions OPTIONS.md

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions README.md
@@ -175,7 +175,3 @@ Enable debug logs for a more detailed insight by adding `export SIMPLETUNER_LOG_
For performance analysis of the training loop, setting `SIMPLETUNER_TRAINING_LOOP_LOG_LEVEL=DEBUG` will have timestamps that highlight any issues in your configuration.

For a comprehensive list of options available, consult [this documentation](/OPTIONS.md).

## Discord

For more help or to discuss training with like-minded folks, join [our Discord server](https://discord.gg/cSmvcU9Me9)
158 changes: 95 additions & 63 deletions configure.py
@@ -752,69 +752,46 @@ def configure_env():
sys.exit(1)

# dataloader configuration
resolution_configs = {
256: {"resolution": 256, "minimum_image_size": 128},
512: {"resolution": 512, "minimum_image_size": 256},
768: {"resolution": 768, "minimum_image_size": 512},
1024: {"resolution": 1024, "minimum_image_size": 768},
1440: {"resolution": 1440, "minimum_image_size": 1024},
2048: {"resolution": 2048, "minimum_image_size": 1440},
}
default_dataset_configuration = {
"id": "PLACEHOLDER",
"type": "local",
"instance_data_dir": None,
"crop": False,
"resolution_type": "pixel_area",
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae",
}
default_cropped_dataset_configuration = {
"id": "PLACEHOLDER-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_aspect": "square",
"crop_style": "center",
"vae_cache_clear_each_epoch": False,
"resolution_type": "pixel_area",
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-crop",
}

default_local_configuration = [
{
"id": "PLACEHOLDER-512",
"type": "local",
"instance_data_dir": None,
"crop": False,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 512,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-512",
},
{
"id": "PLACEHOLDER-1024",
"type": "local",
"instance_data_dir": None,
"crop": False,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 1024,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-1024",
},
{
"id": "PLACEHOLDER-512-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 512,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-512-crop",
},
{
"id": "PLACEHOLDER-1024-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 1024,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-1024-crop",
},
{
"id": "text-embed-cache",
"dataset_type": "text_embeds",
"default": True,
"type": "local",
"cache_dir": "text",
"write_batch_size": 128,
},
]

@@ -894,9 +871,36 @@ def configure_env():
)
dataset_repeats = int(
prompt_user(
"How many times do you want to repeat each image in the dataset?", 10
"How many times do you want to repeat each image in the dataset? A value of zero means the dataset will only be seen once; a value of one will cause the dataset to be sampled twice.",
10,
)
)
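    # e.g. the default of 10 repeats means each image is drawn 11 times per epoch.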
default_base_resolutions = "1024"
multi_resolution_recommendation_text = (
"Multiple resolutions may be provided, but this is only recommended for Flux."
)
multi_resolution_capable_models = ["flux"]
if env_contents["--model_family"] in multi_resolution_capable_models:
default_base_resolutions = "256,512,768,1024,1440"
multi_resolution_recommendation_text = "A comma-separated list of values or a single item can be given to train on multiple base resolutions."
dataset_resolutions = prompt_user(
f"Which resolutions do you want to train? {multi_resolution_recommendation_text}",
default_base_resolutions,
)
if "," in dataset_resolutions:
# most models don't work with multi base resolution training.
if env_contents["--model_family"] not in multi_resolution_capable_models:
print(
"WARNING: Most models do not play well with multi-resolution training, resulting in degraded outputs and broken hearts. Proceed with caution."
)
dataset_resolutions = [int(res) for res in dataset_resolutions.split(",")]
else:
try:
dataset_resolutions = [int(dataset_resolutions)]
        except ValueError:
print("Invalid resolution value. Using 1024 instead.")
dataset_resolutions = [1024]
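    # e.g. an input of "512,1024" becomes [512, 1024]; a single "768" becomes [768].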

dataset_cache_prefix = prompt_user(
"Where will your VAE and text encoder caches be written to? Subdirectories will be created inside for you automatically.",
"cache/",
@@ -910,13 +914,21 @@
)

# Now we'll modify the default json and if has_very_large_images is true, we will add two keys to each image dataset, 'maximum_image_size' and 'target_downsample_size' equal to the dataset's resolution value
for dataset in default_local_configuration:
if dataset.get("dataset_type") == "text_embeds":
dataset["cache_dir"] = f"{dataset_cache_prefix}/{dataset['cache_dir']}"
continue
dataset["instance_data_dir"] = dataset_path
def create_dataset_config(resolution, default_config):
dataset = default_config.copy()
dataset.update(resolution_configs[resolution])
dataset["id"] = f"{dataset['id']}-{resolution}"
dataset["instance_data_dir"] = os.path.abspath(dataset_path)
dataset["repeats"] = dataset_repeats
dataset["cache_dir_vae"] = f"{dataset_cache_prefix}/{dataset['cache_dir_vae']}"
# we want the absolute path, as this works best with datasets containing nested subdirectories.
dataset["cache_dir_vae"] = os.path.abspath(
os.path.join(
dataset_cache_prefix,
env_contents["--model_family"],
dataset["cache_dir_vae"],
str(resolution),
)
)
if has_very_large_images:
dataset["maximum_image_size"] = dataset["resolution"]
dataset["target_downsample_size"] = dataset["resolution"]
Expand All @@ -925,6 +937,26 @@ def configure_env():
dataset["instance_prompt"] = dataset_instance_prompt
dataset["caption_strategy"] = dataset_caption_strategy

if has_very_large_images:
dataset["maximum_image_size"] = dataset["resolution"]
dataset["target_downsample_size"] = dataset["resolution"]
return dataset

# this is because the text embed dataset is in the default config list at the top.
# it's confusingly written because i'm lazy, but you could do this any number of ways.
default_local_configuration[0]["cache_dir"] = os.path.abspath(
os.path.join(dataset_cache_prefix, env_contents["--model_family"], "text")
)
for resolution in dataset_resolutions:
uncropped_dataset = create_dataset_config(
resolution, default_dataset_configuration
)
default_local_configuration.append(uncropped_dataset)
cropped_dataset = create_dataset_config(
resolution, default_cropped_dataset_configuration
)
default_local_configuration.append(cropped_dataset)
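    # For illustration (hypothetical answers: dataset path "datasets/my-data",
    # 10 repeats, cache prefix "cache/", --model_family "flux"), the uncropped
    # 1024px entry appended by create_dataset_config above would look roughly like:
    #   {
    #       "id": "PLACEHOLDER-1024",
    #       "type": "local",
    #       "instance_data_dir": "/home/user/simpletuner/datasets/my-data",
    #       "crop": False,
    #       "resolution": 1024,
    #       "minimum_image_size": 768,
    #       "resolution_type": "pixel_area",
    #       "repeats": 10,
    #       "metadata_backend": "discovery",
    #       "caption_strategy": "filename",
    #       "cache_dir_vae": "/home/user/simpletuner/cache/flux/vae/1024",
    #   }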

print("Dataloader configuration:")
print(default_local_configuration)
confirm = prompt_user("Does this look correct? (y/n)", "y").lower() == "y"
6 changes: 3 additions & 3 deletions documentation/quickstart/FLUX.md
@@ -206,15 +206,15 @@ Flow-matching models such as Flux and SD3 have a property called "shift" that al
By default, no schedule shift is applied to flux, which results in a sigmoid bell-shape to the timestep sampling distribution. This is unlikely to be the ideal approach for Flux, but it results in a greater amount of learning in a shorter period of time than auto-shift.

##### Auto-shift
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flux_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flow_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.

##### Manual specification
_Thanks to General Awareness from Discord for the following examples_

When using a `--flux_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
When using a `--flow_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
![image](https://github.com/user-attachments/assets/991ca0ad-e25a-4b13-a3d6-b4f2de1fe982)

When using a `--flux_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially colour space of the model becomes impacted:
When using a `--flow_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially the colour space of the model become impacted:
![image](https://github.com/user-attachments/assets/857a1f8a-07ab-4b75-8e6a-eecff616a28d)
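
For intuition, the shift value can be viewed as a simple remapping of the sampled sigmas. A minimal sketch using the shift formula common to flow-matching schedulers (an illustration only; SimpleTuner's internal implementation may differ):

```python
import numpy as np

def shift_sigmas(sigmas: np.ndarray, shift: float) -> np.ndarray:
    # Common flow-matching remapping: shift > 1 pushes sampling toward the
    # noisier end of the schedule (composition, colour), while shift < 1
    # concentrates it on the low-noise end (fine details).
    return shift * sigmas / (1 + (shift - 1) * sigmas)

sigmas = np.linspace(0.05, 0.95, 5)
print(shift_sigmas(sigmas, 0.1))  # skews toward fine-detail timesteps
print(shift_sigmas(sigmas, 4.0))  # skews toward compositional timesteps
```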


6 changes: 3 additions & 3 deletions documentation/quickstart/SANA.md
@@ -181,15 +181,15 @@ If you wish to enable evaluations to score the model's performance, see [this do
Flow-matching models such as Sana and SD3 have a property called "shift" that allows us to shift the trained portion of the timestep schedule using a simple decimal value.

##### Auto-shift
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flux_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flow_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.

##### Manual specification
_Thanks to General Awareness from Discord for the following examples_

When using a `--flux_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
When using a `--flow_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
![image](https://github.com/user-attachments/assets/991ca0ad-e25a-4b13-a3d6-b4f2de1fe982)

When using a `--flux_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially colour space of the model becomes impacted:
When using a `--flow_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially the colour space of the model become impacted:
![image](https://github.com/user-attachments/assets/857a1f8a-07ab-4b75-8e6a-eecff616a28d)

#### Dataset considerations
24 changes: 12 additions & 12 deletions documentation/quickstart/SD3.md
@@ -195,12 +195,12 @@ In your `/home/user/simpletuner/config` directory, create a multidatabackend.jso
"crop": true,
"crop_aspect": "square",
"crop_style": "center",
"resolution": 1.0,
"resolution": 1024,
"minimum_image_size": 0,
"maximum_image_size": 1.0,
"target_downsample_size": 1.0,
"resolution_type": "area",
"cache_dir_vae": "cache/vae/sd3/pseudo-camera-10k",
"maximum_image_size": 1024,
"target_downsample_size": 1024,
"resolution_type": "pixel_area",
"cache_dir_vae": "/home/user/simpletuner/output/cache/vae/sd3/pseudo-camera-10k",
"instance_data_dir": "/home/user/simpletuner/datasets/pseudo-camera-10k",
"disabled": false,
"skip_file_discovery": "",
@@ -277,8 +277,8 @@ The following values are recommended for `config.json`:
"--validation_guidance_skip_layers_stop": 0.2,
"--validation_guidance_skip_scale": 2.8,
"--validation_guidance": 4.0,
"--flux_use_uniform_schedule": true,
"--flux_schedule_auto_shift": true
"--flow_use_uniform_schedule": true,
"--flow_schedule_auto_shift": true
}
```

@@ -309,17 +309,17 @@ Some changes were made to SimpleTuner's SD3.5 support:
- No longer zeroing T5 padding space by default (`--t5_padding`)
- Offering a switch (`--sd3_clip_uncond_behaviour` and `--sd3_t5_uncond_behaviour`) to use empty encoded blank captions for unconditional predictions (`empty_string`, **default**) or zeros (`zero`); tweaking this setting is not recommended.
- SD3.5 training loss function was updated to match that found in the upstream StabilityAI/SD3.5 repository
- Updated default `--flux_schedule_shift` value to 3 to match the static 1024px value for SD3
- StabilityAI followed-up with documentation to use `--flux_schedule_shift=1` with `--flux_use_uniform_schedule`
- Community members have reported that `--flux_schedule_auto_shift` works better when using mult-aspect or multi-resolution training
- Updated the hard-coded tokeniser sequence length limit to **256** with the option to revert it to **77** tokens to save disk space or compute at the cost of output quality degradation
- Updated default `--flow_schedule_shift` value to 3 to match the static 1024px value for SD3
- StabilityAI followed-up with documentation to use `--flow_schedule_shift=1` with `--flow_use_uniform_schedule`
  - Community members have reported that `--flow_schedule_auto_shift` works better when using multi-aspect or multi-resolution training
- Updated the hard-coded tokeniser sequence length limit to **154** with the option to revert it to **77** tokens to save disk space or compute at the cost of output quality degradation


#### Stable configuration values

These options have been known to keep SD3.5 intact for as long as possible:
- optimizer=adamw_bf16
- flux_schedule_shift=1
- flow_schedule_shift=1
- learning_rate=1e-4
- batch_size=4 * 3 GPUs
- max_grad_norm=0.1