Merge pull request #1270 from bghira/main

merge

bghira authored Jan 12, 2025
2 parents 1dd4393 + 29e6b9d commit aa59b9a
Showing 24 changed files with 571 additions and 250 deletions.
149 changes: 97 additions & 52 deletions OPTIONS.md

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions README.md
@@ -175,7 +175,3 @@ Enable debug logs for a more detailed insight by adding `export SIMPLETUNER_LOG_
For performance analysis of the training loop, setting `SIMPLETUNER_TRAINING_LOOP_LOG_LEVEL=DEBUG` will have timestamps that highlight any issues in your configuration.

For a comprehensive list of options available, consult [this documentation](/OPTIONS.md).

## Discord

For more help or to discuss training with like-minded folks, join [our Discord server](https://discord.gg/cSmvcU9Me9)
158 changes: 95 additions & 63 deletions configure.py
@@ -752,69 +752,46 @@ def configure_env():
sys.exit(1)

# dataloader configuration
resolution_configs = {
256: {"resolution": 256, "minimum_image_size": 128},
512: {"resolution": 512, "minimum_image_size": 256},
768: {"resolution": 768, "minimum_image_size": 512},
1024: {"resolution": 1024, "minimum_image_size": 768},
1440: {"resolution": 1440, "minimum_image_size": 1024},
2048: {"resolution": 2048, "minimum_image_size": 1440},
}
default_dataset_configuration = {
"id": "PLACEHOLDER",
"type": "local",
"instance_data_dir": None,
"crop": False,
"resolution_type": "pixel_area",
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae",
}
default_cropped_dataset_configuration = {
"id": "PLACEHOLDER-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_aspect": "square",
"crop_style": "center",
"vae_cache_clear_each_epoch": False,
"resolution_type": "pixel_area",
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-crop",
}

default_local_configuration = [
{
"id": "PLACEHOLDER-512",
"type": "local",
"instance_data_dir": None,
"crop": False,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 512,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-512",
},
{
"id": "PLACEHOLDER-1024",
"type": "local",
"instance_data_dir": None,
"crop": False,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 1024,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-1024",
},
{
"id": "PLACEHOLDER-512-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 512,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-512-crop",
},
{
"id": "PLACEHOLDER-1024-crop",
"type": "local",
"instance_data_dir": None,
"crop": True,
"crop_style": "random",
"minimum_image_size": 128,
"resolution": 1024,
"resolution_type": "pixel_area",
"repeats": 10,
"metadata_backend": "discovery",
"caption_strategy": "filename",
"cache_dir_vae": "vae-1024-crop",
},
{
"id": "text-embed-cache",
"dataset_type": "text_embeds",
"default": True,
"type": "local",
"cache_dir": "text",
"write_batch_size": 128,
},
]

@@ -894,9 +871,36 @@ def configure_env():
)
dataset_repeats = int(
prompt_user(
"How many times do you want to repeat each image in the dataset?", 10
"How many times do you want to repeat each image in the dataset? A value of zero means the dataset will only be seen once; a value of one will cause the dataset to be sampled twice.",
10,
)
)
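    # e.g. the default of 10 repeats means each image is drawn 11 times per epoch.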
default_base_resolutions = "1024"
multi_resolution_recommendation_text = (
"Multiple resolutions may be provided, but this is only recommended for Flux."
)
multi_resolution_capable_models = ["flux"]
if env_contents["--model_family"] in multi_resolution_capable_models:
default_base_resolutions = "256,512,768,1024,1440"
multi_resolution_recommendation_text = "A comma-separated list of values or a single item can be given to train on multiple base resolutions."
dataset_resolutions = prompt_user(
f"Which resolutions do you want to train? {multi_resolution_recommendation_text}",
default_base_resolutions,
)
if "," in dataset_resolutions:
# most models don't work with multi base resolution training.
if env_contents["--model_family"] not in multi_resolution_capable_models:
print(
"WARNING: Most models do not play well with multi-resolution training, resulting in degraded outputs and broken hearts. Proceed with caution."
)
dataset_resolutions = [int(res) for res in dataset_resolutions.split(",")]
else:
try:
dataset_resolutions = [int(dataset_resolutions)]
        except ValueError:
print("Invalid resolution value. Using 1024 instead.")
dataset_resolutions = [1024]
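    # e.g. an input of "512,1024" becomes [512, 1024]; a single "768" becomes [768].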

dataset_cache_prefix = prompt_user(
"Where will your VAE and text encoder caches be written to? Subdirectories will be created inside for you automatically.",
"cache/",
@@ -910,13 +914,21 @@
)

# Now we'll modify the default json and if has_very_large_images is true, we will add two keys to each image dataset, 'maximum_image_size' and 'target_downsample_size' equal to the dataset's resolution value
for dataset in default_local_configuration:
if dataset.get("dataset_type") == "text_embeds":
dataset["cache_dir"] = f"{dataset_cache_prefix}/{dataset['cache_dir']}"
continue
dataset["instance_data_dir"] = dataset_path
def create_dataset_config(resolution, default_config):
dataset = default_config.copy()
dataset.update(resolution_configs[resolution])
dataset["id"] = f"{dataset['id']}-{resolution}"
dataset["instance_data_dir"] = os.path.abspath(dataset_path)
dataset["repeats"] = dataset_repeats
dataset["cache_dir_vae"] = f"{dataset_cache_prefix}/{dataset['cache_dir_vae']}"
# we want the absolute path, as this works best with datasets containing nested subdirectories.
dataset["cache_dir_vae"] = os.path.abspath(
os.path.join(
dataset_cache_prefix,
env_contents["--model_family"],
dataset["cache_dir_vae"],
str(resolution),
)
)
if has_very_large_images:
dataset["maximum_image_size"] = dataset["resolution"]
dataset["target_downsample_size"] = dataset["resolution"]
Expand All @@ -925,6 +937,26 @@ def configure_env():
dataset["instance_prompt"] = dataset_instance_prompt
dataset["caption_strategy"] = dataset_caption_strategy

if has_very_large_images:
dataset["maximum_image_size"] = dataset["resolution"]
dataset["target_downsample_size"] = dataset["resolution"]
return dataset

# this is because the text embed dataset is in the default config list at the top.
# it's confusingly written because i'm lazy, but you could do this any number of ways.
default_local_configuration[0]["cache_dir"] = os.path.abspath(
os.path.join(dataset_cache_prefix, env_contents["--model_family"], "text")
)
for resolution in dataset_resolutions:
uncropped_dataset = create_dataset_config(
resolution, default_dataset_configuration
)
default_local_configuration.append(uncropped_dataset)
cropped_dataset = create_dataset_config(
resolution, default_cropped_dataset_configuration
)
default_local_configuration.append(cropped_dataset)
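    # For illustration (hypothetical answers: dataset path "datasets/my-data",
    # 10 repeats, cache prefix "cache/", --model_family "flux"), the uncropped
    # 1024px entry appended by create_dataset_config above would look roughly like:
    #   {
    #       "id": "PLACEHOLDER-1024",
    #       "type": "local",
    #       "instance_data_dir": "/home/user/simpletuner/datasets/my-data",
    #       "crop": False,
    #       "resolution": 1024,
    #       "minimum_image_size": 768,
    #       "resolution_type": "pixel_area",
    #       "repeats": 10,
    #       "metadata_backend": "discovery",
    #       "caption_strategy": "filename",
    #       "cache_dir_vae": "/home/user/simpletuner/cache/flux/vae/1024",
    #   }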

print("Dataloader configuration:")
print(default_local_configuration)
confirm = prompt_user("Does this look correct? (y/n)", "y").lower() == "y"
6 changes: 3 additions & 3 deletions documentation/quickstart/FLUX.md
@@ -206,15 +206,15 @@ Flow-matching models such as Flux and SD3 have a property called "shift" that al
By default, no schedule shift is applied to flux, which results in a sigmoid bell-shape to the timestep sampling distribution. This is unlikely to be the ideal approach for Flux, but it results in a greater amount of learning in a shorter period of time than auto-shift.

##### Auto-shift
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flux_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flow_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.

##### Manual specification
_Thanks to General Awareness from Discord for the following examples_

When using a `--flux_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
When using a `--flow_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
![image](https://github.com/user-attachments/assets/991ca0ad-e25a-4b13-a3d6-b4f2de1fe982)

When using a `--flux_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially colour space of the model becomes impacted:
When using a `--flow_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially the colour space of the model become impacted:
![image](https://github.com/user-attachments/assets/857a1f8a-07ab-4b75-8e6a-eecff616a28d)
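
For intuition, the shift value can be viewed as a simple remapping of the sampled sigmas. A minimal sketch using the shift formula common to flow-matching schedulers (an illustration only; SimpleTuner's internal implementation may differ):

```python
import numpy as np

def shift_sigmas(sigmas: np.ndarray, shift: float) -> np.ndarray:
    # Common flow-matching remapping: shift > 1 pushes sampling toward the
    # noisier end of the schedule (composition, colour), while shift < 1
    # concentrates it on the low-noise end (fine details).
    return shift * sigmas / (1 + (shift - 1) * sigmas)

sigmas = np.linspace(0.05, 0.95, 5)
print(shift_sigmas(sigmas, 0.1))  # skews toward fine-detail timesteps
print(shift_sigmas(sigmas, 4.0))  # skews toward compositional timesteps
```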


6 changes: 3 additions & 3 deletions documentation/quickstart/SANA.md
@@ -181,15 +181,15 @@ If you wish to enable evaluations to score the model's performance, see [this do
Flow-matching models such as Sana and SD3 have a property called "shift" that allows us to shift the trained portion of the timestep schedule using a simple decimal value.

##### Auto-shift
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flux_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.
A commonly-recommended approach is to follow several recent works and enable resolution-dependent timestep shift, `--flow_schedule_auto_shift` which uses higher shift values for larger images, and lower shift values for smaller images. This results in stable but potentially mediocre training results.

##### Manual specification
_Thanks to General Awareness from Discord for the following examples_

When using a `--flux_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
When using a `--flow_schedule_shift` value of 0.1 (a very low value), only the finer details of the image are affected:
![image](https://github.com/user-attachments/assets/991ca0ad-e25a-4b13-a3d6-b4f2de1fe982)

When using a `--flux_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially colour space of the model becomes impacted:
When using a `--flow_schedule_shift` value of 4.0 (a very high value), the large compositional features and potentially the colour space of the model become impacted:
![image](https://github.com/user-attachments/assets/857a1f8a-07ab-4b75-8e6a-eecff616a28d)

#### Dataset considerations
24 changes: 12 additions & 12 deletions documentation/quickstart/SD3.md
@@ -195,12 +195,12 @@ In your `/home/user/simpletuner/config` directory, create a multidatabackend.jso
"crop": true,
"crop_aspect": "square",
"crop_style": "center",
"resolution": 1.0,
"resolution": 1024,
"minimum_image_size": 0,
"maximum_image_size": 1.0,
"target_downsample_size": 1.0,
"resolution_type": "area",
"cache_dir_vae": "cache/vae/sd3/pseudo-camera-10k",
"maximum_image_size": 1024,
"target_downsample_size": 1024,
"resolution_type": "pixel_area",
"cache_dir_vae": "/home/user/simpletuner/output/cache/vae/sd3/pseudo-camera-10k",
"instance_data_dir": "/home/user/simpletuner/datasets/pseudo-camera-10k",
"disabled": false,
"skip_file_discovery": "",
@@ -277,8 +277,8 @@ The following values are recommended for `config.json`:
"--validation_guidance_skip_layers_stop": 0.2,
"--validation_guidance_skip_scale": 2.8,
"--validation_guidance": 4.0,
"--flux_use_uniform_schedule": true,
"--flux_schedule_auto_shift": true
"--flow_use_uniform_schedule": true,
"--flow_schedule_auto_shift": true
}
```

@@ -309,17 +309,17 @@ Some changes were made to SimpleTuner's SD3.5 support:
- No longer zeroing T5 padding space by default (`--t5_padding`)
- Offering a switch (`--sd3_clip_uncond_behaviour` and `--sd3_t5_uncond_behaviour`) to use empty encoded blank captions for unconditional predictions (`empty_string`, **default**) or zeros (`zero`); tweaking this setting is not recommended.
- SD3.5 training loss function was updated to match that found in the upstream StabilityAI/SD3.5 repository
- Updated default `--flux_schedule_shift` value to 3 to match the static 1024px value for SD3
- StabilityAI followed-up with documentation to use `--flux_schedule_shift=1` with `--flux_use_uniform_schedule`
- Community members have reported that `--flux_schedule_auto_shift` works better when using mult-aspect or multi-resolution training
- Updated the hard-coded tokeniser sequence length limit to **256** with the option to revert it to **77** tokens to save disk space or compute at the cost of output quality degradation
- Updated default `--flow_schedule_shift` value to 3 to match the static 1024px value for SD3
- StabilityAI followed-up with documentation to use `--flow_schedule_shift=1` with `--flow_use_uniform_schedule`
  - Community members have reported that `--flow_schedule_auto_shift` works better when using multi-aspect or multi-resolution training
- Updated the hard-coded tokeniser sequence length limit to **154** with the option to revert it to **77** tokens to save disk space or compute at the cost of output quality degradation


#### Stable configuration values

These options have been known to keep SD3.5 intact for as long as possible:
- optimizer=adamw_bf16
- flux_schedule_shift=1
- flow_schedule_shift=1
- learning_rate=1e-4
- batch_size=4 * 3 GPUs
- max_grad_norm=0.1