Skip to content

Commit 9751cfb

Browse files
committed
small updates
1 parent d53b43e commit 9751cfb

File tree

4 files changed

+22
-10
lines changed

4 files changed

+22
-10
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ audio_array = generate_audio(text_prompt)
7272

7373
### 🎤 Voice Presets and Voice/Audio Cloning
7474

75-
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from for each language. Specify following the pattern: `{lang_code}_speaker_{number}`.
75+
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from for each language. Specify following the pattern: `{lang_code}_speaker_{0-9}`.
7676

7777
```python
7878
text_prompt = """

bark/api.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ def text_to_semantic(
99
text: str,
1010
history_prompt: Optional[str] = None,
1111
temp: float = 0.7,
12+
silent: bool = False,
1213
):
1314
"""Generate semantic array from text.
1415
1516
Args:
1617
text: text to be turned into audio
1718
history_prompt: history choice for audio cloning
1819
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
20+
silent: disable progress bar
1921
2022
Returns:
2123
numpy semantic array to be fed into `semantic_to_waveform`
@@ -24,6 +26,7 @@ def text_to_semantic(
2426
text,
2527
history_prompt=history_prompt,
2628
temp=temp,
29+
silent=silent,
2730
)
2831
return x_semantic
2932

@@ -32,13 +35,15 @@ def semantic_to_waveform(
3235
semantic_tokens: np.ndarray,
3336
history_prompt: Optional[str] = None,
3437
temp: float = 0.7,
38+
silent: bool = False,
3539
):
3640
"""Generate audio array from semantic input.
3741
3842
Args:
3943
semantic_tokens: semantic token output from `text_to_semantic`
4044
history_prompt: history choice for audio cloning
4145
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
46+
silent: disable progress bar
4247
4348
Returns:
4449
numpy audio array at sample frequency 24khz
@@ -47,6 +52,7 @@ def semantic_to_waveform(
4752
semantic_tokens,
4853
history_prompt=history_prompt,
4954
temp=temp,
55+
silent=silent,
5056
)
5157
x_fine_gen = generate_fine(
5258
x_coarse_gen,
@@ -62,6 +68,7 @@ def generate_audio(
6268
history_prompt: Optional[str] = None,
6369
text_temp: float = 0.7,
6470
waveform_temp: float = 0.7,
71+
silent: bool = False,
6572
):
6673
"""Generate audio array from input text.
6774
@@ -70,10 +77,15 @@ def generate_audio(
7077
history_prompt: history choice for audio cloning
7178
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
7279
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
80+
silent: disable progress bar
7381
7482
Returns:
7583
numpy audio array at sample frequency 24khz
7684
"""
77-
x_semantic = text_to_semantic(text, history_prompt=history_prompt, temp=text_temp)
78-
audio_arr = semantic_to_waveform(x_semantic, history_prompt=history_prompt, temp=waveform_temp)
85+
x_semantic = text_to_semantic(
86+
text, history_prompt=history_prompt, temp=text_temp, silent=silent,
87+
)
88+
audio_arr = semantic_to_waveform(
89+
x_semantic, history_prompt=history_prompt, temp=waveform_temp, silent=silent,
90+
)
7991
return audio_arr

bark/generation.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,9 @@ def _parse_s3_filepath(s3_filepath):
137137
def _download(from_s3_path, to_local_path):
138138
os.makedirs(CACHE_DIR, exist_ok=True)
139139
response = requests.get(from_s3_path, stream=True)
140-
total_size_in_bytes = int(response.headers.get('content-length', 0))
141-
block_size = 1024 # 1 Kibibyte
142-
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
140+
total_size_in_bytes = int(response.headers.get("content-length", 0))
141+
block_size = 1024
142+
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
143143
with open(to_local_path, "wb") as file:
144144
for data in response.iter_content(block_size):
145145
progress_bar.update(len(data))
@@ -191,7 +191,7 @@ def clean_models(model_key=None):
191191

192192
def _load_model(ckpt_path, device, model_type="text"):
193193
if "cuda" not in device:
194-
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
194+
logger.warning("No GPU being used. Careful, inference might be extremely slow!")
195195
if model_type == "text":
196196
ConfigClass = GPTConfig
197197
ModelClass = GPT
@@ -207,10 +207,10 @@ def _load_model(ckpt_path, device, model_type="text"):
207207
os.path.exists(ckpt_path) and
208208
_md5(ckpt_path) != REMOTE_MODEL_PATHS[model_type]["checksum"]
209209
):
210-
logger.warning(f"found outdated {model_type} model, removing...")
210+
logger.warning(f"found outdated {model_type} model, removing.")
211211
os.remove(ckpt_path)
212212
if not os.path.exists(ckpt_path):
213-
logger.info(f"{model_type} model not found, downloading...")
213+
logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
214214
_download(REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path)
215215
checkpoint = torch.load(ckpt_path, map_location=device)
216216
# this is a hack

model-card.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ The following is additional information about the models released here.
88

99
Bark is a series of three transformer models that turn text into audio.
1010
### Text to semantic tokens
11-
- Input: text, tokenized with [BERT tokenizer from huggingface](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
11+
- Input: text, tokenized with [BERT tokenizer from Hugging Face](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
1212
- Output: semantic tokens that encode the audio to be generated
1313

1414
### Semantic to coarse tokens

0 commit comments

Comments (0)