Skip to content

Commit

Permalink
Enable notebooks to be generated in any language (huggingface#311)
Browse files Browse the repository at this point in the history
* Enable notebooks to be generated in any language
  • Loading branch information
lewtun authored Sep 13, 2022
1 parent 8f1ba9a commit 2c0b3ba
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 58 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,10 @@ python -m pip install -r requirements.txt
Then run the following script:

```bash
python utils/generate_notebooks.py --output_dir nbs
python utils/generate_notebooks.py --language LANG-ID --output_dir nbs
```

This script extracts all the code snippets from the English chapters and stores them as notebooks in the `nbs` folder (which is ignored by Git by default).
This script extracts all the code snippets from the chapters associated with `chapters/LANG-ID` and stores them as notebooks in the `nbs` folder (which is ignored by Git by default).

## ✍️ Contributing a new chapter

Expand Down
21 changes: 10 additions & 11 deletions chapters/fr/chapter9/4.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ Pour ajouter du contenu supplémentaire à votre démo, la classe `Interface` su
- `live` : si vous voulez que votre modèle soit relancé à chaque fois que l'entrée change, vous pouvez mettre `live=True`. Ceci est utile pour les modèles rapides (nous verrons un exemple à la fin de cette section).
En utilisant les options ci-dessus, nous obtenons une interface plus complète. Exécutez le code ci-dessous pour pouvoir discuter avec Rick et Morty :

```python out
title = "Ask Rick a Question" # "Posez une question à Rick"
description =
"""
```py
title = "Ask Rick a Question" # "Posez une question à Rick"
description = """
The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
# Le robot a été entraîné à répondre à des questions basées sur les dialogues de Rick et Morty.
# Demandez à Rick ce que vous voulez !
Expand All @@ -41,15 +40,15 @@ article = "Check out [the original Rick and Morty Bot](https://huggingface.co/sp
# Jetez un coup d'œil au [bot original Rick et Morty](https://huggingface.co/spaces/kingabzpro/Rick_and_Morty_Bot) sur lequel cette démo est basée.

gr.Interface(
fn=predict,
inputs="textbox",
fn=predict,
inputs="textbox",
outputs="text",
title=title,
description=description,
title=title,
description=description,
article=article,
examples=[["What are you doing?"], ["Where should we time travel to?"]]
# ["Que faites-vous ?"], ["Où devrions-nous voyager dans le temps ?"]
).launch()
examples=[["What are you doing?"], ["Where should we time travel to?"]]
# ["Que faites-vous ?"], ["Où devrions-nous voyager dans le temps ?"]
).launch()
```

En utilisant les options ci-dessus, nous obtenons une interface plus complète. Essayez l'interface ci-dessous :
Expand Down
97 changes: 52 additions & 45 deletions utils/generate_notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

from pathlib import Path

PATH_TO_COURSE = "chapters/en/"

re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
re_framework_else = re.compile(r"^{:else}\s*$")
re_framework_end = re.compile(r"^{/if}\s*$")
Expand Down Expand Up @@ -128,57 +126,63 @@ def build_notebook(fname, title, output_dir="."):
"""
sections = read_and_split_frameworks(fname)
sections_with_accelerate = [
"A full training",
"Token classification (PyTorch)",
"Fine-tuning a masked language model (PyTorch)",
"Translation (PyTorch)",
"Summarization (PyTorch)",
"Training a causal language model from scratch (PyTorch)",
"Question answering (PyTorch)",
"chapter3/4", # "A full training",
"chapter7/2_pt", # "Token classification (PyTorch)",
"chapter7/3_pt", # "Fine-tuning a masked language model (PyTorch)"
"chapter7/4_pt", # "Translation (PyTorch)"
"chapter7/5_pt", # "Summarization (PyTorch)",
"chapter7/6_pt", # "Training a causal language model from scratch (PyTorch)"
"chapter7/7_pt", # "Question answering (PyTorch)"
]
sections_with_hf_hub = [
"Sharing pretrained models (PyTorch)",
"Sharing pretrained models (TensorFlow)",
"Creating your own dataset",
"Token classification (PyTorch)",
"Token classification (TensorFlow)",
"Training a new tokenizer from an old one",
"Fine-tuning a masked language model (PyTorch)",
"Fine-tuning a masked language model (TensorFlow)",
"Translation (PyTorch)",
"Translation (TensorFlow)",
"Summarization (PyTorch)",
"Summarization (TensorFlow)",
"Training a causal language model from scratch (PyTorch)",
"Training a causal language model from scratch (TensorFlow)",
"Question answering (PyTorch)",
"Question answering (TensorFlow)",
"What to do when you get an error",
"chapter4/3_pt", # "Sharing pretrained models (PyTorch)"
"chapter4/3_tf", # "Sharing pretrained models (TensorFlow)"
"chapter5/5", # "Creating your own dataset"
"chapter7/2_pt", # "Token classification (PyTorch)"
"chapter7/2_tf", # "Token classification (TensorFlow)"
"chapter6/2", # "Training a new tokenizer from an old one"
"chapter7/3_pt", # "Fine-tuning a masked language model (PyTorch)"
"chapter7/3_tf", # "Fine-tuning a masked language model (TensorFlow)"
"chapter7/4_pt", # "Translation (PyTorch)"
"chapter7/4_tf", # "Translation (TensorFlow)"
"chapter7/5_pt", # "Summarization (PyTorch)"
"chapter7/5_tf", # "Summarization (TensorFlow)"
"chapter7/6_pt", # "Training a causal language model from scratch (PyTorch)"
"chapter7/6_tf", # "Training a causal language model from scratch (TensorFlow)"
"chapter7/7_pt", # "Question answering (PyTorch)"
"chapter7/7_tf", # "Question answering (TensorFlow)"
"chapter8/2", # "What to do when you get an error"
]
sections_with_faiss = [
"chapter5/6_pt", # "Semantic search with FAISS (PyTorch)"
"chapter5/6_tf", # "Semantic search with FAISS (TensorFlow)"
]
sections_with_faiss = ["Semantic search with FAISS (PyTorch)", "Semantic search with FAISS (TensorFlow)"]
sections_with_gradio = [
"Building your first demo",
"Understanding the Interface class",
"Sharing demos with others",
"Integrations with the Hugging Face Hub",
"Advanced Interface features",
"Introduction to Blocks",
"chapter9/2", # "Building your first demo"
"chapter9/3", # "Understanding the Interface class"
"chapter9/4", # "Sharing demos with others"
"chapter9/5", # "Integrations with the Hugging Face Hub"
"chapter9/6", # "Advanced Interface features"
"chapter9/7", # "Introduction to Blocks"
]
stem = Path(fname).stem
if not isinstance(sections, dict):
contents = [sections]
titles = [title]
fnames = [f"section{stem}.ipynb"]
section_names = [f"{Path(fname).parent.stem}/{stem}"]
else:
contents = []
titles = []
fnames = []
section_names = []
for key, section in sections.items():
contents.append(section)
titles.append(f"{title} ({frameworks[key]})")
fnames.append(f"section{stem}_{key}.ipynb")
section_names.append(f"{Path(fname).parent.stem}/{stem}_{key}")

for title, content, fname in zip(titles, contents, fnames):
for title, content, fname, section_name in zip(titles, contents, fnames, section_names):
cells = extract_cells(content)
if len(cells) == 0:
continue
Expand All @@ -190,22 +194,22 @@ def build_notebook(fname, title, output_dir="."):

# Install cell
installs = ["!pip install datasets evaluate transformers[sentencepiece]"]
if title in sections_with_accelerate:
if section_name in sections_with_accelerate:
installs.append("!pip install accelerate")
installs.append("# To run the training on TPU, you will need to uncomment the following line:")
installs.append(
"# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"
)
if title in sections_with_hf_hub:
if section_name in sections_with_hf_hub:
installs.append("!apt install git-lfs")
if title in sections_with_faiss:
if section_name in sections_with_faiss:
installs.append("!pip install faiss-gpu")
if title in sections_with_gradio:
if section_name in sections_with_gradio:
installs.append("!pip install gradio")

nb_cells.append(nb_cell("\n".join(installs)))

if title in sections_with_hf_hub:
if section_name in sections_with_hf_hub:
nb_cells.extend(
[
nb_cell(
Expand All @@ -229,11 +233,11 @@ def build_notebook(fname, title, output_dir="."):
nbformat.write(notebook, os.path.join(output_dir, fname), version=4)


def get_titles():
def get_titles(language):
"""
Parse the _toctree.yml file to get the correspondence filename to title
"""
table = yaml.safe_load(open(os.path.join(PATH_TO_COURSE, "_toctree.yml"), "r"))
table = yaml.safe_load(open(os.path.join(f"chapters/{language}", "_toctree.yml"), "r"))
result = {}
for entry in table:
for section in entry["sections"]:
Expand All @@ -248,22 +252,25 @@ def get_titles():
return {k: v for k, v in result.items() if "quiz" not in v}


def create_notebooks(output_dir):
def create_notebooks(language, output_dir):
    """
    Generate one notebook per course section for the given language.

    Any existing ``chapter*`` folders inside ``output_dir`` are removed first so
    notebooks from a previous run (possibly for another language) do not linger.

    Args:
        language (str): Language id of the course chapters (e.g. ``en``, ``fr``);
            sections are read from ``chapters/<language>``.
        output_dir (str): Directory where the generated notebooks are written.
    """
    # exist_ok avoids the explicit existence check before creating the directory.
    os.makedirs(output_dir, exist_ok=True)
    # Clean out notebooks generated by a previous run.
    for folder in os.listdir(output_dir):
        if folder.startswith("chapter"):
            shutil.rmtree(os.path.join(output_dir, folder))
    titles = get_titles(language)
    for fname, title in titles.items():
        build_notebook(
            os.path.join(f"chapters/{language}", f"{fname}.mdx"),
            title,
            # Keep the chapter sub-folder structure (e.g. nbs/chapter3/).
            os.path.join(output_dir, Path(fname).parent),
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE: the previous help text ("Path to the course MDX files") described the
    # old path-based interface; this flag now takes a language id, not a path.
    parser.add_argument(
        "--language",
        type=str,
        default="en",
        help="Language id of the course chapters to convert (e.g. 'en', 'fr')",
    )
    parser.add_argument("--output_dir", type=str, help="Where to output the notebooks")
    args = parser.parse_args()

    create_notebooks(args.language, args.output_dir)

0 comments on commit 2c0b3ba

Please sign in to comment.