forked from huggingface/course
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_notebooks.py
289 lines (255 loc) · 11 KB
/
generate_notebooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import argparse
import os
import re
import nbformat
import shutil
import yaml
from pathlib import Path
# Markers that delimit framework-specific content in the MDX sources.
# Opening marker such as `{#if fw === 'pt'}`; group 1 captures the framework key.
re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
# `{:else}` marker switching to the other framework's branch.
re_framework_else = re.compile(r"^{:else}\s*$")
# `{/if}` marker closing a framework-specific block.
re_framework_end = re.compile(r"^{/if}\s*$")
# A line made of a single self-closing tag (`<... />`).
# NOTE(review): not referenced by any function visible in this file.
re_html_line = re.compile(r"^<[^>]*/>\s*$")
# A `<...>` tag at the end of a line whose content has no `/` or `>`; group 1 is the tag content.
# NOTE(review): not referenced by any function visible in this file.
re_html_tag = re.compile(r"<([^/>]*)>\s*$")
# Opening fence of a Python code block: ```py / ```python, optionally followed by " no-format".
re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$")
# Opening fence of an output block: ```py out / ```python out.
re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$")
# Bare ``` fence, used to detect the end of a code or output block.
re_end_code = re.compile(r"^```\s*$")
# Framework key -> display name appended to the titles of framework-specific notebooks.
frameworks = {"pt": "PyTorch", "tf": "TensorFlow"}
# Directory whose sub-directories are treated as the course languages by the script entry point.
PATH_TO_COURSE = Path("chapters/")
# Languages to exclude from the notebook generation because the notebooks were
# created manually
LANGS_TO_EXCLUDE = ["fr"]
def read_and_split_frameworks(fname):
    """
    Read the MDX in fname and creates two versions (if necessary) for each framework.

    Lines outside any framework block are shared by both versions. Lines between
    a `{#if fw === '<key>'}` marker and the following `{:else}` / `{/if}` marker
    go only to that framework; after `{:else}` the other framework is selected.
    The marker lines themselves are dropped.

    Args:
        fname: Path of the MDX file to read.

    Returns:
        A dict `{"pt": str, "tf": str}` when the file contains at least one
        framework test, otherwise the whole file content as a single string.

    Note:
        A `{:else}` or `{/if}` line that appears before any framework test
        fails, because no framework has been selected at that point.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(fname, "r", encoding="utf-8") as f:
        content = f.readlines()

    contents = {"pt": [], "tf": []}
    differences = False
    current_content = []
    for line in content:
        framework_match = re_framework_test.search(line)
        if framework_match is not None:
            differences = True
            framework = framework_match.group(1)
            # Everything accumulated before the test is common to both frameworks.
            for key in contents:
                contents[key].extend(current_content)
            current_content = []
        elif re_framework_else.search(line) is not None:
            contents[framework].extend(current_content)
            current_content = []
            # Switch to the other framework for the `else` branch.
            framework = "pt" if framework == "tf" else "tf"
        elif re_framework_end.search(line) is not None:
            contents[framework].extend(current_content)
            current_content = []
        else:
            current_content.append(line)

    # Trailing lines after the last marker are shared by both frameworks.
    if current_content:
        for key in contents:
            contents[key].extend(current_content)

    if differences:
        return {key: "".join(lines) for key, lines in contents.items()}
    return "".join(content)
def extract_cells(content):
    """
    Extract the code/output cells from content.

    Args:
        content: Full text of a section, with lines separated by "\\n".

    Returns:
        A list in document order. Each item is either the source of a Python
        code block (str) or a tuple `(source, output)` when the code block is
        followed by an output block.

    An output block is attached to the most recent code cell only if that cell
    has no output yet. An output block with no preceding code cell, or one
    following a cell that already has an output, is ignored.
    """
    cells = []
    current_cell = None  # Lines of the block being read, None outside of a block.
    is_output = False
    for line in content.split("\n"):
        if re_python_code.search(line) is not None:
            is_output = False
            current_cell = []
        elif re_output_code.search(line) is not None:
            is_output = True
            current_cell = []
        elif re_end_code.search(line) is not None and current_cell is not None:
            cell = "\n".join(current_cell)
            if not is_output:
                cells.append(cell)
            elif cells and not isinstance(cells[-1], tuple):
                # Guard on `cells` so an output block appearing before any code
                # block is skipped instead of raising IndexError.
                cells[-1] = (cells[-1], cell)
            current_cell = None
        elif current_cell is not None:
            current_cell.append(line)
    return cells
def convert_to_nb_cell(cell):
    """
    Turn an extracted cell into a notebook code cell.

    `cell` is either the code source alone or a tuple `(code, output)`. In the
    tuple case the output text is stored as a single `execute_result` output
    with a `text/plain` payload; otherwise the cell has no outputs.
    """
    node = nbformat.notebooknode.NotebookNode
    if isinstance(cell, tuple):
        source = cell[0]
        outputs = [
            node(
                {
                    "data": {"text/plain": cell[1]},
                    "execution_count": None,
                    "metadata": {},
                    "output_type": "execute_result",
                }
            )
        ]
    else:
        source = cell
        outputs = []
    return node(
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "source": source,
            "outputs": outputs,
        }
    )
def nb_cell(source, code=True):
    """
    Wrap `source` in a notebook cell node.

    Returns a code cell (no outputs, no execution count) when `code` is truthy,
    and a markdown cell otherwise.
    """
    if code:
        fields = {
            "cell_type": "code",
            "metadata": {},
            "source": source,
            "execution_count": None,
            "outputs": [],
        }
    else:
        fields = {"cell_type": "markdown", "source": source, "metadata": {}}
    return nbformat.notebooknode.NotebookNode(fields)
def build_notebook(fname, title, output_dir="."):
    """
    Build the notebook for fname with a given title in output_dir.

    The MDX file is split per framework when it contains framework-specific
    blocks, producing one notebook per framework (`section<stem>_<key>.ipynb`,
    titled "<title> (<Framework>)"); otherwise a single `section<stem>.ipynb`
    is produced. A version with no code cells is skipped.

    Each notebook starts with a title cell, an install cell whose content
    depends on the section, optional git / Hugging Face Hub login cells, and
    then the code cells extracted from the MDX.

    Args:
        fname: Path of the MDX source file.
        title: Title of the section, used for the first cell and the Colab name.
        output_dir: Directory where the notebook(s) are written; created if needed.
    """
    sections = read_and_split_frameworks(fname)
    sections_with_accelerate = [
        "chapter3/4",  # "A full training",
        "chapter7/2_pt",  # "Token classification (PyTorch)",
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/5_pt",  # "Summarization (PyTorch)",
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
    ]
    sections_with_hf_hub = [
        "chapter4/3_pt",  # "Sharing pretrained models (PyTorch)"
        "chapter4/3_tf",  # "Sharing pretrained models (TensorFlow)"
        "chapter5/5",  # "Creating your own dataset"
        "chapter7/2_pt",  # "Token classification (PyTorch)"
        "chapter7/2_tf",  # "Token classification (TensorFlow)"
        "chapter6/2",  # "Training a new tokenizer from an old one"
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/3_tf",  # "Fine-tuning a masked language model (TensorFlow)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/4_tf",  # "Translation (TensorFlow)"
        "chapter7/5_pt",  # "Summarization (PyTorch)"
        "chapter7/5_tf",  # "Summarization (TensorFlow)"
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/6_tf",  # "Training a causal language model from scratch (TensorFlow)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
        "chapter7/7_tf",  # "Question answering (TensorFlow)"
        "chapter8/2",  # "What to do when you get an error"
    ]
    sections_with_faiss = [
        "chapter5/6_pt",  # "Semantic search with FAISS (PyTorch)"
        "chapter5/6_tf",  # "Semantic search with FAISS (TensorFlow)"
    ]
    sections_with_gradio = [
        "chapter9/2",  # "Building your first demo"
        "chapter9/3",  # "Understanding the Interface class"
        "chapter9/4",  # "Sharing demos with others"
        "chapter9/5",  # "Integrations with the Hugging Face Hub"
        "chapter9/6",  # "Advanced Interface features"
        "chapter9/7",  # "Introduction to Blocks"
    ]

    stem = Path(fname).stem
    chapter = Path(fname).parent.stem

    # One (title, content, file name, section key) tuple per notebook to write.
    # The section key ("<chapter>/<stem>[_<fw>]") is what the lists above match on.
    if not isinstance(sections, dict):
        notebooks = [(title, sections, f"section{stem}.ipynb", f"{chapter}/{stem}")]
    else:
        notebooks = [
            (
                f"{title} ({frameworks[key]})",
                section,
                f"section{stem}_{key}.ipynb",
                f"{chapter}/{stem}_{key}",
            )
            for key, section in sections.items()
        ]

    # Distinct loop names so the `fname` / `title` parameters are not shadowed.
    for nb_title, content, nb_fname, section_name in notebooks:
        cells = extract_cells(content)
        if not cells:
            continue

        nb_cells = [
            nb_cell(f"# {nb_title}", code=False),
            nb_cell("Install the Transformers, Datasets, and Evaluate libraries to run this notebook.", code=False),
        ]

        # Install cell
        installs = ["!pip install datasets evaluate transformers[sentencepiece]"]
        if section_name in sections_with_accelerate:
            installs.append("!pip install accelerate")
            installs.append("# To run the training on TPU, you will need to uncomment the following line:")
            installs.append(
                "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"
            )
        if section_name in sections_with_hf_hub:
            installs.append("!apt install git-lfs")
        if section_name in sections_with_faiss:
            installs.append("!pip install faiss-gpu")
        if section_name in sections_with_gradio:
            installs.append("!pip install gradio")

        nb_cells.append(nb_cell("\n".join(installs)))

        if section_name in sections_with_hf_hub:
            nb_cells.extend(
                [
                    nb_cell(
                        "You will need to setup git, adapt your email and name in the following cell.", code=False
                    ),
                    # Placeholder address on the reserved example.com domain; the
                    # reader is expected to replace it with their own.
                    nb_cell(
                        '!git config --global user.email "you@example.com"\n!git config --global user.name "Your Name"'
                    ),
                    nb_cell(
                        "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.",
                        code=False,
                    ),
                    nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"),
                ]
            )

        nb_cells += [convert_to_nb_cell(cell) for cell in cells]
        metadata = {"colab": {"name": nb_title, "provenance": []}}
        nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4}
        notebook = nbformat.notebooknode.NotebookNode(nb_dict)
        os.makedirs(output_dir, exist_ok=True)
        nbformat.write(notebook, os.path.join(output_dir, nb_fname), version=4)
def get_titles(language):
    """
    Parse the _toctree.yml file to get the correspondence filename to title

    Args:
        language: Name of the language sub-directory under `chapters/`.

    Returns:
        A dict mapping each section's local file name (without extension) to its
        title. Sections declaring `local_fw` contribute one entry for their
        "pt" file and one for their "tf" file, both with the same title.
        Entries whose title contains "quiz" are left out.
    """
    toctree_path = os.path.join(f"chapters/{language}", "_toctree.yml")
    # Context manager so the handle is closed; explicit UTF-8 so decoding does
    # not depend on the platform's default encoding.
    with open(toctree_path, "r", encoding="utf-8") as f:
        table = yaml.safe_load(f)

    result = {}
    for entry in table:
        for section in entry["sections"]:
            section_title = section["title"]
            if "local_fw" in section:
                section_names = section["local_fw"]
                result[section_names["pt"]] = section_title
                result[section_names["tf"]] = section_title
            else:
                result[section["local"]] = section_title

    return {k: v for k, v in result.items() if "quiz" not in v}
def create_notebooks(language, output_dir):
    """
    Regenerate all notebooks of one language inside output_dir.

    The output directory is created if missing, any existing entry in it whose
    name starts with "chapter" is removed, and then one notebook build is run
    per section listed by `get_titles(language)`.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Clear the previously generated chapter folders before rebuilding.
    stale_entries = [name for name in os.listdir(output_dir) if name.startswith("chapter")]
    for name in stale_entries:
        shutil.rmtree(os.path.join(output_dir, name))

    language_root = f"chapters/{language}"
    for section_path, section_title in get_titles(language).items():
        source_file = os.path.join(language_root, f"{section_path}.mdx")
        # Mirror the chapter sub-folder of the section under output_dir.
        target_dir = os.path.join(output_dir, Path(section_path).parent)
        build_notebook(source_file, section_title, target_dir)
if __name__ == "__main__":
    # Script entry point: generate the notebooks of every language found under
    # PATH_TO_COURSE, except those listed in LANGS_TO_EXCLUDE.
    parser = argparse.ArgumentParser()
    # Required: without it the output path below would be built from the
    # string "None".
    parser.add_argument("--output_dir", type=str, required=True, help="Where to output the notebooks")
    args = parser.parse_args()

    # Use the full directory name so a language folder containing a dot is not
    # truncated at the last dot.
    languages = [f.name for f in PATH_TO_COURSE.iterdir() if f.is_dir()]

    for language in languages:
        if language in LANGS_TO_EXCLUDE:
            continue
        language_output_dir = f"{args.output_dir}/{language}"
        create_notebooks(language, language_output_dir)
        # Remove empty notebook folders
        if not any(Path(language_output_dir).iterdir()):
            shutil.rmtree(language_output_dir)