tooltips (h2oai#50)

* add tooltips * rm BN sync * rm unused * readme for CLI users
SaraTGRN · Apr 28, 2023 · cf0bdd3 · cf0bdd3
1 parent 3d96578
commit cf0bdd3
Show file tree

Hide file tree

Showing 80 changed files with 348 additions and 190 deletions.
diff --git a/README.md b/README.md
@@ -93,7 +93,6 @@ The interactive chat will also work with model that were finetuned using the UI.
 H2O LLM studio expects a csv file with at least two columns, one being the instruct column, the other 
 being the answer that the model should generate. You can also provide an extra validation dataframe using the same format or use an automatic train/validation split to evaluate the model performance. 
 
-
 During an experiment you can adapt the data representation with the following settings 
 
 - **Prompt Column:** The column in the dataset containing the user prompt.
@@ -110,12 +109,13 @@ With H2O LLM Studio, training your large language model is easy and intuitive.
 First, upload your dataset and then start training your model.
 
 ### Starting an experiment
-H2O LLM Studio provides various parameters to set for a given experiment, with some of the most important being:
+H2O LLM Studio lets you tune a wide range of parameters and iterate quickly, making it easy to explore different hyperparameters.
+The default settings are chosen with care and should give a good baseline. The most important parameters are:
 
 - **LLM Backbone**: This parameter determines the LLM architecture to use.
 - **Mask Prompt Labels**: This option controls whether to mask the prompt labels during training and only train on the loss of the answer.
 - **Hyperparameters** such as learning rate, batch size, and number of epochs determine the training process.
-An overview of all parameters is given in the [parameter description](docs/parameters.md).
+Please consult the tooltips of each hyperparameter to learn more about them. The tooltips are shown next to each hyperparameter in the GUI and can be found as plain text `.mdx` files in the [tooltips/](tooltips/) folder.
 - **Evaluate Before Training** This option lets you evaluate the model before training, which can help you judge the quality of the LLM backbone before fine-tuning.
 
 We provide several metric options for evaluating the performance of your model.

diff --git a/app_utils/sections/experiment.py b/app_utils/sections/experiment.py
@@ -13,6 +13,7 @@
 from h2o_wave import Q, data, ui
 from jinja2 import Environment, FileSystemLoader
 from sqlitedict import SqliteDict
+from llm_studio.src.tooltips import tooltips
 
 from app_utils.config import default_cfg
 from app_utils.sections.common import clean_dashboard
@@ -111,6 +112,7 @@ async def experiment_start(q: Q) -> None:
                 for _, row in df_datasets.iterrows()
             ],
             trigger=True,
+            tooltip=tooltips["experiments_dataset"],
         ),
     ]
 
@@ -198,6 +200,7 @@ async def experiment_start(q: Q) -> None:
                 choices=choices_problem_types,
                 value=q.client["experiment/start/cfg_file"],
                 trigger=True,
+                tooltip=tooltips["experiments_problem_type"],
             )
         ]
 
@@ -286,6 +289,7 @@ async def experiment_start(q: Q) -> None:
                 label="Import config from YAML",
                 value=False,
                 trigger=True,
+                tooltip=tooltips["experiments_import_config_from_yaml"],
             )
         ]
 

diff --git a/docs/parameters.md b/docs/parameters.md
diff --git a/llm_studio/python_configs/base.py b/llm_studio/python_configs/base.py
@@ -5,6 +5,7 @@
 from llm_studio.src import possible_values
 from llm_studio.src.nesting import Dependency, Nesting
 from llm_studio.src.order import Order
+from llm_studio.src.tooltips import tooltips
 
 logger = logging.getLogger(__name__)
 
@@ -99,7 +100,7 @@ def _get_tooltips(self, field: str, predict: bool = False) -> Optional[str]:
         """
         Returns a tooltip for the field provided
         """
-        return None
+        return tooltips.get(f"experiments_{field}", None)
 
     def _get_visibility(self, field: str) -> Optional[int]:
         """Returns a visibility level for the field provided.

diff --git a/llm_studio/python_configs/text_causal_language_modeling_config.py b/llm_studio/python_configs/text_causal_language_modeling_config.py
@@ -295,7 +295,6 @@ class ConfigNLPCausalLMEnvironment(DefaultConfig):
     use_fsdp: bool = False
 
     find_unused_parameters: bool = False
-    sync_batch_normalization: bool = False
     trust_remote_code: bool = False
     number_of_workers: int = 4
     seed: int = -1
@@ -323,9 +322,6 @@ def __post_init__(self):
         self._possible_values["number_of_workers"] = (1, multiprocessing.cpu_count(), 1)
         self._possible_values["seed"] = possible_values.Number(step=1, min=-1)
 
-        if torch.cuda.device_count() <= 1:
-            self._visibility["sync_batch_normalization"] = -1
-
 
 @dataclass
 class ConfigNLPCausalLMLogging(DefaultConfig):

diff --git a/llm_studio/src/tooltips.py b/llm_studio/src/tooltips.py
@@ -0,0 +1,139 @@
+import glob
+import re
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import Optional
+
+# Matches anything that looks like an HTML/JSX tag: "<", then one or more
+# characters other than "<" (non-greedy), then ">". Used by cleanhtml().
+CLEANR = re.compile("<[^<]+?>")
+# Every .mdx file below the "tooltips" directory, at any depth. The pattern is
+# relative, so it is resolved against the current working directory at import
+# time.
+tooltip_files = glob.glob("tooltips/**/*.mdx", recursive=True)
+
+
+def read_tooltip_file(path: str) -> str:
+    """
+    Reads the complete contents of a tooltip text file.
+
+    Args:
+        path: path to the file
+
+    Returns:
+        str: the text of the file
+
+    Raises:
+        OSError: if the file cannot be opened or read
+        UnicodeDecodeError: if the file is not valid UTF-8
+    """
+
+    # Decode explicitly as UTF-8 so the result does not depend on the
+    # platform's default locale encoding.
+    with open(path, encoding="utf-8") as f:
+        return f.read()
+
+
+def cleanhtml(raw_html: str) -> str:
+    """
+    Strips every substring matched by the module-level ``CLEANR`` pattern.
+
+    Args:
+        raw_html: text that may contain html tags
+
+    Returns:
+        str: the text with all matches deleted
+    """
+
+    return CLEANR.sub("", raw_html)
+
+
+def clean_docusaurus_tags(text: str) -> str:
+    """
+    Deletes docusaurus admonition markers from a string.
+
+    Args:
+        text: the string to clean
+
+    Returns:
+        str: the string without the markers
+    """
+
+    # Order matters: the bare ":::" must come last, otherwise it would eat the
+    # prefix of the longer markers and leave "info note" etc. behind.
+    markers = (":::info note", ":::info Note", ":::tip tip", ":::")
+    for marker in markers:
+        text = text.replace(marker, "")
+    return text
+
+
+def clean_md_links(text: str) -> str:
+    """
+    Replaces every markdown link ``[label](target)`` by its label.
+
+    Args:
+        text: the string to clean
+
+    Returns:
+        str: the string with each link reduced to its label
+    """
+
+    link_pattern = r"\[(.*?)\]\(.*?\)"
+    # Keep only the first capture group, i.e. the text between the brackets.
+    return re.sub(link_pattern, lambda match: match.group(1), text)
+
+
+@dataclass
+class Tooltip:
+    """
+    A named piece of tooltip text.
+
+    Attributes:
+        name: identifier of the tooltip
+        text: the text of the tooltip
+    """
+
+    name: str
+    text: str
+
+    def __repr__(self):
+        return "{}: {}".format(self.name, self.text)
+
+
+class Tooltips:
+    """
+    A collection of tooltips.
+
+    During initialization, all tooltips are read from the tooltip files.
+    Each tooltip is stored under the key "<section>_<name>", where <section>
+    is the second component of the file path (the directory directly below
+    "tooltips/") and <name> is the file name up to its first ".", with "-"
+    replaced by "_" and one leading underscore removed.
+
+    Usage:
+        tooltips = Tooltips()
+        a tooltip can be accessed by its key:
+        tooltips["experiments_dataset"] returns the tooltip text, or None
+        if no such tooltip exists
+
+    Raises:
+        ValueError: if two tooltip files map to the same key
+    """
+
+    def __init__(self):
+        self.tooltips = {}
+        for filename in tooltip_files:
+            key = self._key_for(filename)
+            # Compare the full key (not the bare file name): that is what the
+            # dict is indexed by, so that is what actually collides.
+            if key in self.tooltips:
+                raise ValueError(
+                    f"Duplicate tooltip '{key}' (from file '{filename}')"
+                )
+            text = read_tooltip_file(filename)
+            text = cleanhtml(text)
+            text = clean_docusaurus_tags(text)
+            text = clean_md_links(text)
+            self.add_tooltip(Tooltip(key, text))
+
+    @staticmethod
+    def _key_for(filename: str) -> str:
+        """Derives the "<section>_<name>" lookup key from a tooltip file path."""
+        # PurePath splits on the platform's separator, which is what glob
+        # returns; a hard-coded "/" would not split Windows paths.
+        path = PurePath(filename)
+        name = path.name.split(".")[0].replace("-", "_")
+        if name.startswith("_"):
+            name = name[1:]  # drop the leading underscore of "_name.mdx"
+        section = path.parts[1]
+        return f"{section}_{name}"
+
+    def add_tooltip(self, tooltip: "Tooltip") -> None:
+        """Stores the tooltip under its name, replacing any existing entry."""
+        self.tooltips[tooltip.name] = tooltip
+
+    def __getitem__(self, name: str) -> Optional[str]:
+        # Unlike a dict, a missing key yields None instead of raising KeyError.
+        return self.get(name)
+
+    def __len__(self):
+        return len(self.tooltips)
+
+    def __repr__(self):
+        return f"{self.tooltips}"
+
+    def get(self, name: str, default=None):
+        """Returns the text of the tooltip called name, or default if absent."""
+        tooltip = self.tooltips.get(name)
+        return default if tooltip is None else tooltip.text
+
+
+# Module-level instance built at import time; other modules import this object
+# (`from llm_studio.src.tooltips import tooltips`) rather than the class.
+tooltips = Tooltips()
diff --git a/llm_studio/src/utils/gpu_utils.py b/llm_studio/src/utils/gpu_utils.py
@@ -76,15 +76,3 @@ def is_oom_error(exception: BaseException) -> bool:
         or is_cudnn_snafu(exception)
         or is_out_of_cpu_memory(exception)
     )
-
-
-def garbage_collection_cuda() -> None:
-    """Garbage collection Torch (CUDA) memory."""
-    gc.collect()
-    try:
-        # This is the last thing that should cause an OOM error, but seemingly it can.
-        torch.cuda.empty_cache()
-    except RuntimeError as exception:
-        if not is_oom_error(exception):
-            # Only handle OOM errors
-            raise