
uv2 #492

Merged 6 commits on Jan 10, 2025

73 changes: 5 additions & 68 deletions Dockerfile
@@ -1,42 +1,4 @@
ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
@@ -50,50 +12,25 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
gpustat \
jupyter \
beaker-gantry \
oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]

WORKDIR /stage/

# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
COPY requirements.txt .
RUN pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
RUN pip install packaging
RUN pip install flash-attn==2.6.3 --no-build-isolation
RUN pip install flash-attn==2.7.2.post1 --no-build-isolation
COPY requirements.txt .
RUN pip install -r requirements.txt

# NLTK download
RUN python -m nltk.downloader punkt

COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

80 changes: 80 additions & 0 deletions Dockerfile.base
@@ -0,0 +1,80 @@
# Run the following command to build `ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04`
# docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
man-db \
manpages \
manpages-dev \
manpages-posix \
manpages-posix-dev \
sudo \
unzip \
vim \
wget \
fish \
parallel \
iputils-ping \
htop \
emacs \
zsh \
rsync \
tmux

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 24.01-0.3.3.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# Install Docker CLI. Version matches Beaker on-premise servers.
RUN curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-20.10.21.tgz -o docker.tgz \
&& sudo tar xzvf docker.tgz --strip 1 -C /usr/local/bin docker/docker \
&& rm docker.tgz

# Install Beaker
ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]
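
For reference, a minimal sketch of building and smoke-testing this base image locally, following the comment at the top of the file (the `BEAKER_VERSION` value is a placeholder; substitute a real Beaker release):

```bash
# Build the base image; CUDA/TARGET/DIST mirror the comment in Dockerfile.base.
docker build \
  --build-arg CUDA=12.1.0 \
  --build-arg TARGET=cudnn8-devel \
  --build-arg DIST=ubuntu20.04 \
  --build-arg BEAKER_VERSION=<version> \
  -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

# The entrypoint is a login bash shell, so -c runs a command inside it.
docker run --rm cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04 -c "nvcc --version && beaker --version"
```
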
37 changes: 37 additions & 0 deletions Dockerfile.uv
@@ -0,0 +1,37 @@
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV UV_COMPILE_BYTECODE=1

# setup files
WORKDIR /stage/

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project --extra compile

COPY open_instruct open_instruct
COPY eval eval
COPY configs configs
COPY scripts scripts
# hack: only copy oe-eval-internal if it exists
COPY mason.py oe-eval-internal* /stage/
COPY .git/ ./.git/
COPY pyproject.toml uv.lock .
RUN chmod +x scripts/*

# install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --extra compile

# for interactive session
RUN chmod -R 777 /stage/

# uv dockerfile magic: place executables in the environment at the front of the path
ENV PATH=/stage/.venv/bin:$PATH
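
A note on two patterns above: the two dependency-only `uv sync --frozen --no-install-project` runs happen before any project source is copied, so Docker caches those layers across code-only changes, and the glob in `COPY mason.py oe-eval-internal* /stage/` succeeds even when the optional `oe-eval-internal` directory is absent, since a `COPY` fails only when none of its sources match. A minimal sketch of building and sanity-checking this image (the tag name is arbitrary):

```bash
docker build -f Dockerfile.uv -t open_instruct_dev_uv .
# /stage/.venv/bin is first on PATH, so python resolves to the uv-managed venv.
docker run --rm open_instruct_dev_uv -c "python -c 'import open_instruct'"
```
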
35 changes: 29 additions & 6 deletions README.md
@@ -51,22 +51,45 @@ Our setup mostly follows our [Dockerfile](./Dockerfile), which uses Python 3.10.
```bash
pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
pip install packaging
pip install flash-attn==2.6.3 --no-build-isolation
pip install flash-attn==2.7.2.post1 --no-build-isolation
pip install -r requirements.txt
python -m nltk.downloader punkt
pip install -e .
python -m nltk.downloader punkt
```

* **Local installation with uv (preview)**: We are experimenting with [uv](https://docs.astral.sh/uv/). You can install the dependencies via
```bash
uv sync
uv sync --extra compile # to install flash attention
```
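
Once synced, commands run inside the uv-managed virtual environment without manual activation, for example (assuming the training script accepts `--help` via its argument parser):

```bash
uv run python open_instruct/finetune.py --help
```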


* **Docker installation**: You can also build a Docker image from the [Dockerfile](./Dockerfile) with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct_dev
docker build . -t open_instruct_dev
# if you are internally at AI2, you can create an image like this:
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$beaker_user
```
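
To sanity-check the resulting image on a GPU machine, something like the following should work (assumes the NVIDIA container toolkit on the host; the image's entrypoint is a login bash shell, so `-c` runs a command in it):

```bash
docker run --rm --gpus all open_instruct_dev -c "nvidia-smi"
```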

Optionally, you can build the base image with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
```

* **Docker with uv**: You can also build a uv-based Docker image from [Dockerfile.uv](./Dockerfile.uv) with the following command:

```bash
docker build -f Dockerfile.uv -t open_instruct_dev_uv .
# if you are internally at AI2, you can create an image like this:
beaker image delete $(whoami)/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$(whoami)
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev_uv
beaker image create open_instruct_dev_uv -n open_instruct_dev_uv -w ai2/$beaker_user
```

If you are internal to AI2, you can launch experiments using our always-up-to-date, auto-built image `nathanl/open_instruct_auto`.
5 changes: 3 additions & 2 deletions open_instruct/dpo_tune.py
@@ -34,7 +34,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -499,10 +499,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
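
Context for this change: newer `accelerate` releases deprecate passing `use_seedable_sampler` directly to `Accelerator`, grouping dataloader options into a `DataLoaderConfiguration` instead (the same migration appears in `finetune.py` below). A standalone sketch of the pattern, with an illustrative gradient-accumulation value:

```python
from accelerate import Accelerator, DataLoaderConfiguration

# Dataloader behavior is grouped in one config object; use_seedable_sampler
# keeps dataset shuffling reproducible across restarts and resumed runs.
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)
accelerator = Accelerator(
    gradient_accumulation_steps=8,  # illustrative value
    dataloader_config=dataloader_config,
)
```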
5 changes: 3 additions & 2 deletions open_instruct/finetune.py
@@ -30,7 +30,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -483,10 +483,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
32 changes: 16 additions & 16 deletions open_instruct/ppo_vllm_thread_ray_gtrl.py
@@ -762,11 +762,11 @@ def train(
world_size = vllm_num_engines * vllm_tensor_parallel_size + 1
backend = args.vllm_sync_backend
# https://github.com/OpenRLHF/OpenRLHF/issues/313
if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
backend = "gloo"
print(
"Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
)
# if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
# backend = "gloo"
# print(
# "Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
# )
refs = [
engine.init_process_group.remote(
master_address,
@@ -1000,10 +1000,10 @@ def vllm_generate(

start_time = time.time()
broadcast_to_vllm()
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
else:
if training_step != 1:
@@ -1021,10 +1021,10 @@
datasets_next = data[DATASET_SOURCE_KEY]
start_time = time.time()
broadcast_to_vllm()
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
queries = queries_next
ground_truths = ground_truths_next
@@ -1614,11 +1614,11 @@ def main(args: Args, dataset_config: DatasetConfig, model_config: ModelConfig):
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_train_splits[0]} split for all datasets."
)
if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
)
# if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
# args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
# print(
# f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
# )
train_dataset = combine_dataset(
args.dataset_mixer_dict,
splits=args.dataset_train_splits,
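
Note on the block commented out above: eval splits are no longer broadcast from a single value, so the number of entries in `args.dataset_eval_splits` must match `args.dataset_eval_mixer_dict` up front. A hypothetical guard illustrating the new contract (this helper is not part of the diff):

```python
def check_eval_splits(eval_splits: list[str], eval_mixer: dict[str, float]) -> None:
    # Train splits are still broadcast above; eval splits no longer are.
    if len(eval_splits) != len(eval_mixer):
        raise ValueError(
            f"got {len(eval_splits)} eval splits for {len(eval_mixer)} eval "
            "datasets; pass one split per dataset in --dataset_eval_splits"
        )
```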