uv2 (#492)
* uv2

* bump dependencies

* optimize dockerfile

* add nltk

* push
vwxyzjn authored Jan 10, 2025
1 parent a3ea937 commit c95fbbe
Showing 10 changed files with 3,862 additions and 157 deletions.
73 changes: 5 additions & 68 deletions Dockerfile
@@ -1,42 +1,4 @@
ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
@@ -50,50 +12,25 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
gpustat \
jupyter \
beaker-gantry \
oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]

WORKDIR /stage/

# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
COPY requirements.txt .
RUN pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
RUN pip install packaging
RUN pip install flash-attn==2.6.3 --no-build-isolation
RUN pip install flash-attn==2.7.2.post1 --no-build-isolation
COPY requirements.txt .
RUN pip install -r requirements.txt

# NLTK download
RUN python -m nltk.downloader punkt

COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

80 changes: 80 additions & 0 deletions Dockerfile.base
@@ -0,0 +1,80 @@
# Run the following command to build `ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04`
# docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
man-db \
manpages \
manpages-dev \
manpages-posix \
manpages-posix-dev \
sudo \
unzip \
vim \
wget \
fish \
parallel \
iputils-ping \
htop \
emacs \
zsh \
rsync \
tmux

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 24.01-0.3.3.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# Install Docker CLI. Version matches Beaker on-premise servers.
RUN curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-20.10.21.tgz -o docker.tgz \
&& sudo tar xzvf docker.tgz --strip 1 -C /usr/local/bin docker/docker \
&& rm docker.tgz

# Install Beaker
ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]
37 changes: 37 additions & 0 deletions Dockerfile.uv
@@ -0,0 +1,37 @@
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV UV_COMPILE_BYTECODE=1

# setup files
WORKDIR /stage/

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project --extra compile

COPY open_instruct open_instruct
COPY eval eval
COPY configs configs
COPY scripts scripts
# hack: only copy oe-eval-internal if it exists
COPY mason.py oe-eval-internal* /stage/
COPY .git/ ./.git/
COPY pyproject.toml uv.lock .
RUN chmod +x scripts/*

# install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --extra compile

# for interactive session
RUN chmod -R 777 /stage/

# uv dockerfile magic: place executables in the environment at the front of the path
ENV PATH=/stage/.venv/bin:$PATH
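A note on the layering above: the two dependency-only `uv sync --frozen --no-install-project` steps bind-mount `uv.lock` and `pyproject.toml` instead of copying them, so Docker caches the third-party dependencies independently of source changes, and only the final `uv sync --frozen --extra compile` installs the project itself. Outside Docker, the equivalent sequence is roughly this sketch:

```bash
# Dependency layers first (cache-friendly), project install last.
uv sync --frozen --no-install-project                  # third-party deps from uv.lock only
uv sync --frozen --no-install-project --extra compile  # plus the flash-attn "compile" extra
uv sync --frozen --extra compile                       # finally install open-instruct itself
```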
35 changes: 29 additions & 6 deletions README.md
@@ -51,22 +51,45 @@ Our setup mostly follows our [Dockerfile](./Dockerfile), which uses Python 3.10.
```bash
pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
pip install packaging
pip install flash-attn==2.6.3 --no-build-isolation
pip install flash-attn==2.7.2.post1 --no-build-isolation
pip install -r requirements.txt
python -m nltk.downloader punkt
pip install -e .
python -m nltk.downloader punkt
```

* **Local installation with uv (preview)**: We are experimenting with using [uv](https://docs.astral.sh/uv/). You can install the dependencies via
```bash
uv sync
uv sync --extra compile # to install flash attention
```
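
As a quick sanity check (a sketch, not an official part of the setup), you can then run commands inside the uv-managed environment with `uv run`:

```bash
# `uv run` resolves the project virtual environment created by `uv sync`.
uv run python -c "import torch; print(torch.__version__)"
```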


* **Docker installation**: You can also use the Dockerfile to build a Docker image with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct_dev
docker build . -t open_instruct_dev
# if you are internally at AI2, you can create an image like this:
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$beaker_user
```

Optionally, you can build the base image with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
```
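
If you want `Dockerfile` and `Dockerfile.uv` to pick up a locally built base image, one option is to retag it under the name their `FROM` lines reference and push it (assuming you have push access to `ghcr.io/allenai`; otherwise substitute your own registry and adjust the `FROM` lines):

```bash
# Retag the locally built base image to the name referenced by the FROM lines, then push.
docker tag cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04 ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
docker push ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
```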

* **Docker with uv**: You can also build a Docker image with uv from `Dockerfile.uv` using the following command:

```bash
docker build -f Dockerfile.uv -t open_instruct_dev_uv .
# if you are internally at AI2, you can create an image like this:
beaker image delete $(whoami)/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$(whoami)
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev_uv
beaker image create open_instruct_dev_uv -n open_instruct_dev_uv -w ai2/$beaker_user
```
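
To smoke-test the resulting image (a sketch assuming a local NVIDIA GPU and the NVIDIA container runtime), drop into its login shell; the uv-managed virtual environment is already first on `PATH`:

```bash
# The base image's entrypoint is `bash -l`, so this opens an interactive shell.
docker run --rm --gpus all -it open_instruct_dev_uv
# Inside the container, /stage/.venv/bin is first on PATH:
python -c "import torch; print(torch.cuda.is_available())"
```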

If you are internally at AI2, you may launch experiments using our always-up-to-date auto-built image `nathanl/open_instruct_auto`.
5 changes: 3 additions & 2 deletions open_instruct/dpo_tune.py
@@ -35,7 +35,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -499,10 +499,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
5 changes: 3 additions & 2 deletions open_instruct/finetune.py
@@ -31,7 +31,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -483,10 +483,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
32 changes: 16 additions & 16 deletions open_instruct/ppo_vllm_thread_ray_gtrl.py
@@ -764,11 +764,11 @@ def train(
world_size = vllm_num_engines * vllm_tensor_parallel_size + 1
backend = args.vllm_sync_backend
# https://github.com/OpenRLHF/OpenRLHF/issues/313
if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
backend = "gloo"
print(
"Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
)
# if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
# backend = "gloo"
# print(
# "Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
# )
refs = [
engine.init_process_group.remote(
master_address,
@@ -1002,10 +1002,10 @@ def vllm_generate(

start_time = time.time()
broadcast_to_vllm()
print(
f"πŸ”₯πŸ”₯πŸ”₯ Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"πŸ”₯πŸ”₯πŸ”₯ Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
else:
if training_step != 1:
@@ -1023,10 +1023,10 @@
datasets_next = data[DATASET_SOURCE_KEY]
start_time = time.time()
broadcast_to_vllm()
print(
f"πŸ”₯πŸ”₯πŸ”₯ Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"πŸ”₯πŸ”₯πŸ”₯ Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
queries = queries_next
ground_truths = ground_truths_next
@@ -1616,11 +1616,11 @@ def main(args: Args, dataset_config: DatasetConfig, model_config: ModelConfig):
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_train_splits[0]} split for all datasets."
)
if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
)
# if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
# args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
# print(
# f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
# )
train_dataset = combine_dataset(
args.dataset_mixer_dict,
splits=args.dataset_train_splits,