
uv2 #492

Merged 6 commits on Jan 10, 2025

73 changes: 5 additions & 68 deletions Dockerfile
@@ -1,42 +1,4 @@
ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
@@ -50,50 +12,25 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
gpustat \
jupyter \
beaker-gantry \
oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]

WORKDIR /stage/

# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
COPY requirements.txt .
RUN pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
RUN pip install packaging
RUN pip install flash-attn==2.6.3 --no-build-isolation
RUN pip install flash-attn==2.7.2.post1 --no-build-isolation
COPY requirements.txt .
RUN pip install -r requirements.txt

# NLTK download
RUN python -m nltk.downloader punkt

COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

80 changes: 80 additions & 0 deletions Dockerfile.base
@@ -0,0 +1,80 @@
# Run the following command to build `ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04`
# docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
man-db \
manpages \
manpages-dev \
manpages-posix \
manpages-posix-dev \
sudo \
unzip \
vim \
wget \
fish \
parallel \
iputils-ping \
htop \
emacs \
zsh \
rsync \
tmux

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 24.01-0.3.3.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# Install Docker CLI. Version matches Beaker on-premise servers.
RUN curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-20.10.21.tgz -o docker.tgz \
&& sudo tar xzvf docker.tgz --strip 1 -C /usr/local/bin docker/docker \
&& rm docker.tgz

# Install Beaker
ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]
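
For reference, a minimal sketch of building and smoke-testing this base image locally, following the comment at the top of the file (the `BEAKER_VERSION` value is a placeholder; substitute a real Beaker release):

```bash
# Build the base image; CUDA/TARGET/DIST mirror the comment in Dockerfile.base.
docker build \
  --build-arg CUDA=12.1.0 \
  --build-arg TARGET=cudnn8-devel \
  --build-arg DIST=ubuntu20.04 \
  --build-arg BEAKER_VERSION=<version> \
  -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04

# The entrypoint is a login bash shell, so -c runs a command inside it.
docker run --rm cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04 -c "nvcc --version && beaker --version"
```
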
37 changes: 37 additions & 0 deletions Dockerfile.uv
@@ -0,0 +1,37 @@
FROM ghcr.io/allenai/oi-cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV UV_COMPILE_BYTECODE=1

# setup files
WORKDIR /stage/

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project --extra compile

COPY open_instruct open_instruct
COPY eval eval
COPY configs configs
COPY scripts scripts
# hack: only copy oe-eval-internal if it exists
COPY mason.py oe-eval-internal* /stage/
COPY .git/ ./.git/
COPY pyproject.toml uv.lock .
RUN chmod +x scripts/*

# install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --extra compile

# for interactive session
RUN chmod -R 777 /stage/

# uv dockerfile magic: place executables in the environment at the front of the path
ENV PATH=/stage/.venv/bin:$PATH
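
A note on two patterns above: the two dependency-only `uv sync --frozen --no-install-project` runs happen before any project source is copied, so Docker caches those layers across code-only changes, and the glob in `COPY mason.py oe-eval-internal* /stage/` succeeds even when the optional `oe-eval-internal` directory is absent, since a `COPY` fails only when none of its sources match. A minimal sketch of building and sanity-checking this image (the tag name is arbitrary):

```bash
docker build -f Dockerfile.uv -t open_instruct_dev_uv .
# /stage/.venv/bin is first on PATH, so python resolves to the uv-managed venv.
docker run --rm open_instruct_dev_uv -c "python -c 'import open_instruct'"
```
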
35 changes: 29 additions & 6 deletions README.md
@@ -51,22 +51,45 @@ Our setup mostly follows our [Dockerfile](./Dockerfile), which uses Python 3.10.
```bash
pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
pip install packaging
pip install flash-attn==2.6.3 --no-build-isolation
pip install flash-attn==2.7.2.post1 --no-build-isolation
pip install -r requirements.txt
python -m nltk.downloader punkt
pip install -e .
python -m nltk.downloader punkt
```

* **Local installation with uv (preview)**: We are experimenting with [uv](https://docs.astral.sh/uv/). You can install the dependencies via
```bash
uv sync
uv sync --extra compile # to install flash attention
```
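
Once synced, commands run inside the uv-managed virtual environment without manual activation, for example (assuming the training script accepts `--help` via its argument parser):

```bash
uv run python open_instruct/finetune.py --help
```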


* **Docker installation**: You can also build a Docker image from the [Dockerfile](./Dockerfile) with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct_dev
docker build . -t open_instruct_dev
# if you are internally at AI2, you can create an image like this:
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$beaker_user
```
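
To sanity-check the resulting image on a GPU machine, something like the following should work (assumes the NVIDIA container toolkit on the host; the image's entrypoint is a login bash shell, so `-c` runs a command in it):

```bash
docker run --rm --gpus all open_instruct_dev -c "nvidia-smi"
```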

Optionally, you can build the base image with the following command:

```bash
docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 -f Dockerfile.base . -t cuda-no-conda:12.1-cudnn8-dev-ubuntu20.04
```

* **Docker with uv**: You can also build a uv-based Docker image from [Dockerfile.uv](./Dockerfile.uv) with the following command:

```bash
docker build -f Dockerfile.uv -t open_instruct_dev_uv .
# if you are internally at AI2, you can create an image like this:
beaker image delete $(whoami)/open_instruct_dev
beaker image create open_instruct_dev -n open_instruct_dev -w ai2/$(whoami)
beaker_user=$(beaker account whoami --format json | jq -r '.[0].name')
beaker image delete $beaker_user/open_instruct_dev_uv
beaker image create open_instruct_dev_uv -n open_instruct_dev_uv -w ai2/$beaker_user
```

If you are internal to AI2, you can launch experiments using our always-up-to-date, auto-built image `nathanl/open_instruct_auto`.
5 changes: 3 additions & 2 deletions open_instruct/dpo_tune.py
@@ -34,7 +34,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -499,10 +499,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
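
Context for this change: newer `accelerate` releases deprecate passing `use_seedable_sampler` directly to `Accelerator`, grouping dataloader options into a `DataLoaderConfiguration` instead (the same migration appears in `finetune.py` below). A standalone sketch of the pattern, with an illustrative gradient-accumulation value:

```python
from accelerate import Accelerator, DataLoaderConfiguration

# Dataloader behavior is grouped in one config object; use_seedable_sampler
# keeps dataset shuffling reproducible across restarts and resumed runs.
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)
accelerator = Accelerator(
    gradient_accumulation_steps=8,  # illustrative value
    dataloader_config=dataloader_config,
)
```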
5 changes: 3 additions & 2 deletions open_instruct/finetune.py
@@ -30,7 +30,7 @@
import deepspeed
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DataLoaderConfiguration
from accelerate.logging import get_logger
from accelerate.utils import InitProcessGroupKwargs, set_seed
from datasets import load_dataset
@@ -483,10 +483,11 @@ def main(args: FlatArguments):

# if you get timeouts (e.g. due to long tokenization) increase this.
timeout_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=args.timeout))
dataloader_config = DataLoaderConfiguration(use_seedable_sampler=True)

accelerator = Accelerator(
gradient_accumulation_steps=args.gradient_accumulation_steps,
use_seedable_sampler=True,
dataloader_config=dataloader_config,
**accelerator_log_kwargs,
kwargs_handlers=[timeout_kwargs],
)
32 changes: 16 additions & 16 deletions open_instruct/ppo_vllm_thread_ray_gtrl.py
@@ -762,11 +762,11 @@ def train(
world_size = vllm_num_engines * vllm_tensor_parallel_size + 1
backend = args.vllm_sync_backend
# https://github.com/OpenRLHF/OpenRLHF/issues/313
if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
backend = "gloo"
print(
"Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
)
# if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
# backend = "gloo"
# print(
# "Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
# )
refs = [
engine.init_process_group.remote(
master_address,
@@ -1000,10 +1000,10 @@ def vllm_generate(

start_time = time.time()
broadcast_to_vllm()
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
else:
if training_step != 1:
@@ -1021,10 +1021,10 @@
datasets_next = data[DATASET_SOURCE_KEY]
start_time = time.time()
broadcast_to_vllm()
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
if accelerator.is_main_process:
print(
f"🔥🔥🔥 Loading weights using shared memory; Time to load weights: {time.time() - start_time:.2f} seconds"
)
param_prompt_Q.put((None, remove_padding(global_queries, tokenizer.pad_token_id)))
queries = queries_next
ground_truths = ground_truths_next
@@ -1614,11 +1614,11 @@ def main(args: Args, dataset_config: DatasetConfig, model_config: ModelConfig):
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_train_splits[0]} split for all datasets."
)
if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
print(
f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
)
# if len(args.dataset_eval_splits) != len(args.dataset_eval_mixer_dict) and len(args.dataset_eval_splits) == 1:
# args.dataset_eval_splits = [args.dataset_eval_splits[0]] * len(args.dataset_eval_mixer_dict)
# print(
# f"Dataset splits not provided for all datasets. Using the same {args.dataset_eval_splits[0]} split for all datasets."
# )
train_dataset = combine_dataset(
args.dataset_mixer_dict,
splits=args.dataset_train_splits,
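
Note on the block commented out above: eval splits are no longer broadcast from a single value, so the number of entries in `args.dataset_eval_splits` must match `args.dataset_eval_mixer_dict` up front. A hypothetical guard illustrating the new contract (this helper is not part of the diff):

```python
def check_eval_splits(eval_splits: list[str], eval_mixer: dict[str, float]) -> None:
    # Train splits are still broadcast above; eval splits no longer are.
    if len(eval_splits) != len(eval_mixer):
        raise ValueError(
            f"got {len(eval_splits)} eval splits for {len(eval_mixer)} eval "
            "datasets; pass one split per dataset in --dataset_eval_splits"
        )
```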