lmtp: docker serve

lbeurerkellner · lbeurerkellner · commit 4525e623733e · 2023-08-07T11:09:37.000+02:00
diff --git a/.dockerignore b/.dockerignore
@@ -1,5 +1,6 @@
 api.env
 **/node_modules/*
+
 *.tokens
 
 dist
@@ -178,5 +179,9 @@ wip-snippets
 .lmql-algorithms-cache
 
 *.tokens
+
 scripts/Dockerfile*
+scripts/Dockerfile.serve
+scripts/lmql-serve-docker.py
 transformers-cache
+web/
diff --git a/scripts/Dockerfile.serve b/scripts/Dockerfile.serve
@@ -1,9 +1,8 @@
 FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
 
 ENV PYTHONUNBUFFERED=1 
-ENV PORT=8899
 
-# SYSTEM
+# NVIDIA GPU support
 RUN apt-get update --yes --quiet && DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \
     software-properties-common \
     build-essential apt-utils \
@@ -27,6 +26,7 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 9
 
 RUN pip install --upgrade pip
 
+# install lmql
 WORKDIR lmql
 VOLUME ~/.lmql/
 ARG GPU_ENABLED
@@ -35,13 +35,35 @@ HEALTHCHECK CMD curl --fail http://localhost:$PORT/ || exit 1
 
 RUN apt-get update
 
+# install release version of lmql (for dependencies)
+RUN pip install "lmql[hf]"
+
 COPY . /lmql
-# install lmql from source
+# re-install lmql from source 
 WORKDIR /lmql
 RUN pip install -e ".[hf]"
 
-EXPOSE $PORT
-
+VOLUME /transformers
 RUN ls /transformers
 
-ENTRYPOINT ["lmql", "serve-model", "--port", "$PORT", "--host", "0.0.0.0"]
+ENV LMQL_VERSION="latest"
+
+# check out the requested LMQL release tag (skipped when LMQL_VERSION is "latest")
+RUN if [ "$LMQL_VERSION" != "latest" ]; then git checkout tags/$LMQL_VERSION; fi
+
+# if EXTRA_PIP_PACKAGES is set, run.sh installs those packages when the container starts (e.g. bitsandbytes or auto_gptq), not at build time
+ENV EXTRA_PIP_PACKAGES=""
+
+# create run.sh 
+RUN echo "#!/bin/bash \n\
+    set -x \n\
+    # install extra pip packages \n\
+    if [ \"\$EXTRA_PIP_PACKAGES\" != \"\" ]; then \n\
+        pip install \$EXTRA_PIP_PACKAGES \n\
+    fi \n\
+    # start lmql \n\
+    lmql serve-model --port 8899 --host 0.0.0.0 --docker_hide_port \$@" > run.sh
+
+RUN chmod +x run.sh
+
+ENTRYPOINT ["./run.sh"]
diff --git a/scripts/lmql-serve-docker.py b/scripts/lmql-serve-docker.py
@@ -7,14 +7,22 @@ def has_docker_image():
     return os.system(cmd) == 0
 
 def build_docker_image():
-    cmd = """sudo docker build -t lmql-serve -f Dockerfile.serve ."""
+    ADDITIONAL_EXCLUDES = [
+    ]
+    cmd = """sudo docker build -t lmql-serve -f scripts/Dockerfile.serve ."""
     print(">", cmd)
     os.system(cmd)
 
-parser = argparse.ArgumentParser()
-parser.add_argument('--port', type=int, default=2223, help='Host port to expose the LMTP endpoint on')
-parser.add_argument('--gpus', type=str, default='all', help="GPUs to use, e.g. --gpu all, passed to 'docker run'")
-parser.add_argument('--transformers-cache', type=str, default='$HOME/.cache/huggingface/hub', help="Path to local directory to cache downloaded transformers models.")
+parser = argparse.ArgumentParser(description="""
+                                 Runs 'lmql serve-model' in a docker container.
+
+                                 This scripts passes all arguments to 'lmql serve-model' in the docker container, except for the following:
+                                 """.strip())
+parser.add_argument('--port', type=int, default=8080, help="Host port to expose the container's LMTP endpoint on. Default: 8080.")
+parser.add_argument('--gpus', type=str, default='all', help="GPUs to use, e.g. --gpu all, passed to 'docker run'. Default: all.")
+parser.add_argument('--transformers-cache', type=str, default='$HOME/.cache/huggingface/hub', help="Path to local directory to mount into the container as model cache.")
+parser.add_argument('--rebuild', action='store_true', help="Forces rebuilding the docker image")
+parser.add_argument('--extras', type=str, default='', help="Extra pip packages to install in the docker image before running lmql serve-model.")
 # all other args are passed to lmql-serve
 args, _ = parser.parse_known_args()
 
@@ -26,20 +34,23 @@ def build_docker_image():
         # otherwise, replace $HOME with current directory
         args.transformers_cache = args.transformers_cache.replace("$HOME", ".")
 
-if not has_docker_image():
+if not has_docker_image() or args.rebuild:
     build_docker_image()
 
 PORT=2223
 GPUS=all
 
 cmd = """sudo docker run \\
     -p $PORT:8899 \\
-    -e PORT=8899 \\
-    -e TRANSFORMERS_CACHE=/transformers \\
+    -e TRANSFORMERS_CACHE=/transformers $EXTRAS \\
     -it --gpus $GPUS \\
     -v $CACHE:/transformers \\
-    lmql-serve --cuda $@
-""".replace("$GPUS", args.gpus).replace("$PORT", str(args.port)).replace("$@", " ".join(_)).replace("$CACHE", args.transformers_cache)
+    lmql-serve $@
+ """.replace("$GPUS", args.gpus) \
+    .replace("$PORT", str(args.port)) \
+    .replace("$@", " ".join(_)) \
+    .replace("$CACHE", args.transformers_cache) \
+    .replace("$EXTRAS", f"-e EXTRA_PIP_PACKAGES='{args.extras}'" if args.extras != "" else "")
 
 print(">", cmd)
 os.system(cmd)
diff --git a/src/lmql/models/lmtp/lmtp_serve.py b/src/lmql/models/lmtp/lmtp_serve.py
@@ -5,6 +5,7 @@
 from .lmtp_inference_server import *
 from .utils import rename_model_args
 from .lmtp_balance import balance_main
+from .lmtp_layout import layout_main
 
 def serve(model_name, host="localhost", port=8080, cuda=False, dtype=None, static=False, loader=None, **kwargs):
     """
@@ -40,10 +41,12 @@ def lmtp_serve_main(model_args):
 
     # extract explicit arguments
     host = model_args.pop("host", "localhost")
-    port = model_args.pop("port", 8080)
+    port = int(model_args.pop("port", 8080))
     model = model_args.pop("model", None)
     single_thread = model_args.pop("single_thread", False)
     static = model_args.pop("static", False) or single_thread
+    # in Docker, don't print the port: the port inside the container is not the host port it is published on
+    docker_hide_port = model_args.pop("docker_hide_port", False)
     
     assert not single_thread or model != "auto", "Cannot use --single_thread mode with model 'auto'. Please specify a specific model to load."
 
@@ -68,7 +71,10 @@ async def stream(request):
     
     def web_print(*args):
         if len(args) == 1 and args[0].startswith("======== Running on"):
-            print(f"[Serving LMTP endpoint on ws://{host}:{port}/]")
+            if docker_hide_port:
+                print(f"[Serving LMTP endpoint on Docker container port]")
+            else:
+                print(f"[Serving LMTP endpoint on ws://{host}:{port}/]")
         else:
             print(*args)
     
@@ -82,7 +88,7 @@ def argparser(args):
     next_argument_name = None
     
     kwargs = {}
-    flag_args = ["cuda", "static", "single_thread"]
+    flag_args = ["cuda", "static", "single_thread", "docker_hide_port"]
 
     help_text = """
 usage: serve-model [-h] [--port PORT] [--host HOST] [--cuda] [--dtype DTYPE] [--[*] VALUE] model
@@ -96,6 +102,8 @@ def argparser(args):
   --host HOST
   --cuda
   --static      If set, the model cannot be switched on client request but remains fixed to the model specified in the model argument.
+  --single_thread Run the model on the main thread. This can lead to increased latency when processing multiple requests, but is necessary for some models that 
+                 cannot be run in the background.
   --dtype DTYPE  What format to load the model weights. Options: 'float16'
                  (not available on all models), '8bit' (requires bitsandbytes)
   --loader OPT  If set, the model will be loaded using the corresponding option. Useful for loading quantized modules in formats not
@@ -154,6 +162,10 @@ def cli(args=None):
         args = args[1:]
         balance_main(args)
         return
+    elif "--layout" in args:
+        # instead of running directly, with a layout we are launching the
+        # relevant worker subprocesses, which in turn call lmtp_serve_main
+        layout_main(args)
     else:
         args = argparser(args)
         lmtp_serve_main(args)