Feature: Add Support for GCP (#193)
* Initial Dockerfile draft

* Add new endpoints

* Build predict expertise function

* Expect rawPredict format

* Use entrypoint instead of cmd

* Fix entrypoint

* Changes to config

* Add production config

* Set containerized

* Remove Redis interaction if container

* Adjust model directories

* From NVIDIA container

* Fetch expertise utils

* Avoid artifact download

* Move artifact copying to entrypoint

* Add container flag

* Update container flag

* Remove type

* Add support for instances in request

* Move artifact loading

* Async load artifacts

* Add startup route to check for artifact loading

* Don't call artifacts

* Allow token pass in body

* Pass token

* Properly handle no token

* Isolate token

* Load into dict if not a dict

* Add flag to block on artifacts

* Rollback blocking

* Block on loading artifacts

* Log model ready

* Log URI and bucket

* Point to /app

* Fix return schema

* Index into predictions list

* Fix blob prefix

* Remove unused script

* Fix prefix parsing

* Support reviewer_ids

* Fix reviewer IDs bug

* Fix bug in expertise invitation for reviewerIds

* Merge instances on reviewerIds

* Return list of predictions

* Parsing must happen in routes

* Check correctly formed dataset

* Fix subscriptable bug

* Remove prod config

* Add retry safety

* Validate dataset creation

* Support count in validation

* Get entityA properly

* Move statements

* Fix Path bug

* Use sub IDs for validation

* Fix convert field to path

* Add failure explanation

* Create execute_pipeline.py

* Absolute import

* Fix script

* Upload results to bucket

* Fix prefix

* Avoid installing SPECTER deps

* Remove cd into specter

* Draft push action

* Remove VertexParser

* Remove /predict

* Remove import

* Remove /predict func

* Remove import

* Use auth@v2

Co-authored-by: Carlos Daniel Mondragón Chapa <[email protected]>

* Bump Miniconda

* Dump metadata

* Push new action

* Trigger on push to branch

* Use absolute path

* Use proper path

* Fix argparse

* Only try script

* Clean up credentials again

* Add some logging

* Fix execute and skip build

* Add GCPInterface

* Add interface tests

* Add pipeline tests

* Reduce image size

* Use logger

* fix tests

* Use workflow dispatch

* Parameterize pipeline name

* Try fix gensim tokenizers

* Revert "Try fix gensim tokenizers"

This reverts commit 7ef18ce.

* Update image to 3.11

* Install openreview-py afterwards

* Start implementing cloud service

* Fix deps

* Add logging

* Finish service tests

* Clear Redis at beginning of test

* Use filesystem for GCP mock

* Wait until complete

* Log Cloud ID

* Separate contexts

* Log service class

* Patch run_once

* Isolate cloud queue

* Fix variables

* Change publication count

* Log arg list

* update count

* Reduce code reusage

* Clean up comments

---------

Co-authored-by: Carlos Daniel Mondragón Chapa <[email protected]>
haroldrubio and carlosmondra authored Jan 3, 2025
1 parent 7dc429f commit 2974db2
Showing 15 changed files with 2,466 additions and 153 deletions.
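
The commits above ("Expect rawPredict format", "Add support for instances in request", "Allow token pass in body", "Return list of predictions") outline the request contract of the containerized service. The following is a minimal client sketch of what such a call might look like; the endpoint ID, region, and the exact fields inside each instance are illustrative assumptions, not the confirmed API.

# Hedged sketch of calling the containerized expertise service through a Vertex AI endpoint.
# The endpoint ID, region, and instance fields below are assumptions for illustration only.
import google.auth
import google.auth.transport.requests
import requests

PROJECT = "sunlit-realm-131518"
REGION = "us-central1"           # assumed serving region
ENDPOINT_ID = "1234567890"       # hypothetical Vertex AI endpoint ID

credentials, _ = google.auth.default()
credentials.refresh(google.auth.transport.requests.Request())

url = (
    f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT}"
    f"/locations/{REGION}/endpoints/{ENDPOINT_ID}:rawPredict"
)

body = {
    "instances": [
        {
            "token": "<openreview-api-token>",  # token passed in the body, per "Allow token pass in body"
            # ...remaining expertise job configuration fields would go here...
        }
    ]
}

response = requests.post(url, json=body, headers={"Authorization": f"Bearer {credentials.token}"})
print(response.json())  # expected to contain a list of predictions, per "Return list of predictions"
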
72 changes: 72 additions & 0 deletions .github/workflows/push-image.yml
@@ -0,0 +1,72 @@
# This workflow builds and pushes the expertise image to the Artifact Registry

name: push-workflow-image

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
env:
  REGION: us
  KFP_REGION: us-central1
  KFP_REPO: openreview-kfp
  REPO: openreview-docker-images
  PROJECT: sunlit-realm-131518
  IMAGE: expertise-test
  TAG: latest

jobs:
  push-workflow-image:
    # Allow the job to fetch a GitHub ID token
    permissions:
      id-token: write
      contents: read
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Add SSH key
        run: |
          mkdir -p /home/runner/.ssh
          echo "${{ secrets.GCLOUD_SSH_KEY }}" > /home/runner/.ssh/google_compute_engine
          echo "${{ secrets.GCLOUD_SSH_KEY_PUB }}" > /home/runner/.ssh/google_compute_engine.pub
          chmod 600 /home/runner/.ssh/google_compute_engine
          chmod 600 /home/runner/.ssh/google_compute_engine.pub
      - name: Authenticate with Google Cloud
        id: auth
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
          create_credentials_file: true
          cleanup_credentials: true
          export_environment_variables: true
      - name: Set Image Tag
        run: echo "IMAGE_TAG=$REGION-docker.pkg.dev/$PROJECT/$REPO/$IMAGE" >> $GITHUB_ENV
      - name: Setup gcloud
        uses: google-github-actions/setup-gcloud@v1
      - name: Setup Docker authentication
        run: gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
      - name: Setup Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: '3.9'
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install kfp
      - name: Build and push image
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ env.IMAGE_TAG }}
      #- name: Run pipeline script
      #  run: |
      #    python expertise/build_pipeline.py \
      #      --region "${{ env.REGION }}" \
      #      --kfp_region "${{ env.KFP_REGION }}" \
      #      --project "${{ env.PROJECT }}" \
      #      --repo "${{ env.REPO }}" \
      #      --kfp_repo "${{ env.KFP_REPO }}" \
      #      --image "${{ env.IMAGE }}" \
      #      --tag "${{ env.TAG }}"
55 changes: 55 additions & 0 deletions Dockerfile
@@ -0,0 +1,55 @@
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04

WORKDIR /app

ENV PYTHON_VERSION=3.11 \
    HOME="/app" \
    PATH="/app/miniconda/bin:${PATH}" \
    FLASK_ENV=production \
    AIP_STORAGE_URI="gs://openreview-expertise/expertise-utils/" \
    SPECTER_DIR="/app/expertise-utils/specter/" \
    MFR_VOCAB_DIR="/app/expertise-utils/multifacet_recommender/feature_vocab_file" \
    MFR_CHECKPOINT_DIR="/app/expertise-utils/mfr_model_checkpoint/"

COPY . /app/openreview-expertise

RUN apt-get update && apt-get install -y --no-install-recommends \
        wget \
        curl \
        ca-certificates \
        git \
        build-essential \
    && rm -rf /var/lib/apt/lists/* \
    \
    && cd $HOME \
    && wget "https://repo.anaconda.com/miniconda/Miniconda3-py311_24.9.2-0-Linux-x86_64.sh" -O miniconda.sh \
    && echo "62ef806265659c47e37e22e8f9adce29e75c4ea0497e619c280f54c823887c4f miniconda.sh" | sha256sum -c - \
    && bash miniconda.sh -b -p $HOME/miniconda \
    && rm miniconda.sh \
    \
    && conda update -y conda \
    && conda create -y -n expertise python=$PYTHON_VERSION -c conda-forge \
    \
    && . $HOME/miniconda/etc/profile.d/conda.sh \
    && conda activate expertise \
    && conda install pytorch pytorch-cuda=12.4 -c pytorch -c nvidia \
    && conda install -y filelock intel-openmp faiss-cpu -c pytorch \
    && python -m pip install --no-cache-dir -e $HOME/openreview-expertise \
    && python -m pip install --no-cache-dir -I protobuf==3.20.1 \
    && python -m pip install --no-cache-dir numpy==1.26.4 --force-reinstall \
    && python -m pip install openreview-py --force-reinstall \
    && conda clean --all -y \
    && apt-get purge -y build-essential wget curl git \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Add conda environment bin to PATH so that 'python' uses the environment by default
ENV PATH="/app/miniconda/envs/expertise/bin:${PATH}"

RUN mkdir ${HOME}/expertise-utils \
    && cp ${HOME}/openreview-expertise/expertise/service/config/default_container.cfg \
        ${HOME}/openreview-expertise/expertise/service/config/production.cfg

EXPOSE 8080

ENTRYPOINT ["python", "-m", "expertise.service", "--host", "0.0.0.0", "--port", "8080", "--container"]
105 changes: 105 additions & 0 deletions expertise/build_pipeline.py
@@ -0,0 +1,105 @@
# pip install kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import pipeline
from kfp.registry import RegistryClient
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Builds and Uploads a Kubeflow Pipeline for the Expertise Model")
    parser.add_argument(
        "--region",
        type=str,
        required=True,
        help="Region for Docker Images in Artifact Registry"
    )
    parser.add_argument(
        "--kfp_region",
        type=str,
        required=True,
        help="Region for Kubeflow Pipelines in Artifact Registry"
    )
    parser.add_argument(
        "--project",
        type=str,
        required=True,
        help="GCP Project ID"
    )
    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="Name of the Artifact Registry Docker Repository"
    )
    parser.add_argument(
        "--kfp_repo",
        type=str,
        required=True,
        help="Name of the Artifact Registry Kubeflow Repository"
    )
    parser.add_argument(
        "--kfp_name",
        type=str,
        required=True,
        help="Name of the Kubeflow Pipeline"
    )
    parser.add_argument(
        "--image",
        type=str,
        required=True,
        help="Name of the Docker Image"
    )
    parser.add_argument(
        "--tag",
        type=str,
        required=False,
        default='latest',
        help="Tag of the Docker Image"
    )
    args = parser.parse_args()

    @dsl.container_component
    def execute_expertise_pipeline_op(job_config: str):
        return dsl.ContainerSpec(
            image=f'{args.region}-docker.pkg.dev/{args.project}/{args.repo}/{args.image}:{args.tag}',
            command=['python', '-m', 'expertise.execute_pipeline'],
            args=[job_config]
        )

    @pipeline(
        name=args.kfp_name,
        description='Processes request for user-paper expertise scores'
    )
    def expertise_pipeline(job_config: str):
        import os
        # Setting environment variables within the function
        os.environ["AIP_STORAGE_URI"] = "gs://openreview-expertise/expertise-utils/"
        os.environ["SPECTER_DIR"] = "/app/expertise-utils/specter/"
        os.environ["MFR_VOCAB_DIR"] = "/app/expertise-utils/multifacet_recommender/feature_vocab_file"
        os.environ["MFR_CHECKPOINT_DIR"] = "/app/expertise-utils/multifacet_recommender/mfr_model_checkpoint/"
        op = (execute_expertise_pipeline_op(job_config=job_config)
              .set_cpu_limit('4')
              .set_memory_limit('32G')
              .add_node_selector_constraint('NVIDIA_TESLA_T4')
              .set_accelerator_limit('1')
        )

    compiler.Compiler().compile(
        pipeline_func=expertise_pipeline,
        package_path='expertise_pipeline.yaml'
    )

    client = RegistryClient(host=f"https://{args.kfp_region}-kfp.pkg.dev/{args.project}/{args.kfp_repo}")
    client.delete_tag(
        args.kfp_name,
        'latest'
    )

    tags = [args.tag]
    if 'latest' not in tags:
        tags.append('latest')
    templateName, versionName = client.upload_pipeline(
        tags=tags,
        file_name="expertise_pipeline.yaml"
    )
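
Once the template is uploaded to the Kubeflow registry, a run could be launched through Vertex AI Pipelines. Below is a minimal sketch under the assumption that the pipeline was uploaded as expertise-pipeline with a latest tag; the display name, staging bucket, and serialized job_config are placeholders, not values taken from this commit.

# Hedged sketch: launching a run of the uploaded pipeline template on Vertex AI Pipelines.
# Pipeline name, staging bucket, and the job_config payload are illustrative assumptions.
from google.cloud import aiplatform

aiplatform.init(project="sunlit-realm-131518", location="us-central1")

job = aiplatform.PipelineJob(
    display_name="expertise-run",  # assumed display name
    template_path="https://us-central1-kfp.pkg.dev/sunlit-realm-131518/openreview-kfp/expertise-pipeline/latest",
    pipeline_root="gs://openreview-expertise/pipeline-root",  # assumed staging bucket
    parameter_values={"job_config": "<serialized job configuration>"},
)
job.submit()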
