Skip to content

Commit

Permalink
Merge pull request spotify#220 from spotify/dsimon/restore-config-mai…
Browse files Browse the repository at this point in the history
…n-session

[cli,lib] load config from main session
  • Loading branch information
econchick authored Oct 20, 2021
2 parents f989e71 + 5762132 commit c8f0b19
Show file tree
Hide file tree
Showing 38 changed files with 86 additions and 293 deletions.
23 changes: 14 additions & 9 deletions cli/src/klio_cli/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@

import logging
import os
import tempfile

import docker

from klio_core.config import core as config_core

from klio_cli.utils import docker_utils


Expand All @@ -30,6 +29,8 @@ class BaseDockerizedPipeline(object):
CONTAINER_GCP_CRED_PATH = os.path.join("/usr", GCP_CRED_FILE)
CONTAINER_JOB_DIR = "/usr/src/app"
DOCKER_LOGGER_NAME = "klio.base_docker_pipeline"
# path where the temp config-file is mounted into klio-exec's container
MATERIALIZED_CONFIG_PATH = "/usr/src/config/materialized_config.yaml"

def __init__(self, job_dir, klio_config, docker_runtime_config):
self.job_dir = job_dir
Expand All @@ -41,6 +42,7 @@ def __init__(self, job_dir, klio_config, docker_runtime_config):
# if this is set to true, running the command will generate a temp file
# and mount it to the container
self.requires_config_file = True
self.materialized_config_file = None

@property
def _full_image_name(self):
Expand Down Expand Up @@ -98,6 +100,12 @@ def _get_volumes(self):
},
}

if self.materialized_config_file is not None:
volumes[self.materialized_config_file.name] = {
"bind": BaseDockerizedPipeline.MATERIALIZED_CONFIG_PATH,
"mode": "rw",
}

return volumes

def _get_command(self, *args, **kwargs):
Expand All @@ -108,10 +116,7 @@ def _add_base_args(self, command):
command.extend(
[
"--config-file",
os.path.join(
self.CONTAINER_JOB_DIR,
config_core.RUN_EFFECTIVE_CONFIG_FILE,
),
BaseDockerizedPipeline.MATERIALIZED_CONFIG_PATH,
]
)
return command
Expand Down Expand Up @@ -171,10 +176,10 @@ def _check_gcp_credentials_exist(self):

def _write_effective_config(self):
if self.requires_config_file:
path = os.path.join(
self.job_dir, config_core.RUN_EFFECTIVE_CONFIG_FILE
self.materialized_config_file = tempfile.NamedTemporaryFile(
prefix="/tmp/", mode="w", delete=False
)
self.klio_config.write_to_file(path)
self.klio_config.write_to_file(self.materialized_config_file)

def run(self, *args, **kwargs):
# bail early
Expand Down
6 changes: 0 additions & 6 deletions cli/src/klio_cli/commands/job/utils/templates/dockerfile.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,10 @@ COPY __init__.py \
job-requirements.txt \
setup.py \
MANIFEST.in \
klio-job.yaml \
# Include any other non-Python files your job needs
{%- endif %}
/usr/src/app/

ARG KLIO_CONFIG=klio-job.yaml
{% if not klio.use_fnapi -%}
COPY $KLIO_CONFIG klio-job-run-effective.yaml

RUN pip install .
{%- else -%}
COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml
{% endif -%}
5 changes: 2 additions & 3 deletions cli/src/klio_cli/commands/job/utils/templates/setup.py.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,13 @@ setuptools.setup(
# approach to install deps (i.e. deps that don't require OS-level deps
# and/or not from a private PyPI)
install_requires=[], # optional
# NOTE: The `klio-job.yaml` and any other non-Python files
# required to run a Klio job *must* be listed in here in `data_files`
# NOTE: Any other non-Python files required to run a Klio job *must* be
# listed in here in `data_files`
data_files=[ # required
# tuple(
# str(dir where to install files, relative to Python modules),
# list(str(non-Python filenames))
# )
(".", ["klio-job-run-effective.yaml"]),
],
include_package_data=True, # required
# NOTE: Explicitly include Python modules names (all relevant Python files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,3 @@ COPY __init__.py \
transforms.py \
/usr/src/app/

ARG KLIO_CONFIG=klio-job.yaml
COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@ COPY __init__.py \
job-requirements.txt \
setup.py \
MANIFEST.in \
klio-job.yaml \
# Include any other non-Python files your job needs
/usr/src/app/

ARG KLIO_CONFIG=klio-job.yaml
COPY $KLIO_CONFIG klio-job-run-effective.yaml

RUN pip install .
RUN pip install .
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,13 @@ def run(self):
# approach to install deps (i.e. deps that don't require OS-level deps
# and/or not from a private PyPI)
install_requires=[], # optional
# NOTE: The `klio-job.yaml` and any other non-Python files
# required to run a Klio job *must* be listed in here in `data_files`
# NOTE: Any other non-Python files required to run a Klio job *must* be
# listed in here in `data_files`
data_files=[ # required
# tuple(
# str(dir where to install files, relative to Python modules),
# list(str(non-Python filenames))
# )
(".", ["klio-job-run-effective.yaml"]),
],
include_package_data=True, # required
# NOTE: Explicitly include Python modules names (all relevant Python files
Expand Down
2 changes: 1 addition & 1 deletion cli/tests/commands/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def test_get_docker_runflags(

if requires_config:
expected_command.extend(
["--config-file", "/usr/src/app/klio-job-run-effective.yaml"]
["--config-file", "/usr/src/config/materialized_config.yaml"]
)

exp_runflags = {
Expand Down
12 changes: 0 additions & 12 deletions core/src/klio_core/config/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,6 @@
logger = logging.getLogger("klio")


# path used by both klio-cli and klioexec to write and read the config when
# running a job
RUN_EFFECTIVE_CONFIG_FILE = "klio-job-run-effective.yaml"
RUN_EFFECTIVE_CONFIG_PATH = os.path.join(
"/usr/src/app", RUN_EFFECTIVE_CONFIG_FILE
)

# location where workers expect to load the config file from when using setup.py
WORKER_RUN_EFFECTIVE_CONFIG_PATH = os.path.join(
"/usr/local", RUN_EFFECTIVE_CONFIG_FILE
)

WORKER_DISK_TYPE_URL = (
"compute.googleapis.com/projects/{project}/regions/{region}/"
"diskTypes/{disk_type}"
Expand Down
6 changes: 3 additions & 3 deletions docs/src/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def find_version(*file_paths):
# -- Set up build env

# Set this env before building the docs as it's used to avoid loading the
# `/usr/src/config/.effective-klio-job.yaml` when klio decorators are
# imported (to grab their docs for `autodocs` functionality). When set,
# a mock object of KlioContext is returned (in `klio.transforms.decorators`).
# `KlioConfig` object when klio decorators are imported (to grab their docs for
# `autodocs` functionality). When set, a mock object of KlioContext is returned
# (in `klio.transforms.decorators`).
os.environ["KLIO_DOCS_MODE"] = "true"

# -- Project information -----------------------------------------------------
Expand Down
23 changes: 5 additions & 18 deletions docs/src/faqs/migrate_from_fnapi.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ explicitly including non-Python files needed for a job (i.e. a model, a JSON sch
description="My example job using setup.py", # optional
install_requires=["tensorflow"], # optional
data_files=[ # required
(".", ["klio-job-run-effective.yaml", "my-model.h5"]),
(".", ["my-model.h5"]),
],
include_package_data=True, # required
py_modules=["run", "transforms"], # required
Expand Down Expand Up @@ -184,31 +184,25 @@ Update: ``Dockerfile``
Required Changes
~~~~~~~~~~~~~~~~

1. **ADD** ``klio-job.yaml`` to be copied into ``/usr/src/app``.

.. collapsible:: Why is this needed?

We need to include Klio's configuration, but when creating a package of the job, the configuration must be within the same directory ``setup.py`` is in (subdirectories are fine). Relatedly, multi-configuration is not yet supported without the FnAPI since Klio expects the job configuration in a location that we can't manipulate with the ``setup.py`` approach.

2. **ADD** the newly required files to be copied over - ``setup.py`` and ``MANIFEST.in`` - into the working directory, ``/usr/src/app``.
1. **ADD** the newly required files to be copied over - ``setup.py`` and ``MANIFEST.in`` - into the working directory, ``/usr/src/app``.

.. collapsible:: Why is this needed?

``setup.py`` and ``MANIFEST.in`` are needed to tell Klio and Dataflow how to build your pipeline as a Python package (i.e. what Python and non-Python files to include) since you're no longer using a Docker image as a "package" for your job.

3. **DOUBLE CHECK** any non-Python files needed for the job, e.g. models, JSON schemas, etc, are copied into the working directory, ``/usr/src/app``.
2. **DOUBLE CHECK** any non-Python files needed for the job, e.g. models, JSON schemas, etc, are copied into the working directory, ``/usr/src/app``.

.. collapsible:: Why is this needed?

Klio packages up your job to be installed (for unit tests, audits, and running on the direct runner), and to be uploaded to Dataflow locally on the job's worker image. Therefore, the Docker image needs to have all the required Python and non-Python files to run the job.

4. **ADD** the following line to the end of the file: ``RUN pip install .``
3. **ADD** the following line to the end of the file: ``RUN pip install .``

.. collapsible:: Why is this needed?

We install the package for the ability to run unit tests via ``klio job test``, run audits via ``klio job audit``, and - if needed - to run the job with Direct Runner.

5. **DOUBLE CHECK** that you ``COPY`` in your ``job-requirements.txt`` file into the image (it should already exist if the job was made via ``klio job create``). It can be grouped into one ``COPY`` line like the example below.
4. **DOUBLE CHECK** that you ``COPY`` in your ``job-requirements.txt`` file into the image (it should already exist if the job was made via ``klio job create``). It can be grouped into one ``COPY`` line like the example below.

.. collapsible:: Example of Required Changes

Expand All @@ -218,7 +212,6 @@ Required Changes
+ setup.py \
+ MANIFEST.in \
+ my-model.h5 \
+ klio-job.yaml \
+ job-requirements.txt \
run.py \
transforms.py \
Expand Down Expand Up @@ -246,7 +239,6 @@ The following is a collection of suggested changes to optimize Docker builds by
Note: Keeping ``pip install --upgrade pip setuptools`` (or similar) is still advised.

* **DELETE** any lines creating ``/usr/src/config``, i.e. ``RUN mkdir -p /usr/src/config``.
* **DELETE** the two lines ``ARG KLIO_CONFIG=klio-job.yaml`` and ``COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml``.


.. collapsible:: Example of Suggested Changes
Expand Down Expand Up @@ -275,8 +267,6 @@ The following is a collection of suggested changes to optimize Docker builds by
my-model.h5 \
/usr/src/app/
- ARG KLIO_CONFIG=klio-job.yaml
- COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml
.. collapsible:: Combined Example of Required & Suggested Changes

Expand All @@ -303,13 +293,10 @@ The following is a collection of suggested changes to optimize Docker builds by
+ MANIFEST.in \
+ job-requirements.txt \
+ my-model.h5 \
+ klio-job.yaml \
run.py \
transforms.py \
/usr/src/app/
- ARG KLIO_CONFIG=klio-job.yaml
- COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml
+ RUN pip install .
.. _source distribution: https://packaging.python.org/guides/distributing-packages-using-setuptools/#source-distributions
Expand Down
2 changes: 1 addition & 1 deletion docs/src/faqs/run_on_kubernetes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Step 3: Update Dockerfile
.. code-block:: Docker
ARG KLIO_CONFIG=klio-job.yaml
COPY $KLIO_CONFIG klio-job-run-effective.yaml
COPY $KLIO_CONFIG klio-job.yaml
Step 4: Update ``klio-job.yaml``
Expand Down
5 changes: 5 additions & 0 deletions docs/src/reference/cli/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ Fixed

* Correctly validate existence of Dataflow-related Klio config when running on Dataflow (and not just "not --direct-runner").

Changed
*******

* When running a job, effective config is no longer written to ``klio-job-run-effective.yaml``, but instead to a temp file. This file no longer needs to be included in ``setup.py`` projects (See `PR 233 <https://github.com/spotify/klio/pull/233>`_).

.. end-21.10.0
.. _cli-21.9.0:
Expand Down
15 changes: 15 additions & 0 deletions docs/src/reference/lib/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
Changelog
=========

.. _lib-21.10.0:

21.10.0 (UNRELEASED)
--------------------

.. start-21.10.0
Changed
*******

* ``KlioConfig`` is now loaded on workers from pickled main session instead of a bundled config file (See `PR 233 <https://github.com/spotify/klio/pull/233>`_).

.. end-21.10.0
.. _lib-21.9.0:

21.9.0 (2021-10-12)
Expand Down
4 changes: 2 additions & 2 deletions examples/audio_spectrograms/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def run(self):
# approach to install deps (i.e. deps that don't require OS-level deps
# and/or not from internal PyPI)
install_requires=[], # optional
# NOTE: The `klio-job.yaml` and any other non-Python files
# required to run a Klio job *must* be listed in here in `data_files`
# NOTE: Any other non-Python files required to run a Klio job *must* be
# listed in here in `data_files`
data_files=[ # required
# tuple(
# str(dir where to install files, relative to Python modules),
Expand Down
2 changes: 0 additions & 2 deletions examples/catvdog/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@ COPY __init__.py \
transforms.py \
/usr/src/app/

ARG KLIO_CONFIG=klio-job.yaml
COPY $KLIO_CONFIG /usr/src/config/.effective-klio-job.yaml
35 changes: 0 additions & 35 deletions exec/src/klio_exec/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from klio.transforms import helpers
from klio_core import __version__ as klio_core_version
from klio_core import variables as var
from klio_core.config import core as config_core

from klio_exec import __version__ as klio_exec_version

Expand Down Expand Up @@ -279,34 +278,6 @@ def _get_run_callable(self):
)
raise SystemExit(1)

def _write_run_effective_config(self):
# this method assumes setup.py is being used!
if self.runtime_conf.direct_runner:
path = config_core.WORKER_RUN_EFFECTIVE_CONFIG_PATH
else:
path = config_core.RUN_EFFECTIVE_CONFIG_PATH
logging.debug(
"Writing runtime configuration to {}"
" in the job's running docker container.".format(path)
)
with open(path, "w") as f:
f.write(RUN_CONFIG_PREAMBLE)
self.config.write_to_file(f)

def _verify_setup_py(self):
# verify that setup.py has a reference to the runtime config file
data_files_line = f'(".", ["{config_core.RUN_EFFECTIVE_CONFIG_FILE}"])'
setup_file = self.config.pipeline_options.setup_file
with open(setup_file, "r") as r:
data = r.read()
if config_core.RUN_EFFECTIVE_CONFIG_FILE not in data:
logging.warning(
"Reference to 'klio-job-run-effective.yaml'"
" appears to be missing in 'setup.py'. Please"
" ensure that the 'data_files' list includes "
f"the tuple: {data_files_line}."
)

def _verify_packaging(self):
pipeline_opts = self.config.pipeline_options
experiments = pipeline_opts.experiments
Expand Down Expand Up @@ -344,12 +315,6 @@ def _verify_packaging(self):
)
raise SystemExit(1)

if setup_file_exists:
# when using setup.py, dump the current config to a runtime config
# file to be included in the distribution package
self._verify_setup_py()
self._write_run_effective_config()

def _setup_data_io_filters(self, in_pcol, label_prefix=None):
# label prefixes are required for multiple inputs (to avoid label
# name collisions in Beam)
Expand Down
Loading

0 comments on commit c8f0b19

Please sign in to comment.