Skip to content

Commit

Permalink
[PSM Interop] Increase CSM NEG annotation timeout; improve wording (g…
Browse files Browse the repository at this point in the history
…rpc#34555)

- GAMMA server runner: increase the wait time for the NEG annotation
from 1 minute to 3.
- Improve the wording around the wait for NEG methods to make it clear
this is the annotation we're waiting for - so it's not confused with
getting the NEG health from the GCP APIs.

ref b/298501683, b/302723651
  • Loading branch information
sergiitk authored Oct 4, 2023
1 parent 0a3bc67 commit 96f36b6
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 28 deletions.
2 changes: 1 addition & 1 deletion tools/run_tests/xds_k8s_test_driver/bin/run_td_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def main(
k8s_api_manager, server_namespace
)

neg_name, neg_zones = k8s_namespace.get_service_neg(
neg_name, neg_zones = k8s_namespace.parse_service_neg_status(
server_name, server_port
)

Expand Down
58 changes: 48 additions & 10 deletions tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
_api: KubernetesApiManager
_name: str

NEG_STATUS_META = "cloud.google.com/neg-status"
NEG_STATUS_ANNOTATION = "cloud.google.com/neg-status"
DELETE_GRACE_PERIOD_SEC: int = 5
WAIT_SHORT_TIMEOUT_SEC: int = 60
WAIT_SHORT_SLEEP_SEC: int = 1
Expand Down Expand Up @@ -771,7 +771,7 @@ def wait_for_namespace_deleted(
)
retryer(self.get)

def wait_for_service_neg(
def wait_for_service_neg_status_annotation(
self,
name: str,
timeout_sec: int = WAIT_SHORT_TIMEOUT_SEC,
Expand All @@ -781,29 +781,36 @@ def wait_for_service_neg(
retryer = retryers.constant_retryer(
wait_fixed=_timedelta(seconds=wait_sec),
timeout=timeout,
check_result=self._check_service_neg_annotation,
check_result=self._check_service_neg_status_annotation,
)
try:
retryer(self.get_service, name)
except retryers.RetryError as retry_err:
result = retry_err.result()
note = framework.errors.FrameworkError.note_blanket_error_info_below(
"A k8s service wasn't assigned a NEG (Network Endpoint Group).",
"A Kubernetes Service wasn't assigned a NEG (Network Endpoint"
" Group) status annotation.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for service {name}"
f" to report NEG status. Last service status:\n"
f"Timeout {timeout} (h:mm:ss) waiting for Kubernetes"
f" Service {name} in the namespace {self.name} to report"
f" the '{self.NEG_STATUS_ANNOTATION}' metadata annotation."
f"\nThis indicates the NEG wasn't created OR"
f" the NEG creation event hasn't propagated to Kubernetes."
f" Service metadata:\n"
f"{self._pretty_format_metadata(result, highlight=False)}"
f"Service status:\n"
f"{self._pretty_format_status(result, highlight=False)}"
),
)
retry_err.add_note(note)
raise

def get_service_neg(
def parse_service_neg_status(
self, service_name: str, service_port: int
) -> Tuple[str, List[str]]:
service = self.get_service(service_name)
neg_info: dict = json.loads(
service.metadata.annotations[self.NEG_STATUS_META]
service.metadata.annotations[self.NEG_STATUS_ANNOTATION]
)
neg_name: str = neg_info["network_endpoint_groups"][str(service_port)]
neg_zones: List[str] = neg_info["zones"]
Expand Down Expand Up @@ -1027,6 +1034,37 @@ def _pretty_format_status(
# Return the name of k8s object, and its pretty-printed status.
return f"{name}:\n{status}\n"

def _pretty_format_metadata(
self,
k8s_object: Optional[object],
*,
highlight: bool = True,
managed_fields: bool = False,
) -> str:
if k8s_object is None:
return "No data"

# Parse the name if present.
if not hasattr(k8s_object, "metadata"):
return "Object metadata missing"

name = k8s_object.metadata.name or "Can't parse resource name"

# Pretty-print metadata.
try:
metadata_dict: dict = k8s_object.metadata.to_dict()
# Don't print manged fields by default. Lots of noise with no value.
if not managed_fields:
metadata_dict.pop("managed_fields", None)
metadata = self._pretty_format(metadata_dict, highlight=highlight)
except Exception as e: # pylint: disable=broad-except
# Catching all exceptions because not printing the metadata
# isn't as important as the system under test.
metadata = f"Can't parse resource metadata: {e}"

# Return the name of k8s object, and its pretty-printed status.
return f"{name}:\n{metadata}\n"

def _pretty_format(
self,
data: dict,
Expand All @@ -1038,12 +1076,12 @@ def _pretty_format(
return self._highlighter.highlight(yaml_out) if highlight else yaml_out

@classmethod
def _check_service_neg_annotation(
def _check_service_neg_status_annotation(
cls, service: Optional[V1Service]
) -> bool:
return (
isinstance(service, V1Service)
and cls.NEG_STATUS_META in service.metadata.annotations
and cls.NEG_STATUS_ANNOTATION in service.metadata.annotations
)

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"""
Run xDS Test Client on Kubernetes using Gamma
"""
import datetime
import logging
from typing import List, Optional

Expand Down Expand Up @@ -217,7 +218,13 @@ def run( # pylint: disable=arguments-differ

# The controller will not populate the NEGs until there are
# endpoint slices.
self._wait_service_neg(self.service_name, test_port)
# For this reason, we run this check after the servers were created,
# and increase the wait time from 1 minute to 3.
self._wait_service_neg_status_annotation(
self.service_name,
test_port,
timeout_sec=datetime.timedelta(minutes=3).total_seconds(),
)

return servers

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -769,14 +769,35 @@ def _start_logging_pod(
self.pod_log_collectors.append(pod_log_collector)
return pod_log_collector

def _wait_service_neg(self, name, service_port, **kwargs):
logger.info("Waiting for NEG for service %s", name)
self.k8s_namespace.wait_for_service_neg(name, **kwargs)
neg_name, neg_zones = self.k8s_namespace.get_service_neg(
name, service_port
def _wait_service_neg_status_annotation(
self,
service_name: str,
service_port: int,
**kwargs,
) -> None:
logger.info(
"Waiting for '%s' annotation for a NEG at port %s to be assigned to"
" Kubernetes Service %s in namespace %s",
self.k8s_namespace.NEG_STATUS_ANNOTATION,
service_port,
service_name,
self.k8s_namespace.name,
)
self.k8s_namespace.wait_for_service_neg_status_annotation(
service_name, **kwargs
)
neg_name, neg_zones = self.k8s_namespace.parse_service_neg_status(
service_name, service_port
)
logger.info(
"Service %s: detected NEG=%s in zones=%s", name, neg_name, neg_zones
"Detected '%s' annotation for Kubernetes Service %s, namespace %s:"
" neg_name=%s, port=%s, zones=%s",
self.k8s_namespace.NEG_STATUS_ANNOTATION,
service_name,
self.k8s_namespace.name,
neg_name,
service_port,
neg_zones,
)

def logs_explorer_link(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def run( # pylint: disable=arguments-differ,too-many-branches
neg_name=self.gcp_neg_name,
test_port=test_port,
)
self._wait_service_neg(self.service_name, test_port)
self._wait_service_neg_status_annotation(self.service_name, test_port)

if self.enable_workload_identity:
# Allow Kubernetes service account to use the GCP service account
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,10 @@ def setupServerBackends(
if server_runner is None:
server_runner = self.server_runner
# Load Backends
neg_name, neg_zones = server_runner.k8s_namespace.get_service_neg(
(
neg_name,
neg_zones,
) = server_runner.k8s_namespace.parse_service_neg_status(
server_runner.service_name, self.server_port
)

Expand All @@ -305,7 +308,10 @@ def removeServerBackends(self, *, server_runner=None):
if server_runner is None:
server_runner = self.server_runner
# Load Backends
neg_name, neg_zones = server_runner.k8s_namespace.get_service_neg(
(
neg_name,
neg_zones,
) = server_runner.k8s_namespace.parse_service_neg_status(
server_runner.service_name, self.server_port
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -309,12 +309,15 @@ def setup(self, test_case_classes: Iterable["XdsUrlMapTestCase"]) -> None:
replica_count=3,
)
# Add backend to default backend service
neg_name, neg_zones = self.k8s_namespace.get_service_neg(
neg_name, neg_zones = self.k8s_namespace.parse_service_neg_status(
self.test_server_runner.service_name, self.server_port
)
self.td.backend_service_add_neg_backends(neg_name, neg_zones)
# Add backend to alternative backend service
neg_name_alt, neg_zones_alt = self.k8s_namespace.get_service_neg(
(
neg_name_alt,
neg_zones_alt,
) = self.k8s_namespace.parse_service_neg_status(
self.test_server_alternative_runner.service_name, self.server_port
)
self.td.alternative_backend_service_add_neg_backends(
Expand All @@ -324,7 +327,7 @@ def setup(self, test_case_classes: Iterable["XdsUrlMapTestCase"]) -> None:
(
neg_name_affinity,
neg_zones_affinity,
) = self.k8s_namespace.get_service_neg(
) = self.k8s_namespace.parse_service_neg_status(
self.test_server_affinity_runner.service_name, self.server_port
)
self.td.affinity_backend_service_add_neg_backends(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,10 @@ def setUpClass(cls):
)

# Load backends.
neg_name, neg_zones = cls.server_runner.k8s_namespace.get_service_neg(
(
neg_name,
neg_zones,
) = cls.server_runner.k8s_namespace.parse_service_neg_status(
cls.server_runner.service_name, cls.server_port
)

Expand All @@ -116,7 +119,10 @@ def setUpClass(cls):
def tearDownClass(cls):
# Remove backends from the Backend Service before closing the server
# runner.
neg_name, neg_zones = cls.server_runner.k8s_namespace.get_service_neg(
(
neg_name,
neg_zones,
) = cls.server_runner.k8s_namespace.parse_service_neg_status(
cls.server_runner.service_name, cls.server_port
)
cls.td.backend_service_remove_neg_backends(neg_name, neg_zones)
Expand Down Expand Up @@ -209,7 +215,10 @@ def test_baseline_in_server_with_bootstrap_version(self, version, image):
)

# Load backends.
neg_name, neg_zones = self.server_runner.k8s_namespace.get_service_neg(
(
neg_name,
neg_zones,
) = self.server_runner.k8s_namespace.parse_service_neg_status(
self.server_runner.service_name, self.server_port
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_change_backend_service(self) -> None:
(
neg_name_alt,
neg_zones_alt,
) = self.alternate_k8s_namespace.get_service_neg(
) = self.alternate_k8s_namespace.parse_service_neg_status(
self.alternate_server_runner.service_name, self.server_port
)
self.td.alternative_backend_service_add_neg_backends(
Expand Down

0 comments on commit 96f36b6

Please sign in to comment.