Skip to content

Commit

Permalink
feat(clustering): report config reload errors to Konnect (Kong#12282)
Browse files Browse the repository at this point in the history
Data-plane nodes running in Konnect will now report config reload failures such as invalid configuration or transient errors to the control-plane.
  • Loading branch information
flrgh authored Jan 17, 2024
1 parent 960902b commit 0f95ffc
Show file tree
Hide file tree
Showing 7 changed files with 908 additions and 15 deletions.
149 changes: 143 additions & 6 deletions kong/clustering/config_helper.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ local isempty = require("table.isempty")
local isarray = require("table.isarray")
local nkeys = require("table.nkeys")
local buffer = require("string.buffer")
local db_errors = require("kong.db.errors")


local tostring = tostring
Expand All @@ -17,6 +18,7 @@ local sort = table.sort
local yield = require("kong.tools.yield").yield
local fetch_table = tablepool.fetch
local release_table = tablepool.release
local xpcall = xpcall


local ngx_log = ngx.log
Expand All @@ -29,6 +31,7 @@ local ngx_DEBUG = ngx.DEBUG


local DECLARATIVE_EMPTY_CONFIG_HASH = constants.DECLARATIVE_EMPTY_CONFIG_HASH
local ERRORS = constants.CLUSTERING_DATA_PLANE_ERROR
local _log_prefix = "[clustering] "


Expand Down Expand Up @@ -202,8 +205,96 @@ local function fill_empty_hashes(hashes)
end
end

function _M.update(declarative_config, msg)

--- Errors returned from _M.update() should have these fields
---
---@class kong.clustering.config_helper.update.err_t.base
---
---@field name string # identifier that can be used to classify the error type
---@field source string # lua function that is responsible for this error
---@field message string # error description/contents
---@field config_hash string


--- Error returned when something causes an exception to be thrown
---
---@class kong.clustering.config_helper.update.err_t.exception : kong.clustering.config_helper.update.err_t.base
---
---@field exception any # value that was passed to `error()`
---@field traceback string # lua traceback of the exception


--- Error returned when the configuration received from the control plane is
--- not valid
---
---@class kong.clustering.config_helper.update.err_t.declarative : kong.clustering.config_helper.update.err_t.base
---
---@field flattened_errors table
---@field fields table
---@field code? integer


--- Error returned when the act of reloading the local configuration failed
---
---@class kong.clustering.config_helper.update.err_t.reload : kong.clustering.config_helper.update.err_t.base


---@alias kong.clustering.config_helper.update.err_t
---| kong.clustering.config_helper.update.err_t.exception
---| kong.clustering.config_helper.update.err_t.declarative
---| kong.clustering.config_helper.update.err_t.reload


--- Normalizes an error table in place before it is returned to the caller.
---
--- Fills in default `source`, `name` and `message` values when they are
--- absent, stamps the table with the config hash of the message being
--- processed, and removes the `entity` field from each flattened error.
---
---@param err_t kong.clustering.config_helper.update.err_t
---@param msg kong.clustering.config_helper.update.msg
local function format_error(err_t, msg)
  err_t.source  = err_t.source  or "kong.clustering.config_helper.update"
  err_t.name    = err_t.name    or ERRORS.GENERIC
  err_t.message = err_t.message or "an unexpected error occurred"

  -- The failure being reported may itself have been caused by a missing or
  -- malformed message, so only index `msg` when it really is a table.
  -- Otherwise this formatter would raise a second, unhandled error.
  local config_hash
  if type(msg) == "table" then
    config_hash = msg.config_hash
  end
  err_t.config_hash = config_hash or DECLARATIVE_EMPTY_CONFIG_HASH

  -- Declarative config parse errors will include all the input entities in
  -- the error table. Strip these out to keep the error payload size small.
  local errors = err_t.flattened_errors
  if type(errors) == "table" then
    for i = 1, #errors do
      local err = errors[i]
      if type(err) == "table" then
        err.entity = nil
      end
    end
  end
end


--- Wraps a raised value in an exception error table.
---
--- The original value is kept in `exception`, and a traceback prefixed with
--- its string form is recorded in `traceback`.
---
---@param err any # whatever was passed to `error()`
---@return kong.clustering.config_helper.update.err_t.exception err_t
local function format_exception(err)
  local exc_t = {}

  exc_t.name      = ERRORS.RELOAD
  exc_t.source    = "kong.clustering.config_helper.update"
  exc_t.message   = "an exception was raised while updating the configuration"
  exc_t.exception = err
  -- level 1 starts the traceback at this function
  exc_t.traceback = debug.traceback(tostring(err), 1)

  return exc_t
end


---@class kong.clustering.config_helper.update.msg : table
---
---@field config_table table
---@field config_hash string
---@field hashes table<string, string>
---@field current_transaction_id? string|number


---@param declarative_config table
---@param msg kong.clustering.config_helper.update.msg
---
---@return boolean? success
---@return string? err
---@return kong.clustering.config_helper.update.err_t? err_t
local function update(declarative_config, msg)
local config_table = msg.config_table
local config_hash = msg.config_hash
local hashes = msg.hashes
Expand All @@ -212,6 +303,11 @@ function _M.update(declarative_config, msg)

if not config_hash then
config_hash, hashes = calculate_config_hash(config_table)

-- update the message in-place with the calculated hashes so that this
-- metadata can be used in error-reporting
msg.config_hash = config_hash
msg.hashes = hashes
end

if hashes then
Expand All @@ -225,10 +321,16 @@ function _M.update(declarative_config, msg)
return true
end

local entities, err, _, meta, new_hash =
declarative_config:parse_table(config_table, config_hash)
local entities, err, err_t, meta, new_hash =
declarative_config:parse_table(config_table, config_hash)
if not entities then
return nil, "bad config received from control plane " .. err
---@type kong.clustering.config_helper.update.err_t.declarative
err_t = db_errors:declarative_config_flattened(err_t, config_table)

err_t.name = ERRORS.CONFIG_PARSE
err_t.source = "kong.db.declarative.parse_table"

return nil, "bad config received from control plane " .. err, err_t
end

if current_hash == new_hash then
Expand All @@ -243,17 +345,52 @@ function _M.update(declarative_config, msg)
local res
res, err = declarative.load_into_cache_with_events(entities, meta, new_hash, hashes, msg.current_transaction_id)
if not res then
return nil, err
---@type kong.clustering.config_helper.update.err_t.reload
err_t = {
name = ERRORS.RELOAD,
source = "kong.db.declarative.load_into_cache_with_events",
message = err,
}

return nil, err, err_t
end

if kong.configuration.log_level == "debug" then
ngx_log(ngx.DEBUG, _log_prefix, "loaded configuration with transaction ID " .. msg.current_transaction_id)
ngx_log(ngx.DEBUG, _log_prefix, "loaded configuration with transaction ID ",
msg.current_transaction_id)
end

return true
end


--- Applies a configuration message, converting raised errors into return
--- values.
---
--- The local `update()` function runs under `xpcall()` with
--- `format_exception()` as the message handler. Any error table produced on
--- failure is passed through `format_error()` before being returned.
---
---@param declarative_config table
---@param msg kong.clustering.config_helper.update.msg
---
---@return boolean? success
---@return string? err
---@return kong.clustering.config_helper.update.err_t? err_t
function _M.update(declarative_config, msg)
  local pok, res, err, err_t = xpcall(update, format_exception,
                                      declarative_config, msg)

  if pok then
    -- update() returned normally; `res` is its own success value
    if not res and err_t then
      format_error(err_t, msg)
    end

    return res, err, err_t
  end

  -- update() raised: `res` is whatever format_exception() returned
  ---@type kong.clustering.config_helper.update.err_t.exception
  err_t = res
  -- format_exception() captures the original error in the .exception field
  err = err_t.exception or "unknown error"

  format_error(err_t, msg)

  return false, err, err_t
end



_M.calculate_config_hash = calculate_config_hash


Expand Down
62 changes: 53 additions & 9 deletions kong/clustering/data_plane.lua
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@ local clustering_utils = require("kong.clustering.utils")
local declarative = require("kong.db.declarative")
local constants = require("kong.constants")
local pl_stringx = require("pl.stringx")

local inspect = require("inspect")

local assert = assert
local setmetatable = setmetatable
local math = math
local pcall = pcall
local tostring = tostring
local sub = string.sub
local ngx = ngx
Expand Down Expand Up @@ -66,6 +65,10 @@ function _M.new(clustering)
conf = clustering.conf,
cert = clustering.cert,
cert_key = clustering.cert_key,

-- in konnect_mode, reconfigure errors will be reported to the control plane
-- via WebSocket message
error_reporting = clustering.conf.konnect_mode,
}

return setmetatable(self, _MT)
Expand Down Expand Up @@ -105,6 +108,40 @@ local function send_ping(c, log_suffix)
end


--- Reports a configuration update error to the control plane.
---
--- The error table is JSON-encoded as `{ type = "error", error = err_t }` and
--- sent as a binary WebSocket frame. If the error table cannot be encoded, the
--- failure is logged and a generic error describing the encoding problem is
--- sent in its place. A failed send is logged and not retried here.
---
---@param c resty.websocket.client
---@param err_t kong.clustering.config_helper.update.err_t
---@param log_suffix? string
local function send_error(c, err_t, log_suffix)
  -- `log_suffix` is optional, and ngx.log() prints a nil argument as the
  -- literal string "nil", so default to an empty suffix.
  log_suffix = log_suffix or ""

  local payload, json_err = cjson_encode({
    type = "error",
    error = err_t,
  })

  if json_err then
    json_err = tostring(json_err)
    ngx_log(ngx_ERR, _log_prefix, "failed to JSON-encode error payload for ",
            "control plane: ", json_err, ", payload: ", inspect(err_t), log_suffix)

    -- the replacement payload contains only plain values, so encoding it is
    -- expected to succeed; assert() surfaces the problem if it does not
    payload = assert(cjson_encode({
      type = "error",
      error = {
        name = constants.CLUSTERING_DATA_PLANE_ERROR.GENERIC,
        message = "failed to encode JSON error payload: " .. json_err,
        source = "kong.clustering.data_plane.send_error",
        config_hash = err_t and err_t.config_hash
                      or DECLARATIVE_EMPTY_CONFIG_HASH,
      }
    }))
  end

  local ok, err = c:send_binary(payload)
  if not ok then
    ngx_log(ngx_ERR, _log_prefix, "failed to send error report to control plane: ",
            err, log_suffix)
  end
end


function _M:communicate(premature)
if premature then
-- worker wants to exit
Expand Down Expand Up @@ -181,6 +218,7 @@ function _M:communicate(premature)
local ping_immediately
local config_exit
local next_data
local config_err_t

local config_thread = ngx.thread.spawn(function()
while not exiting() and not config_exit do
Expand Down Expand Up @@ -212,14 +250,14 @@ function _M:communicate(premature)
msg.timestamp and " with timestamp: " .. msg.timestamp or "",
log_suffix)

local pok, res, err = pcall(config_helper.update, self.declarative_config, msg)
if pok then
ping_immediately = true
end
local err_t
ok, err, err_t = config_helper.update(self.declarative_config, msg)

if not pok or not res then
ngx_log(ngx_ERR, _log_prefix, "unable to update running config: ",
(not pok and res) or err)
if not ok then
if self.error_reporting then
config_err_t = err_t
end
ngx_log(ngx_ERR, _log_prefix, "unable to update running config: ", err)
end

if next_data == data then
Expand All @@ -241,6 +279,12 @@ function _M:communicate(premature)
send_ping(c, log_suffix)
end

if config_err_t then
local err_t = config_err_t
config_err_t = nil
send_error(c, err_t, log_suffix)
end

counter = counter - 1

ngx_sleep(1)
Expand Down
5 changes: 5 additions & 0 deletions kong/constants.lua
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ local constants = {
CLUSTERING_TIMEOUT = 5000, -- 5 seconds
CLUSTERING_PING_INTERVAL = 30, -- 30 seconds
CLUSTERING_OCSP_TIMEOUT = 5000, -- 5 seconds
CLUSTERING_DATA_PLANE_ERROR = {
CONFIG_PARSE = "declarative configuration parse failure",
RELOAD = "configuration reload failed",
GENERIC = "generic or unknown error",
},

CLEAR_HEALTH_STATUS_DELAY = 300, -- 300 seconds

Expand Down
Loading

0 comments on commit 0f95ffc

Please sign in to comment.