Skip to content

Commit

Permalink
Update MiqServer::ServerMonitor
Browse files Browse the repository at this point in the history
  • Loading branch information
agrare committed Nov 4, 2021
1 parent 8086150 commit 39be7ed
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 53 deletions.
20 changes: 19 additions & 1 deletion app/models/miq_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ def validate_is_deleteable
end
end

# Returns the ServerMonitor for this server, building it on first use and
# reusing the same instance on later calls.
def server_monitor
  return @server_monitor if @server_monitor

  @server_monitor = ServerMonitor.new(self)
end

# Returns the worker manager for this server. It is created through
# WorkerManagement.build the first time it is requested and cached afterwards.
def worker_manager
  @worker_manager = WorkerManagement.build(self) unless @worker_manager
  @worker_manager
end
Expand Down Expand Up @@ -196,7 +200,7 @@ def monitor
Benchmark.realtime_block(:server_dequeue) { process_miq_queue } if threshold_exceeded?(:server_dequeue_frequency, now)

Benchmark.realtime_block(:server_monitor) do
monitor_servers
server_monitor.monitor_servers
monitor_server_roles if self.is_master?
end if threshold_exceeded?(:server_monitor_frequency, now)

Expand Down Expand Up @@ -472,6 +476,20 @@ def find_other_servers_in_zone
self.class.where(:zone_id => zone_id).where.not(:id => id).to_a
end

# Flags this server as unresponsive and cleans up after it.
#
# In order: logs the condition, sets the status to "not responding",
# deactivates all of the server's roles, queues an
# "evm_server_not_responding" event carrying the same message, and asks each
# of the server's workers to clean its active messages.
#
# seconds - value interpolated into the log/event message; defaults to the
#           heartbeat timeout read from ::Settings.
def mark_as_not_responding(seconds = ::Settings.server.heartbeat_timeout.to_i_with_method)
  not_responding_msg = "#{format_full_log_msg} has not responded in #{seconds} seconds."
  _log.info(not_responding_msg)

  update(:status => "not responding")
  deactivate_all_roles

  # TODO: need to add event for this
  MiqEvent.raise_evm_event_queue_in_region(
    self,
    "evm_server_not_responding",
    :event_details => not_responding_msg
  )

  # Mark all messages currently being worked on by the not responding server's workers as error
  _log.info("Cleaning all active messages being processed by #{format_full_log_msg}")
  miq_workers.each { |worker| worker.clean_active_messages }
end

# Human-readable label for this server: its name followed by its id in
# square brackets.
def display_name
  label = name
  identifier = id
  "#{label} [#{identifier}]"
end
Expand Down
47 changes: 21 additions & 26 deletions app/models/miq_server/server_monitor.rb
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
class MiqServer::ServerMonitor
def mark_as_not_responding(seconds = miq_server_time_threshold)
msg = "#{format_full_log_msg} has not responded in #{seconds} seconds."
_log.info(msg)
update(:status => "not responding")
deactivate_all_roles

# TODO: need to add event for this
MiqEvent.raise_evm_event_queue_in_region(self, "evm_server_not_responding", :event_details => msg)

# Mark all messages currently being worked on by the not responding server's workers as error
_log.info("Cleaning all active messages being processed by #{format_full_log_msg}")
miq_workers.each(&:clean_active_messages)
include Vmdb::Logging

attr_reader :my_server

# Builds a monitor bound to the given server.
#
# my_server - the server this monitor acts on behalf of; stored in @my_server
#             and exposed through the my_server reader. MiqServer#server_monitor
#             passes itself here.
def initialize(my_server)
@my_server = my_server
end

# Entry point for server monitoring. Reloads the bound server first, then
# runs the master routine if it is flagged as master and the non-master
# routine otherwise.
def monitor_servers
  if my_server.reload.is_master?
    monitor_servers_as_master
  else
    monitor_servers_as_non_master
  end
end

private

def make_master_server(last_master)
_log.info("Master server has #{last_master.nil? ? "not been set" : "died, #{last_master.name}"}. Attempting takeover as new master server, #{name}.")
_log.info("Master server has #{last_master.nil? ? "not been set" : "died, #{last_master.name}"}. Attempting takeover as new master server, #{my_server.name}.")
parent = MiqRegion.my_region(true)
parent.lock do
# See if an ACTIVE server has already taken over
Expand All @@ -27,17 +27,16 @@ def make_master_server(last_master)
return nil
end

_log.debug("Setting this server, #{name}, as master server")
_log.debug("Setting this server, #{my_server.name}, as master server")

# Set is_master on self, reset every other server in the region, including
# inactive ones.
parent.miq_servers.each do |s|
s.is_master = (id == s.id)
s.is_master = (my_server.id == s.id)
s.save!
end
end
_log.info("This server #{name} is now set as the master server, last_master: #{last_master.try(:name)}")
self
_log.info("This server #{my_server.name} is now set as the master server, last_master: #{last_master.try(:name)}")
end

def miq_server_time_threshold
Expand All @@ -50,7 +49,7 @@ def monitor_servers_as_master
@last_servers ||= {}

# Check all of the other servers and see if we have new servers, servers have stopped, or servers have stopped responding
all_servers = find_other_started_servers_in_region
all_servers = my_server.find_other_started_servers_in_region

current_ids = all_servers.collect(&:id)
last_ids = @last_servers.keys
Expand All @@ -75,7 +74,7 @@ def monitor_servers_as_master

if s.is_master?
_log.info("#{s.format_short_log_msg} has been detected as a second master and is being demoted.")
update(:is_master => false)
my_server.update(:is_master => false)
end

else # unchanged
Expand Down Expand Up @@ -115,7 +114,7 @@ def monitor_servers_as_non_master
else
_log.info("Master #{master.format_full_log_msg} has not responded in #{miq_server_time_threshold} seconds.") unless master.nil?
make_master_server(@last_master.empty? ? nil : @last_master[:record])
if reload.is_master?
if my_server.reload.is_master?
master.mark_as_not_responding unless master.nil?
@last_master = nil

Expand All @@ -127,8 +126,8 @@ def monitor_servers_as_non_master

# Raise miq_server_is_master event
master_msg = master && " from #{master.format_short_log_msg}"
msg = "#{format_short_log_msg} has taken over master#{master_msg}"
MiqEvent.raise_evm_event_queue_in_region(self, "evm_server_is_master", :event_details => msg)
msg = "#{my_server.format_short_log_msg} has taken over master#{master_msg}"
MiqEvent.raise_evm_event_queue_in_region(my_server, "evm_server_is_master", :event_details => msg)

monitor_servers_as_master
else
Expand All @@ -137,8 +136,4 @@ def monitor_servers_as_non_master
end
end
end

# Reloads the receiver and dispatches to the master or non-master monitoring
# routine depending on the reloaded is_master? flag.
def monitor_servers
  return monitor_servers_as_master if reload.is_master?

  monitor_servers_as_non_master
end
end
2 changes: 1 addition & 1 deletion lib/workers/evm_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def configure_server_roles
#############################################################
@current_server.deactivate_all_roles
@current_server.set_database_owner_role(EvmDatabase.local?)
@current_server.monitor_servers
@current_server.server_monitor.monitor_servers
@current_server.monitor_server_roles if @current_server.is_master?
@current_server.sync_active_roles
@current_server.set_active_role_flags
Expand Down
50 changes: 25 additions & 25 deletions spec/models/miq_server/server_monitor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
context "with 1 Server" do
before do
@miq_server = EvmSpecHelper.local_miq_server
@miq_server.monitor_servers
@miq_server.server_monitor.monitor_servers

@miq_server.deactivate_all_roles
@miq_server.role = 'event, ems_operations, scheduler, reporting'
Expand Down Expand Up @@ -278,7 +278,7 @@
@miq_server2.is_master = false
@miq_server2.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles
@miq_server2.reload
end
Expand All @@ -292,9 +292,9 @@

context "where Non-Master is not responding" do
before do
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
Timecop.travel 5.minutes do
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end
end

Expand Down Expand Up @@ -339,7 +339,7 @@
@roles2.each { |role, priority| @miq_server2.assign_role(ServerRole.find_by(:name => role), priority) }
@miq_server2.activate_roles("event", "ems_operations", 'scheduler', 'reporting')

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

it "should have all roles active after sync between them" do
Expand All @@ -356,7 +356,7 @@
@miq_server2.status = "stopped"
@miq_server2.is_master = false
@miq_server2.save!
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

it "should takeover as Master" do
Expand All @@ -379,7 +379,7 @@
context "where Master is not responding" do
before do
Timecop.travel 5.minutes
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

after do
Expand Down Expand Up @@ -430,19 +430,19 @@
@roles3 = [['ems_operations', 2], ['event', 3], ['ems_inventory', 1], ['ems_metrics_coordinator', 1]]
@roles3.each { |role, priority| @miq_server3.assign_role(ServerRole.find_by(:name => role), priority) }

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles if @miq_server1.is_master?
@miq_server2.reload
@miq_server3.reload
end

it "should support multiple failover transitions from stopped master" do
# server1 is first to start, becomes master
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers

# Initialize the bookkeeping around current and last master
@miq_server2.monitor_servers
@miq_server3.monitor_servers
@miq_server2.server_monitor.monitor_servers
@miq_server3.server_monitor.monitor_servers

# server1 is master
expect(@miq_server1.reload.is_master).to be_truthy
Expand All @@ -453,7 +453,7 @@
@miq_server1.update(:status => "stopped")

# server 3 becomes master, server 2 hasn't monitored servers yet
@miq_server3.monitor_servers
@miq_server3.server_monitor.monitor_servers
expect(@miq_server1.reload.is_master).to be_falsey
expect(@miq_server2.reload.is_master).to be_falsey
expect(@miq_server3.reload.is_master).to be_truthy
Expand All @@ -462,15 +462,15 @@
@miq_server3.update(:status => "stopped")

# server 2 finally gets to monitor_servers, takes over
@miq_server2.monitor_servers
@miq_server2.server_monitor.monitor_servers
expect(@miq_server1.reload.is_master).to be_falsey
expect(@miq_server2.reload.is_master).to be_truthy
expect(@miq_server3.reload.is_master).to be_falsey
end

it "should failover from stopped master on startup" do
# server 1 is first to start, becomes master
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers

# server 1 shuts down
@miq_server1.update(:status => "stopped")
Expand All @@ -480,7 +480,7 @@
expect(@miq_server3.reload.is_master).to be_falsey

# server 3 runs monitor_servers and becomes master
@miq_server3.monitor_servers
@miq_server3.server_monitor.monitor_servers
expect(@miq_server1.reload.is_master).to be_falsey
expect(@miq_server3.reload.is_master).to be_truthy
end
Expand Down Expand Up @@ -533,7 +533,7 @@
@miq_server3.is_master = false
@miq_server3.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles if @miq_server1.is_master?
@miq_server2.reload
@miq_server3.reload
Expand All @@ -560,7 +560,7 @@
@miq_server3.status = "started"
@miq_server3.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles if @miq_server1.is_master?
@miq_server2.reload
@miq_server3.reload
Expand Down Expand Up @@ -594,7 +594,7 @@
@miq_server2.is_master = false
@miq_server2.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles if @miq_server1.is_master?
@miq_server2.reload
@miq_server3.reload
Expand All @@ -621,7 +621,7 @@
@miq_server2.status = "started"
@miq_server2.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server1.monitor_server_roles if @miq_server1.is_master?
@miq_server2.reload
@miq_server3.reload
Expand Down Expand Up @@ -665,7 +665,7 @@
@miq_server3.role = 'ems_metrics_coordinator, ems_inventory, ems_operations'
@miq_server3.activate_roles("ems_operations")

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

it "should have the master on Server 2" do
Expand All @@ -692,7 +692,7 @@
@miq_server2.is_master = false
@miq_server2.save!

@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

it "should takeover as Master" do
Expand Down Expand Up @@ -729,7 +729,7 @@
context "where Master is not responding" do
before do
Timecop.travel 5.minutes
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
end

after do
Expand Down Expand Up @@ -782,12 +782,12 @@
end

it "should allow only 1 Master in the Region" do
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server2.reload
expect(@miq_server1.is_master).to be_truthy
expect(@miq_server2.is_master).to be_falsey

@miq_server2.monitor_servers
@miq_server2.server_monitor.monitor_servers
@miq_server2.reload
expect(@miq_server1.is_master).to be_truthy
expect(@miq_server2.is_master).to be_falsey
Expand Down Expand Up @@ -843,7 +843,7 @@
end

it "should resolve 1 Master in the Zone" do
@miq_server1.monitor_servers
@miq_server1.server_monitor.monitor_servers
@miq_server2.reload
expect(@miq_server1.is_master?).to be_truthy
expect(@miq_server2.is_master?).not_to be_truthy
Expand Down

0 comments on commit 39be7ed

Please sign in to comment.