Implemented issue #2188 - WebApp (monitoring section) claims the scheduler is not running when too many delayed jobs pile up.

Martin Edenhofer 2018-08-15 09:11:48 +02:00
parent 88938024cd
commit 1306ca16a9
2 changed files with 79 additions and 20 deletions
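In essence, the controller change below stops pushing a flat 'scheduler not running' issue and instead computes how far each active scheduler is past its expected next run, reporting that overdue time via `time_ago_in_words`. The following is a minimal sketch of that check, not the shipped code: the `Scheduler` scope, the `period`/`last_run`/`method` attributes, and the `helpers` call are taken from the diff, while the method name and the `tolerance` keyword are illustrative only, and the sketch assumes it runs inside a Rails controller where the `helpers` proxy is available.

# Sketch only - mirrors the overdue check added in the diff below.
# Assumes a Scheduler model with `active`, `period` (seconds), `last_run`
# and `method` columns, as used in the changed controller code.
def scheduler_issues(tolerance: 8.minutes)
  issues = []
  Scheduler.where('active = ? AND period > 300', true)
           .where.not(last_run: nil)
           .order(last_run: :asc, period: :asc).each do |scheduler|
    # How far past its expected next run the scheduler currently is (in seconds).
    overdue = Time.zone.now - (scheduler.last_run + scheduler.period.seconds)
    next if overdue < tolerance

    # `helpers` is the Rails controller view-helper proxy, as used in the diff.
    issues.push "scheduler may not run (last execution of #{scheduler.method} " \
                "#{helpers.time_ago_in_words(Time.zone.now - overdue.seconds)} over) " \
                '- please contact your system administrator'
    break # one warning is enough for the health check
  end
  issues
end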

@@ -46,7 +46,8 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
         issues.push "#{message} #{channel.last_log_in}"
       end
       if channel.preferences && channel.preferences['last_fetch'] && channel.preferences['last_fetch'] < last_run_tolerance
-        issues.push "#{message} channel is active but not fetched for 1 hour"
+        diff = Time.zone.now - channel.preferences['last_fetch']
+        issues.push "#{message} channel is active but not fetched for #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} hour"
       end
       # outbound channel
@@ -71,11 +72,11 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
       end
     end
-    # scheduler check
-    Scheduler.where(active: true).where.not(last_run: nil).each do |scheduler|
-      next if scheduler.period <= 300
-      next if scheduler.last_run + scheduler.period.seconds > Time.zone.now - 5.minutes
-      issues.push 'scheduler not running'
+    # scheduler running check
+    Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler|
+      diff = Time.zone.now - (scheduler.last_run + scheduler.period.seconds)
+      next if diff < 8.minutes
+      issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} over) - please contact your system administrator"
       break
     end
     if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
@@ -90,20 +91,23 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
     # failed jobs check
     failed_jobs = Delayed::Job.where('attempts > 0')
     count_failed_jobs = failed_jobs.count
     if count_failed_jobs > 10
-      issues.push "#{count_failed_jobs} failing background jobs."
+      issues.push "#{count_failed_jobs} failing background jobs"
     end
     listed_failed_jobs = failed_jobs.select(:handler, :attempts).limit(10)
     sorted_failed_jobs = listed_failed_jobs.group_by(&:name).sort_by { |_handler, entries| entries.length }.reverse.to_h
     sorted_failed_jobs.each_with_index do |(name, jobs), index|
       attempts = jobs.map(&:attempts).sum
       issues.push "Failed to run background job ##{index += 1} '#{name}' #{jobs.count} time(s) with #{attempts} attempt(s)."
     end
+    # job count check
+    total_jobs = Delayed::Job.where('created_at < ?', Time.zone.now - 15.minutes).count
+    if total_jobs > 8000
+      issues.push "#{total_jobs} background jobs in queue"
+    end
     # import jobs
     import_backends = ImportJob.backends
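For reference, the endpoint these hunks modify is the one shown in the hunk context (`/api/v1/monitoring/health_check?token=XXX`). Below is a minimal sketch of querying it from plain Ruby and reading the fields the tests in the second file assert on (`healthy`, `message`, `issues`); the host and token are placeholders taken from that context line, not values from this commit.

require 'net/http'
require 'json'

# Placeholders: point these at your own instance and monitoring token.
host  = 'http://localhost'
token = 'XXX'

uri      = URI("#{host}/api/v1/monitoring/health_check?token=#{token}")
response = Net::HTTP.get_response(uri)
result   = JSON.parse(response.body)

# The tests below assert on exactly these keys.
puts "healthy: #{result['healthy']}"
puts "message: #{result['message']}"
puts "issues:  #{result['issues'].inspect}"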

@@ -368,7 +368,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     channel.last_log_out = nil
     channel.save!
-    # health_check
+    # health_check - channel
     get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
     assert_response(200)
@@ -379,12 +379,28 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert_equal(false, result['healthy'])
     assert_equal('Channel: Email::Notification out ', result['message'])
+    # health_check - scheduler may not run
+    scheduler = Scheduler.where(active: true).last
+    scheduler.last_run = Time.zone.now - 20.minutes
+    scheduler.period = 600
+    scheduler.save!
+    get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
+    assert_response(200)
+    result = JSON.parse(@response.body)
+    assert_equal(Hash, result.class)
+    assert(result['message'])
+    assert(result['issues'])
+    assert_equal(false, result['healthy'])
+    assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} 10 minutes over) - please contact your system administrator", result['message'])
     # health_check - scheduler may not run
     scheduler = Scheduler.where(active: true).last
     scheduler.last_run = Time.zone.now - 1.day
     scheduler.period = 600
     scheduler.save!
     # health_check
     get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
     assert_response(200)
@@ -393,13 +409,53 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert(result['message'])
     assert(result['issues'])
     assert_equal(false, result['healthy'])
-    assert_equal('Channel: Email::Notification out ;scheduler not running', result['message'])
+    assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} about 24 hours over) - please contact your system administrator", result['message'])
+    # health_check - scheduler job count
+    travel 2.seconds
+    8001.times do
+      Delayed::Job.enqueue( BackgroundJobSearchIndex.new('Ticket', 1))
+    end
+    Scheduler.where(active: true).each do |local_scheduler|
+      local_scheduler.last_run = Time.zone.now
+      local_scheduler.save!
+    end
+    total_jobs = Delayed::Job.count
+    get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
+    assert_response(200)
+    result = JSON.parse(@response.body)
+    assert_equal(Hash, result.class)
+    assert(result['message'])
+    assert(result['issues'])
+    assert_equal(false, result['healthy'])
+    assert_equal('Channel: Email::Notification out ', result['message'])
+    travel 20.minutes
+    Scheduler.where(active: true).each do |local_scheduler|
+      local_scheduler.last_run = Time.zone.now
+      local_scheduler.save!
+    end
+    get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
+    assert_response(200)
+    result = JSON.parse(@response.body)
+    assert_equal(Hash, result.class)
+    assert(result['message'])
+    assert(result['issues'])
+    assert_equal(false, result['healthy'])
+    assert_equal("Channel: Email::Notification out ;#{total_jobs} background jobs in queue", result['message'])
+    Delayed::Job.delete_all
+    travel_back
     # health_check - unprocessable mail
     dir = Rails.root.join('tmp', 'unprocessable_mail')
     FileUtils.mkdir_p(dir)
     FileUtils.touch("#{dir}/test.eml")
     # health_check
     get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
     assert_response(200)
@@ -408,10 +464,10 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert(result['message'])
     assert(result['issues'])
     assert_equal(false, result['healthy'])
-    assert_equal('Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running', result['message'])
+    assert_equal('Channel: Email::Notification out ;unprocessable mails: 1', result['message'])
     # health_check - ldap
     Setting.set('ldap_integration', true)
     ImportJob.create(
       name: 'Import::Ldap',
       started_at: Time.zone.now,
@@ -421,7 +477,6 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
       }
     )
-    # health_check
     get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
     assert_response(200)
@@ -430,7 +485,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert(result['message'])
     assert(result['issues'])
     assert_equal(false, result['healthy'])
-    assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message'])
+    assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message'])
     stuck_updated_at_timestamp = 15.minutes.ago
     ImportJob.create(
@@ -449,7 +504,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert(result['message'])
     assert(result['issues'])
     assert_equal(false, result['healthy'])
-    assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message'])
+    assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message'])
     Setting.set('ldap_integration', false)
   end
@@ -632,7 +687,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
     assert(result['message'])
     assert(result['issues'])
     assert_equal(false, result['healthy'])
-    assert_equal("13 failing background jobs.;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message'])
+    assert_equal("13 failing background jobs;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message'])
     # cleanup
     Delayed::Job.delete_all