Implemented issue #2188 - WebApp (monitoring section) claims the scheduler is not running if there are too many delayed jobs.
This commit is contained in:
parent
88938024cd
commit
1306ca16a9
2 changed files with 79 additions and 20 deletions
|
@@ -46,7 +46,8 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
|
|||
issues.push "#{message} #{channel.last_log_in}"
|
||||
end
|
||||
if channel.preferences && channel.preferences['last_fetch'] && channel.preferences['last_fetch'] < last_run_tolerance
|
||||
issues.push "#{message} channel is active but not fetched for 1 hour"
|
||||
diff = Time.zone.now - channel.preferences['last_fetch']
|
||||
issues.push "#{message} channel is active but not fetched for #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} hour"
|
||||
end
|
||||
|
||||
# outbound channel
|
||||
|
@@ -71,11 +72,11 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
|
|||
end
|
||||
end
|
||||
|
||||
# scheduler check
|
||||
Scheduler.where(active: true).where.not(last_run: nil).each do |scheduler|
|
||||
next if scheduler.period <= 300
|
||||
next if scheduler.last_run + scheduler.period.seconds > Time.zone.now - 5.minutes
|
||||
issues.push 'scheduler not running'
|
||||
# scheduler running check
|
||||
Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler|
|
||||
diff = Time.zone.now - (scheduler.last_run + scheduler.period.seconds)
|
||||
next if diff < 8.minutes
|
||||
issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} over) - please contact your system administrator"
|
||||
break
|
||||
end
|
||||
if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
|
||||
|
@@ -90,20 +91,23 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX
|
|||
# failed jobs check
|
||||
failed_jobs = Delayed::Job.where('attempts > 0')
|
||||
count_failed_jobs = failed_jobs.count
|
||||
|
||||
if count_failed_jobs > 10
|
||||
issues.push "#{count_failed_jobs} failing background jobs."
|
||||
issues.push "#{count_failed_jobs} failing background jobs"
|
||||
end
|
||||
|
||||
listed_failed_jobs = failed_jobs.select(:handler, :attempts).limit(10)
|
||||
sorted_failed_jobs = listed_failed_jobs.group_by(&:name).sort_by { |_handler, entries| entries.length }.reverse.to_h
|
||||
sorted_failed_jobs.each_with_index do |(name, jobs), index|
|
||||
|
||||
attempts = jobs.map(&:attempts).sum
|
||||
|
||||
issues.push "Failed to run background job ##{index += 1} '#{name}' #{jobs.count} time(s) with #{attempts} attempt(s)."
|
||||
end
|
||||
|
||||
# job count check
|
||||
total_jobs = Delayed::Job.where('created_at < ?', Time.zone.now - 15.minutes).count
|
||||
if total_jobs > 8000
|
||||
issues.push "#{total_jobs} background jobs in queue"
|
||||
end
|
||||
|
||||
# import jobs
|
||||
import_backends = ImportJob.backends
|
||||
|
||||
|
|
|
@@ -368,7 +368,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
channel.last_log_out = nil
|
||||
channel.save!
|
||||
|
||||
# health_check
|
||||
# health_check - channel
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
|
@@ -379,12 +379,28 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert_equal(false, result['healthy'])
|
||||
assert_equal('Channel: Email::Notification out ', result['message'])
|
||||
|
||||
# health_check - scheduler may not run
|
||||
scheduler = Scheduler.where(active: true).last
|
||||
scheduler.last_run = Time.zone.now - 20.minutes
|
||||
scheduler.period = 600
|
||||
scheduler.save!
|
||||
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
result = JSON.parse(@response.body)
|
||||
assert_equal(Hash, result.class)
|
||||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} 10 minutes over) - please contact your system administrator", result['message'])
|
||||
|
||||
# health_check - scheduler may not run
|
||||
scheduler = Scheduler.where(active: true).last
|
||||
scheduler.last_run = Time.zone.now - 1.day
|
||||
scheduler.period = 600
|
||||
scheduler.save!
|
||||
|
||||
# health_check
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
|
@@ -393,13 +409,53 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal('Channel: Email::Notification out ;scheduler not running', result['message'])
|
||||
assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} about 24 hours over) - please contact your system administrator", result['message'])
|
||||
|
||||
# health_check - scheduler job count
|
||||
travel 2.seconds
|
||||
8001.times do
|
||||
Delayed::Job.enqueue( BackgroundJobSearchIndex.new('Ticket', 1))
|
||||
end
|
||||
Scheduler.where(active: true).each do |local_scheduler|
|
||||
local_scheduler.last_run = Time.zone.now
|
||||
local_scheduler.save!
|
||||
end
|
||||
total_jobs = Delayed::Job.count
|
||||
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
result = JSON.parse(@response.body)
|
||||
assert_equal(Hash, result.class)
|
||||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal('Channel: Email::Notification out ', result['message'])
|
||||
|
||||
travel 20.minutes
|
||||
Scheduler.where(active: true).each do |local_scheduler|
|
||||
local_scheduler.last_run = Time.zone.now
|
||||
local_scheduler.save!
|
||||
end
|
||||
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
result = JSON.parse(@response.body)
|
||||
assert_equal(Hash, result.class)
|
||||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal("Channel: Email::Notification out ;#{total_jobs} background jobs in queue", result['message'])
|
||||
|
||||
Delayed::Job.delete_all
|
||||
travel_back
|
||||
|
||||
# health_check - unprocessable mail
|
||||
dir = Rails.root.join('tmp', 'unprocessable_mail')
|
||||
FileUtils.mkdir_p(dir)
|
||||
FileUtils.touch("#{dir}/test.eml")
|
||||
|
||||
# health_check
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
|
@@ -408,10 +464,10 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal('Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running', result['message'])
|
||||
assert_equal('Channel: Email::Notification out ;unprocessable mails: 1', result['message'])
|
||||
|
||||
# health_check - ldap
|
||||
Setting.set('ldap_integration', true)
|
||||
|
||||
ImportJob.create(
|
||||
name: 'Import::Ldap',
|
||||
started_at: Time.zone.now,
|
||||
|
@@ -421,7 +477,6 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
}
|
||||
)
|
||||
|
||||
# health_check
|
||||
get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers
|
||||
assert_response(200)
|
||||
|
||||
|
@@ -430,7 +485,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message'])
|
||||
assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message'])
|
||||
|
||||
stuck_updated_at_timestamp = 15.minutes.ago
|
||||
ImportJob.create(
|
||||
|
@@ -449,7 +504,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message'])
|
||||
assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message'])
|
||||
|
||||
Setting.set('ldap_integration', false)
|
||||
end
|
||||
|
@@ -632,7 +687,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest
|
|||
assert(result['message'])
|
||||
assert(result['issues'])
|
||||
assert_equal(false, result['healthy'])
|
||||
assert_equal("13 failing background jobs.;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message'])
|
||||
assert_equal("13 failing background jobs;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message'])
|
||||
|
||||
# cleanup
|
||||
Delayed::Job.delete_all
|
||||
|
|
Loading…
Reference in a new issue