From 1306ca16a9734feedd9173b16658eae47c6fadbe Mon Sep 17 00:00:00 2001 From: Martin Edenhofer Date: Wed, 15 Aug 2018 09:11:48 +0200 Subject: [PATCH] Implemented issue #2188 - WebApp (monitoring section) claims scheduler not running, if delayed jobs get too many. --- app/controllers/monitoring_controller.rb | 24 +++--- .../integration/monitoring_controller_test.rb | 75 ++++++++++++++++--- 2 files changed, 79 insertions(+), 20 deletions(-) diff --git a/app/controllers/monitoring_controller.rb b/app/controllers/monitoring_controller.rb index b02f7cd3a..e477cd3b3 100644 --- a/app/controllers/monitoring_controller.rb +++ b/app/controllers/monitoring_controller.rb @@ -46,7 +46,8 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX issues.push "#{message} #{channel.last_log_in}" end if channel.preferences && channel.preferences['last_fetch'] && channel.preferences['last_fetch'] < last_run_tolerance - issues.push "#{message} channel is active but not fetched for 1 hour" + diff = Time.zone.now - channel.preferences['last_fetch'] + issues.push "#{message} channel is active but not fetched for #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} hour" end # outbound channel @@ -71,11 +72,11 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX end end - # scheduler check - Scheduler.where(active: true).where.not(last_run: nil).each do |scheduler| - next if scheduler.period <= 300 - next if scheduler.last_run + scheduler.period.seconds > Time.zone.now - 5.minutes - issues.push 'scheduler not running' + # scheduler running check + Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler| + diff = Time.zone.now - (scheduler.last_run + scheduler.period.seconds) + next if diff < 8.minutes + issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(Time.zone.now - diff.seconds)} over) - please contact your system administrator" break end if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count @@ -90,20 +91,23 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX # failed jobs check failed_jobs = Delayed::Job.where('attempts > 0') count_failed_jobs = failed_jobs.count - if count_failed_jobs > 10 - issues.push "#{count_failed_jobs} failing background jobs." + issues.push "#{count_failed_jobs} failing background jobs" end listed_failed_jobs = failed_jobs.select(:handler, :attempts).limit(10) sorted_failed_jobs = listed_failed_jobs.group_by(&:name).sort_by { |_handler, entries| entries.length }.reverse.to_h sorted_failed_jobs.each_with_index do |(name, jobs), index| - attempts = jobs.map(&:attempts).sum - issues.push "Failed to run background job ##{index += 1} '#{name}' #{jobs.count} time(s) with #{attempts} attempt(s)." end + # job count check + total_jobs = Delayed::Job.where('created_at < ?', Time.zone.now - 15.minutes).count + if total_jobs > 8000 + issues.push "#{total_jobs} background jobs in queue" + end + # import jobs import_backends = ImportJob.backends diff --git a/test/integration/monitoring_controller_test.rb b/test/integration/monitoring_controller_test.rb index 213d03a76..9d0409578 100644 --- a/test/integration/monitoring_controller_test.rb +++ b/test/integration/monitoring_controller_test.rb @@ -368,7 +368,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest channel.last_log_out = nil channel.save! - # health_check + # health_check - channel get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers assert_response(200) @@ -379,12 +379,28 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert_equal(false, result['healthy']) assert_equal('Channel: Email::Notification out ', result['message']) + # health_check - scheduler may not run + scheduler = Scheduler.where(active: true).last + scheduler.last_run = Time.zone.now - 20.minutes + scheduler.period = 600 + scheduler.save! + + get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers + assert_response(200) + + result = JSON.parse(@response.body) + assert_equal(Hash, result.class) + assert(result['message']) + assert(result['issues']) + assert_equal(false, result['healthy']) + assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} 10 minutes over) - please contact your system administrator", result['message']) + + # health_check - scheduler may not run scheduler = Scheduler.where(active: true).last scheduler.last_run = Time.zone.now - 1.day scheduler.period = 600 scheduler.save! - # health_check get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers assert_response(200) @@ -393,13 +409,53 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert(result['message']) assert(result['issues']) assert_equal(false, result['healthy']) - assert_equal('Channel: Email::Notification out ;scheduler not running', result['message']) + assert_equal("Channel: Email::Notification out ;scheduler may not run (last execution of #{scheduler.method} about 24 hours over) - please contact your system administrator", result['message']) + # health_check - scheduler job count + travel 2.seconds + 8001.times do + Delayed::Job.enqueue( BackgroundJobSearchIndex.new('Ticket', 1)) + end + Scheduler.where(active: true).each do |local_scheduler| + local_scheduler.last_run = Time.zone.now + local_scheduler.save! + end + total_jobs = Delayed::Job.count + + get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers + assert_response(200) + + result = JSON.parse(@response.body) + assert_equal(Hash, result.class) + assert(result['message']) + assert(result['issues']) + assert_equal(false, result['healthy']) + assert_equal('Channel: Email::Notification out ', result['message']) + + travel 20.minutes + Scheduler.where(active: true).each do |local_scheduler| + local_scheduler.last_run = Time.zone.now + local_scheduler.save! + end + + get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers + assert_response(200) + + result = JSON.parse(@response.body) + assert_equal(Hash, result.class) + assert(result['message']) + assert(result['issues']) + assert_equal(false, result['healthy']) + assert_equal("Channel: Email::Notification out ;#{total_jobs} background jobs in queue", result['message']) + + Delayed::Job.delete_all + travel_back + + # health_check - unprocessable mail dir = Rails.root.join('tmp', 'unprocessable_mail') FileUtils.mkdir_p(dir) FileUtils.touch("#{dir}/test.eml") - # health_check get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers assert_response(200) @@ -408,10 +464,10 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert(result['message']) assert(result['issues']) assert_equal(false, result['healthy']) - assert_equal('Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running', result['message']) + assert_equal('Channel: Email::Notification out ;unprocessable mails: 1', result['message']) + # health_check - ldap Setting.set('ldap_integration', true) - ImportJob.create( name: 'Import::Ldap', started_at: Time.zone.now, @@ -421,7 +477,6 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest } ) - # health_check get "/api/v1/monitoring/health_check?token=#{@token}", params: {}, headers: @headers assert_response(200) @@ -430,7 +485,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert(result['message']) assert(result['issues']) assert_equal(false, result['healthy']) - assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message']) + assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error", result['message']) stuck_updated_at_timestamp = 15.minutes.ago ImportJob.create( @@ -449,7 +504,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert(result['message']) assert(result['issues']) assert_equal(false, result['healthy']) - assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;scheduler not running;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message']) + assert_equal("Channel: Email::Notification out ;unprocessable mails: 1;Failed to run import backend 'Import::Ldap'. Cause: Some bad error;Stuck import backend 'Import::Ldap' detected. Last update: #{stuck_updated_at_timestamp}", result['message']) Setting.set('ldap_integration', false) end @@ -632,7 +687,7 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest assert(result['message']) assert(result['issues']) assert_equal(false, result['healthy']) - assert_equal("13 failing background jobs.;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message']) + assert_equal("13 failing background jobs;Failed to run background job #1 'Object' 8 time(s) with 40 attempt(s).;Failed to run background job #2 'BackgroundJobSearchIndex' 2 time(s) with 14 attempt(s).", result['message']) # cleanup Delayed::Job.delete_all