diff --git a/app/assets/javascripts/app/controllers/monitoring.coffee b/app/assets/javascripts/app/controllers/monitoring.coffee index c43b29142..2b6900248 100644 --- a/app/assets/javascripts/app/controllers/monitoring.coffee +++ b/app/assets/javascripts/app/controllers/monitoring.coffee @@ -4,6 +4,7 @@ class Index extends App.ControllerSubContent events: 'click .js-resetToken': 'resetToken' 'click .js-select': 'selectAll' + 'click .js-restartFailedJobs': 'restartFailedJobs' constructor: -> super @@ -42,4 +43,14 @@ class Index extends App.ControllerSubContent @load() ) + restartFailedJobs: (e) => + e.preventDefault() + @ajax( + id: 'restart_failed_jobs_request' + type: 'POST' + url: "#{@apiPath}/monitoring/restart_failed_jobs" + success: (data) => + @load() + ) + App.Config.set('Monitoring', { prio: 3600, name: 'Monitoring', parent: '#system', target: '#system/monitoring', controller: Index, permission: ['admin.monitoring'] }, 'NavBarAdmin') diff --git a/app/assets/javascripts/app/views/monitoring.jst.eco b/app/assets/javascripts/app/views/monitoring.jst.eco index 053c128ed..ceea50003 100644 --- a/app/assets/javascripts/app/views/monitoring.jst.eco +++ b/app/assets/javascripts/app/views/monitoring.jst.eco @@ -34,6 +34,9 @@ <% end %> <% end %> + <% if _.contains(@data.actions, 'restart_failed_jobs'): %> + + <% end %> - \ No newline at end of file + diff --git a/app/controllers/monitoring_controller.rb b/app/controllers/monitoring_controller.rb index ece7e4a5d..248e7b1e8 100644 --- a/app/controllers/monitoring_controller.rb +++ b/app/controllers/monitoring_controller.rb @@ -30,6 +30,7 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX token_or_permission_check issues = [] + actions = Set.new # channel check last_run_tolerance = Time.zone.now - 1.hour @@ -81,6 +82,11 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX issues.push 'scheduler not running' end + Scheduler.failed_jobs.each do |job| + issues.push "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}" + actions.add(:restart_failed_jobs) + end + token = Setting.get('monitoring_token') if issues.empty? @@ -96,8 +102,9 @@ curl http://localhost/api/v1/monitoring/health_check?token=XXX result = { healthy: false, message: issues.join(';'), - issues: issues, - token: token, + issues: issues, + actions: actions, + token: token, } render json: result end @@ -173,6 +180,14 @@ curl http://localhost/api/v1/monitoring/status?token=XXX render json: result, status: :created end + def restart_failed_jobs + access_check + + Scheduler.restart_failed_jobs + + render json: {}, status: :ok + end + private def token_or_permission_check diff --git a/app/models/scheduler.rb b/app/models/scheduler.rb index 0ce63ea30..22581ba9c 100644 --- a/app/models/scheduler.rb +++ b/app/models/scheduler.rb @@ -169,9 +169,13 @@ class Scheduler < ApplicationModel end def self._start_job(job, try_count = 0, try_run_time = Time.zone.now) - job.last_run = Time.zone.now - job.pid = Thread.current.object_id - job.save + job.update( + last_run: Time.zone.now, + pid: Thread.current.object_id, + status: 'ok', + error_message: '', + ) + logger.info "execute #{job.method} (try_count #{try_count})..." eval job.method() # rubocop:disable Lint/Eval rescue => e @@ -197,7 +201,15 @@ class Scheduler < ApplicationModel if try_run_max > try_count _start_job(job, try_count, try_run_time) else - raise "STOP thread for #{job.method} after #{try_count} tries (#{e.inspect})" + @@jobs_started[ job.id ] = false + error = "Failed to run #{job.method} after #{try_count} tries #{e.inspect}" + logger.error error + + job.update( + error_message: error, + status: 'error', + active: false, + ) end end @@ -255,4 +267,28 @@ class Scheduler < ApplicationModel end + # This function returns a list of failed jobs + # + # @example + # Scheduler.failed_jobs + # + # return [Array] + def self.failed_jobs + where(status: 'error', active: false) + end + + # This function restarts failed jobs to retry them + # + # @example + # Scheduler.restart_failed_jobs + # + # return [true] + def self.restart_failed_jobs + failed_jobs.each do |job| + job.update(active: true) + end + + true + end + end diff --git a/config/routes/monitoring.rb b/config/routes/monitoring.rb index 18f2e6c93..637330fac 100644 --- a/config/routes/monitoring.rb +++ b/config/routes/monitoring.rb @@ -1,8 +1,9 @@ Zammad::Application.routes.draw do api_path = Rails.configuration.api_path - match api_path + '/monitoring/health_check', to: 'monitoring#health_check', via: :get - match api_path + '/monitoring/status', to: 'monitoring#status', via: :get - match api_path + '/monitoring/token', to: 'monitoring#token', via: :post + match api_path + '/monitoring/health_check', to: 'monitoring#health_check', via: :get + match api_path + '/monitoring/status', to: 'monitoring#status', via: :get + match api_path + '/monitoring/token', to: 'monitoring#token', via: :post + match api_path + '/monitoring/restart_failed_jobs', to: 'monitoring#restart_failed_jobs', via: :post end diff --git a/db/migrate/20120101000001_create_base.rb b/db/migrate/20120101000001_create_base.rb index 71547ebb9..78708a076 100644 --- a/db/migrate/20120101000001_create_base.rb +++ b/db/migrate/20120101000001_create_base.rb @@ -514,6 +514,8 @@ class CreateBase < ActiveRecord::Migration t.integer :prio, null: false t.string :pid, limit: 250, null: true t.string :note, limit: 250, null: true + t.string :error_message, null: true + t.string :status, null: true t.boolean :active, null: false, default: false t.integer :updated_by_id, null: false t.integer :created_by_id, null: false diff --git a/db/migrate/20170515000001_scheduler_status.rb b/db/migrate/20170515000001_scheduler_status.rb new file mode 100644 index 000000000..6c55b2ff7 --- /dev/null +++ b/db/migrate/20170515000001_scheduler_status.rb @@ -0,0 +1,12 @@ +class SchedulerStatus < ActiveRecord::Migration + def up + + # return if it's a new setup + return if !Setting.find_by(name: 'system_init_done') + + change_table :schedulers do |t| + t.string :error_message, null: true + t.string :status, null: true + end + end +end diff --git a/spec/factories/scheduler.rb b/spec/factories/scheduler.rb new file mode 100644 index 000000000..5b484c9a1 --- /dev/null +++ b/spec/factories/scheduler.rb @@ -0,0 +1,25 @@ +FactoryGirl.define do + sequence :test_scheduler_name do |n| + "Testscheduler#{n}" + end +end + +FactoryGirl.define do + + factory :scheduler do + name { generate(:test_scheduler_name) } + last_run { Time.zone.now } + pid 1337 + prio 1 + status 'ok' + active true + period { 10.minutes } + running false + note 'test' + updated_by_id 1 + created_by_id 1 + created_at 1 + updated_at 1 + add_attribute(:method) { 'test' } + end +end diff --git a/spec/models/scheduler_spec.rb b/spec/models/scheduler_spec.rb index 7432186d0..67412c29d 100644 --- a/spec/models/scheduler_spec.rb +++ b/spec/models/scheduler_spec.rb @@ -26,6 +26,45 @@ RSpec.describe Scheduler do SpecSpace.send(:remove_const, :DelayedJobBackend) end + describe '.failed_jobs' do + + it 'does list failed jobs' do + job = create(:scheduler, status: 'error', active: false) + failed_list = described_class.failed_jobs + expect(failed_list).to be_present + expect(failed_list).to include(job) + end + + end + + describe '.restart_failed_jobs' do + + it 'does restart failed jobs' do + job = create(:scheduler, status: 'error', active: false) + described_class.restart_failed_jobs + job.reload + expect(job.active).to be true + end + end + + describe '._start_job' do + + it 'sets error status/message for failed jobs' do + job = create(:scheduler) + described_class._start_job(job) + expect(job.status).to eq 'error' + expect(job.active).to be false + expect(job.error_message).to be_present + end + + it 'executes job that is expected to succeed' do + expect(Setting).to receive(:reload) + job = create(:scheduler, method: 'Setting.reload') + described_class._start_job(job) + expect(job.status).to eq 'ok' + end + end + describe '.cleanup' do it 'gets called by .threads' do diff --git a/test/controllers/monitoring_controller_test.rb b/test/controllers/monitoring_controller_test.rb index 7914b9522..fcd066e62 100644 --- a/test/controllers/monitoring_controller_test.rb +++ b/test/controllers/monitoring_controller_test.rb @@ -391,4 +391,10 @@ class MonitoringControllerTest < ActionDispatch::IntegrationTest end + test '09 check restart_failed_jobs' do + credentials = ActionController::HttpAuthentication::Basic.encode_credentials('monitoring-admin@example.com', 'adminpw') + post '/api/v1/monitoring/restart_failed_jobs', {}, @headers.merge('Authorization' => credentials) + assert_response(200) + end + end