2016-11-23 14:25:44 +00:00
|
|
|
# Copyright (C) 2012-2016 Zammad Foundation, http://zammad-foundation.org/
|
|
|
|
|
|
|
|
class MonitoringController < ApplicationController
|
2017-11-23 08:09:44 +00:00
|
|
|
# All actions except health_check and status require an authenticated user
# holding the 'admin.monitoring' permission. The two excluded actions call
# token_or_permission_check themselves.
prepend_before_action(
  lambda { authentication_check(permission: 'admin.monitoring') },
  except: [:health_check, :status]
)

# CSRF verification is skipped for every action of this controller.
skip_before_action(:verify_csrf_token)
|
2016-11-23 14:25:44 +00:00
|
|
|
|
|
|
|
=begin

Resource:
GET /api/v1/monitoring/health_check?token=XXX

Response (no issues found):
{
  "healthy": true,
  "message": "success",
  "token": "XXX"
}

Response (issues found; "message" is the list of issues joined with ";"):
{
  "healthy": false,
  "message": "authentication of XXX failed;issue #2",
  "issues": ["authentication of XXX failed", "issue #2"],
  "actions": [],
  "token": "XXX"
}

Test:
curl http://localhost/api/v1/monitoring/health_check?token=XXX

=end
|
|
|
|
|
|
|
|
# Runs all health checks and renders the outcome as JSON.
#
# Checks, in this order: active channels (inbound/outbound errors and stale
# fetches), unprocessable mails on disk, schedulers, failed scheduler jobs,
# failing background jobs, and failed or stuck import jobs.
#
# Renders { healthy: true, message: 'success', token: } when no issue was
# collected, otherwise { healthy: false, message:, issues:, actions:, token: }
# where `message` is the list of issues joined with ';'.
#
# Access is guarded by token_or_permission_check, which raises
# Exceptions::NotAuthorized for unauthorized requests.
def health_check
  token_or_permission_check

  issues  = []
  actions = Set.new

  # channel check
  last_run_tolerance = Time.zone.now - 1.hour
  Channel.where(active: true).each do |channel|

    # Builds the "Channel: <area> <direction> key:<value>;..." prefix used by
    # every channel issue. Blank option values are skipped.
    # NOTE(review): the literal "key:" prefix is kept unchanged for output
    # compatibility; it looks like the option name was intended - confirm.
    channel_label = lambda do |direction|
      %w[host user uid].inject("Channel: #{channel.area} #{direction} ") do |label, key|
        value = (channel.options || {})[key]
        value.blank? ? label : "#{label}key:#{value};"
      end
    end

    # inbound channel
    if channel.status_in == 'error'
      issues.push "#{channel_label.call('in')} #{channel.last_log_in}"
    end

    # The label is built here as well, so a stale channel is identified even
    # when its inbound status is not 'error' (previously the prefix was nil).
    last_fetch = channel.preferences && channel.preferences['last_fetch']
    if last_fetch && last_fetch < last_run_tolerance
      issues.push "#{channel_label.call('in')} channel is active but not fetched for 1 hour"
    end

    # outbound channel
    next if channel.status_out != 'error'

    issues.push "#{channel_label.call('out')} #{channel.last_log_out}"
  end

  # unprocessable mail check (a missing directory simply yields no matches)
  directory = Rails.root.join('tmp', 'unprocessable_mail').to_s
  unprocessable_count = Dir.glob("#{directory}/*.eml").size
  issues.push "unprocessable mails: #{unprocessable_count}" if unprocessable_count.positive?

  # scheduler check: one overdue scheduler is enough to report the problem
  Scheduler.where(active: true).where.not(last_run: nil).each do |scheduler|
    next if scheduler.period <= 300
    next if scheduler.last_run + scheduler.period.seconds > Time.zone.now - 5.minutes

    issues.push 'scheduler not running'
    break
  end

  # no active scheduler has ever run (also true when there is no active scheduler)
  if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
    issues.push 'scheduler not running'
  end

  Scheduler.failed_jobs.each do |job|
    issues.push "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}"
    actions.add(:restart_failed_jobs)
  end

  # failed jobs check
  failed_jobs       = Delayed::Job.where('attempts > 0')
  count_failed_jobs = failed_jobs.count

  issues.push "#{count_failed_jobs} failing background jobs." if count_failed_jobs > 10

  # only the first 10 failing jobs are listed, grouped by name, most frequent first
  listed_failed_jobs = failed_jobs.select(:handler, :attempts).limit(10)
  sorted_failed_jobs = listed_failed_jobs.group_by(&:name).sort_by { |_name, entries| entries.length }.reverse
  sorted_failed_jobs.each.with_index(1) do |(name, jobs), position|
    attempts = jobs.sum(&:attempts)

    issues.push "Failed to run background job ##{position} '#{name}' #{jobs.count} time(s) with #{attempts} attempt(s)."
  end

  # import jobs
  import_backends = ImportJob.backends

  # failed import jobs (finished within the last 5 minutes with an error result)
  import_backends.each do |backend|
    job = ImportJob.where(name: backend, dry_run: false)
                   .where('finished_at >= ?', 5.minutes.ago)
                   .limit(1)
                   .first

    next if job.blank?
    next if !job.result.is_a?(Hash)

    error_message = job.result[:error]
    next if error_message.blank?

    issues.push "Failed to run import backend '#{backend}'. Cause: #{error_message}"
  end

  # stuck import jobs (unfinished and not updated for at least 5 minutes)
  import_backends.each do |backend|
    job = ImportJob.where(name: backend, dry_run: false, finished_at: nil)
                   .where('updated_at <= ?', 5.minutes.ago)
                   .limit(1)
                   .first

    next if job.blank?

    issues.push "Stuck import backend '#{backend}' detected. Last update: #{job.updated_at}"
  end

  token = Setting.get('monitoring_token')

  result =
    if issues.blank?
      {
        healthy: true,
        message: 'success',
        token:   token,
      }
    else
      {
        healthy: false,
        message: issues.join(';'),
        issues:  issues,
        actions: actions,
        token:   token,
      }
    end

  render json: result
end
|
|
|
|
|
|
|
|
=begin

Resource:
GET /api/v1/monitoring/status?token=XXX

Response (the "storage" section is only present on PostgreSQL):
{
  "agents": 8123,
  "last_login": "2016-11-21T14:14:14Z",
  "counts": {
    "users": 12313,
    "groups": 12,
    "overviews": 34,
    "tickets": 23123,
    "ticket_articles": 131451
  },
  "last_created_at": {
    "users": "2016-11-21T14:14:14Z",
    "groups": "2016-11-21T14:14:14Z",
    "overviews": "2016-11-21T14:14:14Z",
    "tickets": "2016-11-21T14:14:14Z",
    "ticket_articles": "2016-11-21T14:14:14Z"
  },
  "storage": {
    "kB": 5242880,
    "MB": 5120,
    "GB": 5
  }
}

Test:
curl http://localhost/api/v1/monitoring/status?token=XXX

=end
|
|
|
|
|
|
|
|
# Renders usage statistics of the system as JSON.
#
# The rendered hash contains:
#   counts          - number of records per model (users, groups, overviews,
#                     tickets, ticket_articles)
#   last_created_at - created_at of the last record per model (nil if none)
#   last_login      - most recent User#last_login (nil if nobody logged in)
#   agents          - number of users with the 'ticket.agent' permission
#   storage         - PostgreSQL only: summed attachment size in kB/MB/GB
#
# Access is guarded by token_or_permission_check, which raises
# Exceptions::NotAuthorized for unauthorized requests.
def status
  token_or_permission_check

  last_login_user = User.where('last_login IS NOT NULL').order(last_login: :desc).limit(1).first

  result = {
    counts:          {},
    last_created_at: {},
    last_login:      last_login_user&.last_login,
    agents:          User.with_permissions('ticket.agent').count,
  }

  models = {
    users:           User,
    groups:          Group,
    overviews:       Overview,
    tickets:         Ticket,
    ticket_articles: Ticket::Article,
  }
  models.each do |key, model|
    result[:counts][key]          = model.count
    result[:last_created_at][key] = model.last&.created_at
  end

  if ActiveRecord::Base.connection_config[:adapter] == 'postgresql'
    # Count every stored file once: pick a single stores row per store_file_id.
    # The previous filter compared stores.id with store_file_id values, i.e.
    # ids of two different tables, and therefore summed unrelated rows.
    sql = "SELECT SUM(CAST(coalesce(size, '0') AS INTEGER)) FROM stores " \
          'WHERE id IN (SELECT MIN(id) FROM stores GROUP BY store_file_id)'
    row = ActiveRecord::Base.connection.exec_query(sql)[0]

    # SUM is NULL when there are no rows; in that case no storage section is added
    if row && row['sum']
      bytes = row['sum'].to_i # tolerate adapters returning the aggregate as a string
      result[:storage] = {
        kB: bytes / 1024,
        MB: bytes / 1024 / 1024,
        GB: bytes / 1024 / 1024 / 1024,
      }
    end
  end

  render json: result
end
|
|
|
|
|
|
|
|
# Generates a fresh monitoring token, stores it in the 'monitoring_token'
# setting and renders it as { token: } with HTTP status 201 (created).
#
# access_check runs first and raises Exceptions::NotAuthorized on failure.
def token
  access_check

  new_token = SecureRandom.urlsafe_base64(40)
  Setting.set('monitoring_token', new_token)

  render json: { token: new_token }, status: :created
end
|
|
|
|
|
2017-09-07 12:45:13 +00:00
|
|
|
# Asks the Scheduler to restart its failed jobs and answers with an empty
# JSON object and HTTP status 200.
#
# access_check runs first and raises Exceptions::NotAuthorized on failure.
def restart_failed_jobs
  access_check

  Scheduler.restart_failed_jobs

  render(json: {}, status: :ok)
end
|
|
|
|
|
2016-11-23 14:25:44 +00:00
|
|
|
private
|
|
|
|
|
|
|
|
# Grants access when the request is made by a user holding the
# 'admin.monitoring' permission, or when the `token` request parameter equals
# the configured 'monitoring_token' setting.
#
# Returns nil when access is granted.
# Raises Exceptions::NotAuthorized otherwise.
def token_or_permission_check
  return if authentication_check_only(permission: 'admin.monitoring')

  expected_token = Setting.get('monitoring_token').to_s
  given_token    = params[:token].to_s

  # An unset monitoring token must never match: without this guard a request
  # without any token would pass because nil == nil.
  # The comparison is constant-time so the token cannot be probed via timing.
  if !expected_token.empty? && ActiveSupport::SecurityUtils.secure_compare(expected_token, given_token)
    return
  end

  raise Exceptions::NotAuthorized
end
|
|
|
|
|
|
|
|
# Ensures that an active 'admin.monitoring' permission exists.
#
# This does not look at the requesting user; the per-user permission check
# for the actions calling this method is done by the class-level
# prepend_before_action.
#
# Returns nil when the permission is present and active.
# Raises Exceptions::NotAuthorized otherwise.
def access_check
  # exists? answers the question without instantiating the record
  return if Permission.exists?(name: 'admin.monitoring', active: true)

  raise Exceptions::NotAuthorized
end
|
|
|
|
|
|
|
|
end
|