2016-11-23 14:25:44 +00:00
# Copyright (C) 2012-2016 Zammad Foundation, http://zammad-foundation.org/
class MonitoringController < ApplicationController
2020-03-19 09:39:51 +00:00
# Callback wiring for this controller. The three token-capable endpoints
# (health_check, status, amount_check) use authentication_check_only; every
# other action uses the regular authentication_check.
# NOTE(review): these are *prepended* callbacks, so presumably the ones declared
# later run earlier (authentication before authorize!) - confirm against the
# Rails callback documentation before reordering.
prepend_before_action { authorize! }
prepend_before_action -> { authentication_check }, except: %i[health_check status amount_check]
prepend_before_action -> { authentication_check_only }, only: %i[health_check status amount_check]

# CSRF token verification is skipped for all actions of this controller.
skip_before_action :verify_csrf_token
2016-11-23 14:25:44 +00:00
=begin

Resource:
GET /api/v1/monitoring/health_check?token=XXX

Response:
{
  "healthy": true,
  "message": "success",
}

{
  "healthy": false,
  "message": "authentication of XXX failed; issue #2",
  "issues": ["authentication of XXX failed", "issue #2"],
}

Test:
curl http://localhost/api/v1/monitoring/health_check?token=XXX

=end
# Runs all health probes (channels, unprocessable mails, schedulers,
# background jobs, import jobs, data privacy tasks) and renders the collected
# issues as JSON.
#
# Every response carries :healthy, :message, :issues and :token (the value of
# the 'monitoring_token' setting). An unhealthy response additionally carries
# :actions, the set of remedies collected while probing (only
# :restart_failed_jobs is ever added here).
def health_check
  issues  = []
  actions = Set.new

  # channel check
  last_run_tolerance = Time.zone.now - 1.hour
  options_keys       = %w[host user uid]

  Channel.where(active: true).each do |channel|
    # Prefix that identifies the channel in an issue line, e.g.
    # "Channel: <area> in key:<host>;key:<user>;". Built on demand so the
    # options of healthy channels are never inspected.
    describe = lambda do |direction|
      details = options_keys.map { |key| channel.options[key] }
                            .reject(&:blank?)
                            .map { |value| "key:#{value};" }
                            .join
      "Channel: #{channel.area} #{direction} #{details}"
    end

    # inbound channel
    if channel.status_in == 'error'
      issues.push "#{describe.call('in')} #{channel.last_log_in}"
    end

    last_fetch = channel.preferences && channel.preferences['last_fetch']
    if last_fetch && last_fetch < last_run_tolerance
      # The prefix is built here too: a stale channel without an inbound error
      # would otherwise be reported without saying which channel it is.
      issues.push "#{describe.call('in')} channel is active but not fetched for #{helpers.time_ago_in_words(last_fetch)}"
    end

    # outbound channel
    next if channel.status_out != 'error'

    issues.push "#{describe.call('out')} #{channel.last_log_out}"
  end

  # unprocessable mail check
  directory = Rails.root.join('tmp/unprocessable_mail').to_s
  if File.exist?(directory)
    count = Dir.glob("#{directory}/*.eml").size
    if count.nonzero?
      issues.push "unprocessable mails: #{count}"
    end
  end

  # scheduler running check
  # Only the first overdue scheduler (oldest last_run first) is reported.
  Scheduler.where('active = ? AND period > 300', true).where.not(last_run: nil).order(last_run: :asc, period: :asc).each do |scheduler|
    due_at = scheduler.last_run + scheduler.period.seconds
    next if Time.zone.now - due_at < 8.minutes

    issues.push "scheduler may not run (last execution of #{scheduler.method} #{helpers.time_ago_in_words(due_at)} over) - please contact your system administrator"
    break
  end

  # No active scheduler has ever run (also true when there is no active scheduler at all).
  if Scheduler.where(active: true, last_run: nil).count == Scheduler.where(active: true).count
    issues.push 'scheduler not running'
  end

  Scheduler.failed_jobs.each do |job|
    issues.push "Failed to run scheduled job '#{job.name}'. Cause: #{job.error_message}"
    actions.add(:restart_failed_jobs)
  end

  # failed jobs check
  failed_jobs       = Delayed::Job.where('attempts > 0')
  count_failed_jobs = failed_jobs.count
  if count_failed_jobs > 10
    issues.push "#{count_failed_jobs} failing background jobs"
  end

  # Aggregate the ten oldest failing jobs per job name.
  handler_attempts_map = {}
  failed_jobs.order(:created_at).limit(10).each do |job|
    job_name = if job.class.name == 'Delayed::Backend::ActiveRecord::Job' && job.payload_object.respond_to?(:job_data)
                 job.payload_object.job_data['job_class']
               else
                 job.name
               end

    handler_attempts_map[job_name] ||= {
      count:    0,
      attempts: 0,
    }

    handler_attempts_map[job_name][:count]    += 1
    handler_attempts_map[job_name][:attempts] += job.attempts
  end

  handler_attempts_map.sort.to_h.each_with_index do |(job_name, job_data), index|
    issues.push "Failed to run background job ##{index + 1} '#{job_name}' #{job_data[:count]} time(s) with #{job_data[:attempts]} attempt(s)."
  end

  # job count check
  total_jobs = Delayed::Job.where('created_at < ?', Time.zone.now - 15.minutes).count
  if total_jobs > 8000
    issues.push "#{total_jobs} background jobs in queue"
  end

  # import jobs
  import_backends = ImportJob.backends

  # failed import jobs
  import_backends.each do |backend|
    job = ImportJob.where(
      name:    backend,
      dry_run: false,
    ).where('finished_at >= ?', 5.minutes.ago).limit(1).first

    next if job.blank?
    next if !job.result.is_a?(Hash)

    error_message = job.result[:error]
    next if error_message.blank?

    issues.push "Failed to run import backend '#{backend}'. Cause: #{error_message}"
  end

  # stuck import jobs
  # Kept as a second pass so all "failed" lines precede all "stuck" lines.
  import_backends.each do |backend| # rubocop:disable Style/CombinableLoops
    job = ImportJob.where(
      name:        backend,
      dry_run:     false,
      finished_at: nil,
    ).where('updated_at <= ?', 5.minutes.ago).limit(1).first

    next if job.blank?

    issues.push "Stuck import backend '#{backend}' detected. Last update: #{job.updated_at}"
  end

  # stuck data privacy tasks
  DataPrivacyTask.where.not(state: 'completed').where('updated_at <= ?', 30.minutes.ago).find_each do |task|
    issues.push "Stuck data privacy task (ID #{task.id}) detected. Last update: #{task.updated_at}"
  end

  token = Setting.get('monitoring_token')

  if issues.blank?
    result = {
      healthy: true,
      message: 'success',
      issues:  issues,
      token:   token,
    }
    render json: result
    return
  end

  result = {
    healthy: false,
    message: issues.join(';'),
    issues:  issues,
    actions: actions,
    token:   token,
  }
  render json: result
end
=begin

Resource:
GET /api/v1/monitoring/status?token=XXX

Response:
{
  "agents": 8123,
  "last_login": "2016-11-21T14:14:14Z",
  "counts": {
    "users": 12313,
    "tickets": 23123,
    "ticket_articles": 131451,
  },
  "last_created_at": {
    "users": "2016-11-21T14:14:14Z",
    "tickets": "2016-11-21T14:14:14Z",
    "ticket_articles": "2016-11-21T14:14:14Z",
  },
}

Test:
curl http://localhost/api/v1/monitoring/status?token=XXX

=end
# Renders instance statistics as JSON:
#   :last_login       - most recent User#last_login (nil if nobody logged in)
#   :agents           - number of users with the 'ticket.agent' permission
#   :counts           - record count per tracked model
#   :last_created_at  - created_at of the last record per tracked model
#   :storage          - total store size in kB/MB/GB; only added when the
#                       database adapter is 'postgresql' and the sum is present
def status
  newest_login_user = User.where.not(last_login: nil).order(last_login: :desc).limit(1).first

  payload = {
    counts:          {},
    last_created_at: {},
    last_login:      newest_login_user&.last_login,
    agents:          User.with_permissions('ticket.agent').count,
  }

  tracked_models = {
    users:                     User,
    groups:                    Group,
    overviews:                 Overview,
    tickets:                   Ticket,
    ticket_articles:           Ticket::Article,
    text_modules:              TextModule,
    object_manager_attributes: ObjectManager::Attribute,
    knowledge_base_categories: KnowledgeBase::Category,
    knowledge_base_answers:    KnowledgeBase::Answer,
  }
  tracked_models.each do |key, model|
    payload[:counts][key]          = model.count
    payload[:last_created_at][key] = model.last&.created_at
  end

  if ActiveRecord::Base.connection_config[:adapter] == 'postgresql'
    # Sums the size of the newest store row per stored file.
    sql = 'SELECT SUM(CAST(coalesce(size, \'0\') AS INTEGER)) FROM stores WHERE id IN (SELECT MAX(id) FROM stores GROUP BY store_file_id)'
    first_row = ActiveRecord::Base.connection.exec_query(sql)[0]

    if first_row && first_row['sum']
      bytes = first_row['sum']
      payload[:storage] = {
        kB: bytes / 1024,
        MB: bytes / 1024 / 1024,
        GB: bytes / 1024 / 1024 / 1024,
      }
    end
  end

  render json: payload
end
2018-11-02 06:35:28 +00:00
=begin

Get the number of tickets created in a certain time slot. The period is an
integer followed by a unit: s, m, h or d.

Resource:
GET /api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h
GET /api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h
GET /api/v1/monitoring/amount_check?token=XXX&periode=1h

Response:
{
  "state": "ok",
  "count": 123,
}

{
  "state": "warning",
  "message": "The limit of 2000 was exceeded with 2123 in the last 1h",
  "count": 2123,
}

{
  "state": "critical",
  "message": "The limit of 3000 was exceeded with 3123 in the last 1h",
  "count": 3123,
}

Without any min_*/max_* parameter only the count is returned:
{
  "count": 123,
}

Test:
curl 'http://localhost/api/v1/monitoring/amount_check?token=XXX&max_warning=2000&max_critical=3000&periode=1h'
curl 'http://localhost/api/v1/monitoring/amount_check?token=XXX&min_warning=2000&min_critical=3000&periode=1h'
curl 'http://localhost/api/v1/monitoring/amount_check?token=XXX&periode=1h'

=end
# Reports how many tickets were created within the trailing period given by
# params[:periode] (an integer followed by s, m, h or d) and, when thresholds
# are supplied, rates that count.
#
# Thresholds are evaluated in the order max_critical, min_critical,
# max_warning, min_warning; the first one that trips decides the result:
#   - max_*: trips when the count is greater than the threshold
#   - min_*: trips when the count is not greater than the threshold
#
# Renders { state:, message:, count: } for a tripped threshold,
# { state: 'ok', count: } when thresholds were given but none tripped, and
# only { count: } when no threshold parameter was given.
#
# Raises Exceptions::UnprocessableEntity when periode is missing, does not end
# in a valid unit or has no positive integer amount, and when an evaluated
# threshold parses to zero.
def amount_check
  raise Exceptions::UnprocessableEntity, 'periode is missing!' if params[:periode].blank?

  # Coerce first: an array/nested query parameter is not a String and would
  # otherwise fail with NoMethodError instead of the intended 422.
  periode = params[:periode].to_s

  unit = { 's' => :seconds, 'm' => :minutes, 'h' => :hours, 'd' => :days }[periode[-1]]
  raise Exceptions::UnprocessableEntity, 'periode need to have s, m, h or d as last!' if unit.nil?

  # String#to_i yields 0 for non-numeric input; a negative amount would move
  # the window start into the future and always count zero tickets.
  amount = periode[0...-1].to_i
  raise Exceptions::UnprocessableEntity, 'periode need to be an integer!' if !amount.positive?

  created_at = Time.zone.now - amount.public_send(unit)

  # The count does not depend on the threshold being checked - query it once.
  count = Ticket.where('created_at >= ?', created_at).count

  thresholds = [
    { param: :max_critical, notice: 'critical', type: 'gt' },
    { param: :min_critical, notice: 'critical', type: 'lt' },
    { param: :max_warning,  notice: 'warning',  type: 'gt' },
    { param: :min_warning,  notice: 'warning',  type: 'lt' },
  ]

  result      = nil
  state_param = false
  thresholds.each do |row|
    raw_limit = params[row[:param]]
    next if raw_limit.blank?

    limit = raw_limit.to_s.to_i
    raise Exceptions::UnprocessableEntity, "#{row[:param]} need to be an integer!" if limit.zero?

    state_param = true

    if row[:type] == 'gt'
      next if count <= limit

      message = "The limit of #{raw_limit} was exceeded with #{count} in the last #{periode}"
    else
      next if count > limit

      message = "The minimum of #{raw_limit} was undercut by #{count} in the last #{periode}"
    end

    result = {
      state:   row[:notice],
      message: message,
      count:   count,
    }
    break
  end

  # :state is only meaningful when at least one threshold was supplied.
  result ||= state_param ? { state: 'ok', count: count } : { count: count }

  render json: result
end
2016-11-23 14:25:44 +00:00
# Generates a new random monitoring token, stores it in the
# 'monitoring_token' setting and renders it as JSON with status 201 (created).
def token
  fresh_token = SecureRandom.urlsafe_base64(40)
  Setting.set('monitoring_token', fresh_token)

  render json: { token: fresh_token }, status: :created
end
2017-09-07 12:45:13 +00:00
# Delegates to Scheduler.restart_failed_jobs and answers with an empty JSON
# object and status 200. This is the action advertised as
# :restart_failed_jobs in the health_check response.
def restart_failed_jobs
  Scheduler.restart_failed_jobs

  render status: :ok, json: {}
end
2016-11-23 14:25:44 +00:00
end