2021-10-20 16:08:21 +00:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
# Procesar una lista de URIs para una lista de dominios. Esto nos
|
2021-10-26 14:33:15 +00:00
|
|
|
# permite procesar estadísticas a demanda.
|
|
|
|
#
|
|
|
|
# Hay varias cosas acá que van a convertirse en métodos propios, como la
|
|
|
|
# detección de URIs de un sitio (aunque la versión actual detecta todas
|
|
|
|
# las páginas y no solo las de posts como tenemos planeado, hay que
|
|
|
|
# resolver eso).
|
|
|
|
#
|
|
|
|
# Los hostnames de un sitio van a poder obtenerse a partir de
|
|
|
|
# Site#hostnames con la garantía de que son únicos.
|
|
|
|
class UriCollectionJob < PeriodicJob
|
2022-04-23 23:32:20 +00:00
|
|
|
include RecursiveRollup
|
|
|
|
|
2021-10-26 14:33:15 +00:00
|
|
|
# Images are ignored because there tend to be too many of them and they
|
|
|
|
# add nothing to the statistics.
|
2022-04-23 21:56:34 +00:00
|
|
|
IMAGES = %w[.png .jpg .jpeg .gif .webp .jfif].freeze
|
2021-10-26 14:33:15 +00:00
|
|
|
# Name given to the stat record that #perform creates on every run.
STAT_NAME = 'uri_collection_job'
|
|
|
|
|
|
|
|
# Rolls up the visits of every hostname and URI of a site.
#
# For each hostname, visits are rolled up per URI and per URI plus each
# extra column (referer and country).  Those rollups are then squared
# into per-host rollups and reduced into the wider intervals.
#
# @param :site_id [Integer] identifier handed to Site.find (type assumed)
# @param :once [Boolean] when falsy, run_again! is called at the end
def perform(site_id:, once: true)
  @site = Site.find site_id

  # Beginning of the previous interval
  beginning = beginning_of_interval
  # Remember the last time the job ran
  stat = site.stats.create! name: STAT_NAME
  columns = %i[http_referer geoip2_data_country_name]

  # Walk every possible hostname and URI and then group recursively so
  # nothing has to be recalculated, assuming that searching the indexed
  # rollups is faster than searching the raw table.
  #
  # Referers are only grouped by host.
  site.hostnames.each do |host|
    break if stop?

    host_dimensions = { host: host }

    # The distinct values of a column only depend on the host, so they
    # are queried at most once per host instead of once per URI.
    column_values = Hash.new do |cache, column|
      cache[column] = AccessLog.where(**host_dimensions).distinct.pluck(column)
    end

    # URIs are the source of truth for visits, because they point to the
    # pages and downloadable resources; everything else is images, CSS,
    # JS and fonts that do not contribute meaningful numbers.
    uris.each do |uri|
      break if stop?

      name = 'host|uri'
      dimensions = { host: host, uri: uri }

      rollup(name, beginning, **dimensions)
      reduce_rollup(name, beginning, **dimensions)

      columns.each do |column|
        column_name = "host|uri|#{column}"

        # Visit origins per host
        column_values[column].each do |value|
          # Build a fresh hash for every value: mutating `dimensions`
          # would leak the last value of one column into the rollups of
          # the next column.
          column_dimensions = dimensions.merge(column => value)

          rollup(column_name, beginning, **column_dimensions)
          reduce_rollup(column_name, beginning, **column_dimensions)
        end
      end
    end

    # Reduce all visits to the amount of visits per host
    square_rollup(name: 'host|uri',
                  new_name: 'host',
                  interval: starting_interval,
                  dimensions: host_dimensions,
                  beginning: beginning)

    # Accumulate into the wider intervals
    reduce_rollup('host', beginning, **host_dimensions)

    columns.each do |column|
      square_rollup(name: "host|uri|#{column}",
                    new_name: "host|#{column}",
                    interval: starting_interval,
                    dimensions: host_dimensions,
                    beginning: beginning)

      reduce_rollup("host|#{column}", beginning, **host_dimensions)
    end
  end

  # Touch the stat created at the start of the run
  stat.touch

  run_again! unless once
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
2022-04-29 18:04:42 +00:00
|
|
|
# Generates a rollup of access logs.
#
# Only completed requests not made by robots and created since
# +beginning+ are taken into account; they are grouped by the keys of
# +dimensions+ and rolled up over the starting interval.
#
# @param :name [String]
# @param :beginning [Time]
# @param :dimensions [Hash]
# @return [Object] whatever the final `.rollup` call returns; callers in
#   this file ignore it
def rollup(name, beginning, **dimensions)
  recent = AccessLog.where(**dimensions).where('created_at >= ?', beginning)
  visits = recent.completed_requests.non_robots
  grouped = visits.group(*dimensions.keys)

  grouped.rollup(name, interval: starting_interval, update: true)
end
|
2022-04-23 23:32:20 +00:00
|
|
|
|
2022-04-29 18:04:42 +00:00
|
|
|
# Reduces the calculated statistics by applying a rollup over each wider
# interval, walking Stat::INTERVALS in consecutive pairs.
#
# @param :name [String]
# @param :beginning [Time]
# @param :dimensions [Hash]
# @return [nil]
def reduce_rollup(name, beginning, **dimensions)
  # Each interval is rolled up from the one right before it
  Stat::INTERVALS.each_cons(2) do |narrower, wider|
    recursive_rollup(name: name,
                     interval_previous: narrower,
                     interval: wider,
                     dimensions: dimensions,
                     beginning: beginning)
  end

  nil
end
|
|
|
|
|
2021-10-26 14:33:15 +00:00
|
|
|
# Name under which this job records its stats.
#
# @return [String] the STAT_NAME constant
def stat_name
  STAT_NAME
end
|
|
|
|
|
2022-04-29 20:43:06 +00:00
|
|
|
# Real paths of every deploy destination of the site that exists as a
# directory, without duplicates.  The result is memoized.
#
# @return [Array<String>]
#
# TODO: Change when origin-referer is merged
def destinations
  @destinations ||= begin
    directories = site.deploys.map(&:destination).select do |path|
      File.directory?(path)
    end

    # Several deploys can point to the same directory through symlinks
    directories.map { |path| File.realpath(path) }.uniq
  end
end
|
|
|
|
|
|
|
|
# Collects every URI except images, without duplicates.  The result is
# memoized.
#
# The root locale ('') already finds the pages inside each locale
# directory, and several destinations can hold the same files, so the
# list is deduplicated to avoid computing the same rollups repeatedly.
#
# TODO: For sites with DeployLocalizedDomain we are looking for more
# URIs than needed.
#
# @return [Array<String>]
def uris
  @uris ||=
    destinations.flat_map do |destination|
      locales.flat_map do |locale|
        dir = File.join(destination, locale)

        next [] unless File.directory? dir

        # The root locale is the empty string, so squeeze "//" into "/"
        prefix = "/#{locale}/".squeeze('/')

        files(dir).map { |file| prefix + file }
      end
    end.uniq
end
|
|
|
|
|
|
|
|
# Locales of the site as strings, preceded by the empty string, which
# stands for the root of the site.  The result is memoized.
#
# @return [Array<String>]
def locales
  @locales ||= [''] + site.locales.map(&:to_s)
end
|
|
|
|
|
|
|
|
# Lists the HTML pages anywhere inside a directory plus the non-image
# files under its public/ subdirectory, as paths relative to it and
# without duplicates.
#
# Globbing is done with `base:` instead of a Dir.chdir block, because
# the working directory is shared by the whole process and chdir blocks
# conflict between threads.
#
# @param :dir [String]
# @return [Array<String>]
def files(dir)
  pages = Dir.glob('**/*.html', base: dir)

  # Paths are relative to dir, so join them before hitting the disk
  downloads = Dir.glob('public/**/*', base: dir).reject do |file|
    File.directory? File.join(dir, file)
  end
  downloads = remove_images downloads

  # HTML files under public/ match both patterns
  (pages + downloads).uniq
end
|
|
|
|
|
|
|
|
# Drops every path that points to a directory, keeping the rest in their
# original order.  Relative paths are resolved against the current
# working directory.
#
# @param :files [Array]
# @return [Array]
def remove_directories(files)
  files.reject { |path| File.directory?(path) }
end
|
|
|
|
|
|
|
|
# Drops every path whose extension, compared case-insensitively, is
# listed in IMAGES.
#
# @param :files [Array]
# @return [Array]
def remove_images(files)
  files.reject do |path|
    extension = File.extname(path).downcase

    IMAGES.include?(extension)
  end
end
|
|
|
|
end
|