diff --git a/lib/signature_detection.rb b/lib/signature_detection.rb index a48067548..930087b9e 100644 --- a/lib/signature_detection.rb +++ b/lib/signature_detection.rb @@ -20,91 +20,34 @@ returns =end def self.find_signature(messages) + signature_candidates = Hash.new(0) # : + messages = messages.map { |m| m[:content_type].match?(%r{text/html}i) ? m[:content].html2text(true) : m[:content] } + message_pairs = messages.each_cons(2).to_a + diffs = message_pairs.map { |msg_pair| Diffy::Diff.new(*msg_pair).to_s } - string_list = messages.map { |m| m[:content] } - .map do |c| - c.match?(%r{text/html}i) ? c.html2text(true) : c - end + # Find the first 5- to 10-line common substring in each diff + diffs.map { |d| d.split("\n") }.each do |diff_lines| + # Get line numbers in diff representing changes (those starting with +, -, \) + delta_indices = diff_lines.map.with_index { |l, i| l.start_with?(' ') ? nil : i }.compact - # hash with possible signature and count of matches in string list - possible_signatures = {} + # Add boundaries at start and end + delta_indices.unshift(-1).push(diff_lines.length) - # loop all strings in array - string_list.each_with_index do |_main_string, main_string_index| - break if main_string_index + 1 > string_list.length - 1 + # Find first gap of 5+ lines between deltas (i.e., the common substring's location) + sig_range = delta_indices.each_cons(2) + .map { |head, tail| [head + 1, tail - 1] } + .find { |head, tail| tail > head + 4 } - # loop all all strings in array except of the previous index - ( main_string_index + 1..string_list.length - 1 ).each do |second_string_index| + # Take up to 10 lines from this "gap" (i.e., the common substring) + match_content = diff_lines[sig_range.first..sig_range.last] + .map { |l| l.sub(/^./, '') } + .first(10).join("\n") - # get content of string 1 - string1_content = string_list[main_string_index] - - # get content of string 2 - string2_content = string_list[second_string_index] - - # diff strings - diff_result = Diffy::Diff.new(string1_content, string2_content) - - # split diff result by new line - diff_result_array = diff_result.to_s.split("\n") - - # define start index for blocks with no difference - match_block = nil - - # loop of lines of the diff result - ( 0..diff_result_array.length - 1 ).each do |diff_string_index| - - # if no block with difference is defined then we try to find a string block without a difference - if !match_block - match_block = diff_string_index - end - - # get line of diff result with current loop inde - line = diff_result_array[diff_string_index] - - # check if the line starts with - # + = new content incoming - # - = removed content - # \ = end of file - # or if the current line is the last line of the diff result - next if line !~ /^(\\|\+|\-)/i && diff_string_index != diff_result_array.length - 1 - - # if the count of the lines without any difference is higher than 4 lines - if diff_string_index - match_block > 4 - - # define the block size without any difference - # except "-" because in this case 1 line is removed to much - match_block_total = diff_string_index + (line.match?(/^(\\|\+)/i) ? -1 : 0) - - # get string of possible signature, use only the first 10 lines - match_max_content = 0 - match_content = '' - ( match_block..match_block_total ).each do |match_block_index| - break if match_max_content == 10 - - match_max_content += 1 - match_content += "#{diff_result_array[match_block_index][1..-1]}\n" - end - - # count the match of the signature in string list to rank - # the signature - possible_signatures[match_content] ||= 0 - possible_signatures[match_content] += 1 - - break - end - - match_block = nil - end - end + # Add this substring to the signature_candidates hash and increment its match score + signature_candidates[match_content] += 1 end - # loop all possible signature by rating and return highest rating - possible_signatures.sort { |a1, a2| a2[1].to_i <=> a1[1].to_i }.map do |content, _score| - return content.chomp - end - - nil + signature_candidates.max_by { |_, score| score }&.first end =begin @@ -124,18 +67,13 @@ returns =end def self.find_signature_line(signature, string, content_type) - - if content_type.match?(%r{text/html}i) - string = string.html2text(true) - end + string = string.html2text(true) if content_type.match?(%r{text/html}i) # try to find the char position of the signature search_position = string.index(signature) - return if search_position.nil? - # count new lines up to signature - string[0..search_position].split("\n").length + 1 + string[0..search_position].split("\n").length + 1 if search_position.present? end =begin @@ -256,11 +194,12 @@ returns =end def self.rebuild_all_articles - article_type = Ticket::Article::Type.lookup(name: 'email') - Ticket::Article.select('id').where(type_id: article_type.id).order(id: :desc).each do |local_article| - article = Ticket::Article.find(local_article.id) - user = User.find(article.created_by_id) + + Ticket::Article.where(type_id: article_type.id) + .order(id: :desc) + .find_each(batch_size: 10) do |article| + user = User.lookup(id: article.created_by_id) next if !user.preferences[:signature_detection] signature_line = find_signature_line( @@ -269,10 +208,9 @@ returns article.content_type, ) next if !signature_line - next if article.preferences[:signature_detection] == signature_line article.preferences[:signature_detection] = signature_line - article.save + article.save if article.changed? end true end