# Detects recurring e-mail signatures in article bodies by diffing them
# pairwise and ranking the text blocks that did not change.
#
# Depends on the `diffy` gem (Diffy::Diff), which has to be listed in the
# Gemfile of the application.
module SignatureDetection

  # A block of unchanged diff lines is only taken as a signature candidate
  # if the diff line that closes it lies more than this many lines after
  # the first line of the block.
  MIN_MATCH_DISTANCE = 5

=begin

try to detect the signature in a list of articles, for example

  signature = SignatureDetection.find_signature(string_list)

Every string is diffed against every later string of the list. Each block
of lines which is identical in both strings and long enough (see
MIN_MATCH_DISTANCE) is a possible signature. The candidate found in the
most pairs wins; on a tie the candidate detected first wins.

returns

  signature = '...signature possible match...'

  or

  signature = nil # fewer than two strings or no common block found

=end

  def self.find_signature(string_list)

    # possible signature => number of string pairs it was found in
    possible_signatures = Hash.new(0)

    # compare each pair of strings exactly once (first with second, first
    # with third, second with third, ...)
    Array(string_list).combination(2) do |string1_content, string2_content|

      # diff both strings and split the result into single diff lines
      diff_lines = Diffy::Diff.new(string1_content, string2_content).to_s.split("\n")

      # count each unchanged block to rank the signature
      unchanged_blocks(diff_lines).each do |match_content|
        possible_signatures[match_content] += 1
      end
    end

    # pick the candidate with the highest rating
    best_match = possible_signatures.max_by { |_content, count| count }
    return if best_match.nil?

    best_match.first.chomp
  end

=begin

this function will search for a signature string in a string (e.g. article)
and return the line number (starting with 1) of the line in which the
signature starts

  signature_line = SignatureDetection.find_signature_line(signature, string)

returns

  signature_line = 123

  or

  signature_line = nil # signature not found in string

=end

  def self.find_signature_line(signature, string)

    # try to find the char position of the signature
    search_position = string.index(signature)

    return if search_position.nil?

    # every new line in front of the signature start closes one line, so the
    # signature starts in the line after them (the char at the signature
    # position itself must not be counted)
    string[0...search_position].count("\n") + 1
  end

  # Collects the text of all unchanged blocks of a diff which are long
  # enough to be a signature. Expects the lines of a unified diff where the
  # first char of each line is the marker (' ', '+', '-' or '\') and returns
  # an array of strings, each block line without marker followed by "\n".
  def self.unchanged_blocks(diff_lines)
    blocks      = []
    match_block = nil
    last_index  = diff_lines.length - 1

    diff_lines.each_with_index do |line, index|

      # remember where the current block without a difference starts
      match_block ||= index

      # a block is closed by a line starting with
      # + = new content incoming
      # - = removed content
      # \ = end of file
      # or by the last line of the diff result
      next if !line.start_with?('+', '-', '\\') && index != last_index

      if index - match_block > MIN_MATCH_DISTANCE

        # "+" and "\" lines are not part of the block, a closing "-" line
        # (or an unchanged last line) is kept as part of the block
        block_end = line.start_with?('+', '\\') ? index - 1 : index

        # cut off the diff marker of each line
        blocks << diff_lines[match_block..block_end].map { |block_line| "#{block_line[1..-1]}\n" }.join
      end

      match_block = nil
    end

    blocks
  end
  private_class_method :unchanged_blocks
end
# Unit tests for SignatureDetection based on the mail fixtures in
# test/fixtures/email_signature_detection.
#
# NOTE(review): the lines in front of the class definition (e.g. requires)
# are outside of this excerpt - keep them when applying this block.
class EmailSignaturDetectionTest < ActiveSupport::TestCase

  test 'test case I - sender a' do

    # fixtures of sender a with the expected line in which the signature starts
    fixture_files = {
      'email_signature_detection/client_a_1.txt' => { line: 10 },
      'email_signature_detection/client_a_2.txt' => { line: 20 },
      'email_signature_detection/client_a_3.txt' => { line: 6 },
    }

    expected_signature = "\nMit freundlichen Grüßen\n\nBob Smith\nBerechtigungen und dez. Department\n________________________________\n\nMusik AG\nBerechtigungen und dez. Department (ITPBM)\nKastanien 2\n12345 Hornhausen\nTel.: +49 911 6760\nFax: +49 911 85 6760\nMobil: +49 173 911\nE-Mail: Bob.Smith@music.com\nhttp://www.music.com\n\nMusik AG | Kastanien 2 | 12345 Hornhausen\nSitz der AG: Hornhausen, HRB xxxxx | USt.-ID: DE 111222333444\nVorstand: Marc Smith, Weber Huber\nAufsichtsrat: Max Mix (Vors.)"

    assert_signature_detection(fixture_files, expected_signature)
  end

  test 'test case II - sender b' do

    # fixtures of sender b with the expected line in which the signature starts
    fixture_files = {
      'email_signature_detection/client_b_1.txt' => { line: 26 },
      'email_signature_detection/client_b_2.txt' => { line: 4 },
      'email_signature_detection/client_b_3.txt' => { line: 6 },
    }

    expected_signature = "\nFreundliche Grüße\n\nGünter Lässig\nLokale Daten\n\nMusic GmbH\nBaustraße 123, 12345 Max City\nTelefon 0123 5432114\nTelefax 0123 5432139\nE-Mail Günter.Lässig@example.com\n\nExample. Zusammen für eine bessere Welt.\n[cid:image001.png@01CE92A6.EC495B60]\n\n[cid:image002.png@01CE92A6.EC495B60]\n\n[cid:image003.png@01CE92A6.EC495B60]\n\n[cid:image004.png@01CE92A6.EC495B60]\n\n[cid:image005.jpg@01CE92A6.EC495B60]\n\n[cid:image006.png@01CE92A6.EC495B60]\n\nSitz der Gesellschaft: Max City, Amtsgericht Max City HRB Nr. 1234\nGeschäftsführer: Bob Smith\nVorsitzender des Aufsichtsrats: Alex Marx"

    assert_signature_detection(fixture_files, expected_signature)
  end

  private

  # Reads all given fixtures, detects the signature over their contents and
  # checks the detected signature as well as the line in which it starts in
  # each fixture.
  #
  # fixture_files      - Hash of fixture path (relative to test/fixtures) =>
  #                      { line: expected start line of the signature }
  # expected_signature - String with the signature which has to be detected
  def assert_signature_detection(fixture_files, expected_signature)

    # File.read closes the file handle even if reading fails
    fixture_files.each do |filepath, fixture|
      fixture[:content] = File.read("#{Rails.root}/test/fixtures/#{filepath}")
    end

    # hashes keep their insertion order, so the strings are passed in the
    # order in which the fixtures are listed
    fixture_files_string_list = fixture_files.values.map { |fixture| fixture[:content] }

    signature = SignatureDetection.find_signature(fixture_files_string_list)
    assert_equal(expected_signature, signature)

    fixture_files.each do |filepath, fixture|
      signature_line = SignatureDetection.find_signature_line(signature, fixture[:content])
      assert_equal(fixture[:line], signature_line, "signature line of #{filepath}")
    end
  end
end