# Helpers for detecting a recurring signature block in a list of article texts.
module SignatureDetection

  # A run of unchanged diff lines is only recorded as a signature candidate
  # when the line that terminates the run lies more than this many diff lines
  # after the first line of the run.
  MATCH_DISTANCE_THRESHOLD = 5

=begin

try to detect the signature in a list of articles, for example

  signature = SignatureDetection.find_signature(string_list)

Every pair of strings in the list is diffed with Diffy. Long blocks of lines
that are unchanged between two strings are collected as signature candidates
and the candidate that was seen in the most pairs wins (on a tie, the
candidate that was found first wins).

returns

  signature = '...signature possible match...'

or

  signature = nil # no candidate was found (e.g. less than two strings given)

=end

  def self.find_signature(string_list)

    # candidate signature => number of string pairs it was found in
    possible_signatures = Hash.new(0)

    # compare every string with every string that follows it in the list
    Array(string_list).combination(2) do |string1_content, string2_content|
      diff_lines = Diffy::Diff.new(string1_content, string2_content).to_s.split("\n")

      unchanged_blocks(diff_lines).each do |match_content|
        possible_signatures[match_content] += 1
      end
    end

    return if possible_signatures.empty?

    # highest rated candidate, without the trailing new line
    content, _score = possible_signatures.max_by { |_content, score| score }
    content.chomp
  end

=begin

this function will search for a signature string in a string (e.g. article) and return the line number of the signature start

  signature_line = SignatureDetection.find_signature_line(signature, string)

The line number is calculated as the number of "\n" separated segments of the
text up to and including the first character of the signature, plus one.

returns

  signature_line = 123

or

  signature_line = nil # signature not found, or signature/string missing

=end

  def self.find_signature_line(signature, string)

    # nothing to search for / nothing to search in
    return if signature.nil? || string.nil? || signature == ''

    # try to find the char position of the signature
    search_position = string.index(signature)

    return if search_position.nil?

    # count new lines up to signature
    # NOTE(review): the "+ 1" offset is kept exactly as it was, callers may depend on it
    string[0..search_position].split("\n").length + 1
  end

  # Collects the signature candidates of a single diff.
  #
  # diff_lines is the diff output split into lines; a line starting with
  #   + = new content incoming
  #   - = removed content
  #   \ = end of file marker
  # terminates the current run of unchanged lines, and so does the last line
  # of the diff.
  #
  # Returns an array of strings. Each string is one sufficiently long run with
  # the first (marker) character of every line removed and every line
  # terminated by "\n".
  def self.unchanged_blocks(diff_lines)
    blocks      = []
    last_index  = diff_lines.length - 1
    block_start = nil

    diff_lines.each_with_index do |line, index|

      # a new run starts on the first line after the previous terminator
      block_start ||= index

      next if !line.start_with?('\\', '+', '-') && index != last_index

      if index - block_start > MATCH_DISTANCE_THRESHOLD

        # a terminating "+" or "\" line is not part of the run; a terminating
        # "-" line and an unchanged last line are kept in the run
        block_end = line.start_with?('\\', '+') ? index - 1 : index

        blocks << diff_lines[block_start..block_end].map { |diff_line| "#{diff_line[1..-1]}\n" }.join
      end

      block_start = nil
    end

    blocks
  end
  private_class_method :unchanged_blocks
end