require 'rchardet' class String alias old_strip strip alias old_strip! strip! def strip! begin sub!(/\A[[[:space:]]\u{200B}\u{FEFF}]+/, '') sub!(/[[[:space:]]\u{200B}\u{FEFF}]+\Z/, '') # if incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) (Encoding::CompatibilityError), use default rescue Encoding::CompatibilityError old_strip! end self end def strip begin new_string = sub(/\A[[[:space:]]\u{200B}\u{FEFF}]+/, '') new_string.sub!(/[[[:space:]]\u{200B}\u{FEFF}]+\Z/, '') # if incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) (Encoding::CompatibilityError), use default rescue Encoding::CompatibilityError new_string = old_strip end new_string end def message_quote quote = split("\n") body_quote = '' quote.each do |line| body_quote = "#{body_quote}> #{line}\n" end body_quote end def word_wrap(*args) options = args.extract_options! if args.present? options[:line_width] = args[0] || 82 end options.reverse_merge!(line_width: 82) lines = self lines.split("\n").collect do |line| line.length > options[:line_width] ? line.gsub(/(.{1,#{options[:line_width]}})(\s+|$)/, "\\1\n").strip : line end * "\n" end =begin filename = 'Some::Module'.to_filename returns 'some/module' =end def to_filename camel_cased_word = dup camel_cased_word.gsub(/::/, '/') .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2') .gsub(/([a-z\d])([A-Z])/, '\1_\2') .tr('-', '_').downcase end =begin filename = 'some/module.rb'.to_classname returns 'Some::Module' =end def to_classname camel_cased_word = dup camel_cased_word.delete_suffix!('.rb') camel_cased_word.split('/').map(&:camelize).join('::') end # because of mysql inno_db limitations, strip 4 bytes utf8 chars (e. g. emojis) # unfortunaly UTF8mb4 will raise other limitaions of max varchar and lower index sizes # More details: http://pjambet.github.io/blog/emojis-and-mysql/ def utf8_to_3bytesutf8 return self if Rails.application.config.db_4bytes_utf8 each_char.select do |c| if c.bytes.count > 3 Rails.logger.warn "strip out 4 bytes utf8 chars '#{c}' of '#{self}'" next end c end .join('') end =begin text = html_string.html2text returns 'string with text only' =end def html2text(string_only = false, strict = false) string = dup # in case of invalid encoding, strip invalid chars # see also test/data/mail/mail021.box # note: string.encode!('UTF-8', 'UTF-8', :invalid => :replace, :replace => '?') was not detecting invalid chars if !string.valid_encoding? string = string.chars.select(&:valid_encoding?).join end # remove html comments string.gsub!(//m, '') # find and replace it with [x] link_list = '' counter = 0 if string_only string.gsub!(%r{]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder| link = $3 text = $6 text.gsub!(/<.+?>/, '') link_compare = link.dup if link_compare.present? link.strip! link_compare.strip! link_compare.downcase! link_compare.sub!(%r{/$}, '') end text_compare = text.dup if text_compare.present? text.strip! text_compare.strip! text_compare.downcase! text_compare.sub!(%r{/$}, '') end if link_compare.present? && text_compare.blank? link elsif link_compare.blank? && text_compare.present? text elsif link_compare && link_compare =~ /^mailto/i text elsif link_compare.present? && text_compare.present? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase) "######LINKEXT:#{link}/TEXT:#{text}######" elsif text !~ /^http/ "#{text} (######LINKRAW:#{link}######)" else "#{link} (######LINKRAW:#{text}######)" end end elsif string.scan(//ix) do link = $2 counter = counter + 1 link_list += "[#{counter}] #{link}\n" "[#{counter}] " end end # remove style tags with content string.gsub!(%r{(.+?)}im, '') # remove empty lines string.gsub!(/^[[:space:]]*/m, '') if strict string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######') end # pre/code handling 1/2 string.gsub!(%r{
(.+?)
}m) do |placeholder| placeholder.gsub(/\n/, '###BR###') end string.gsub!(%r{(.+?)}m) do |placeholder| placeholder.gsub(/\n/, '###BR###') end # insert spaces on [A-z]\n[A-z] string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2') # remove all new lines string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '') # blockquote handling string.gsub!(%r{]*)>(.+?)}m) do "\n#{$2.html2text(true).gsub(/^(.*)$/, '> \1')}\n" end # pre/code handling 2/2 string.gsub!(/###BR###/, "\n") # add counting string.gsub!(/]*)>/i, "\n* ") # add hr string.gsub!(%r{]*)>}i, "\n___\n") # add h\d string.gsub!(%r{}i, "\n") # add new lines string.gsub!(%r{}im, "\n") string.gsub!(%r{

}im, "\n") string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n") string.gsub!(%r{}i, "\n") string.gsub!(%r{}i, ' ') # strip all other tags string.gsub!(/<.+?>/, '') # replace multiple spaces with one string.gsub!(/ /, ' ') # add hyperlinks if strict string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) do |_placeholder| pre = $1 content = $2 post = $5 if content.match?(/^www/i) content = "http://#{content}" end if content =~ /^(http|https|ftp|tel)/i "#{pre}######LINKRAW:#{content}#######{post}" else "#{pre}#{content}#{post}" end end end # try HTMLEntities, if it fails on invalid signes, use manual way begin coder = HTMLEntities.new string = coder.decode(string) rescue # strip all & < > " string.gsub!('&', '&') string.gsub!('<', '<') string.gsub!('>', '>') string.gsub!('"', '"') string.gsub!(' ', ' ') # encode html entities like "–" string.gsub!(/(&\#(\d+);?)/x) do $2.chr end # encode html entities like "d;" string.gsub!(/(&\#[xX]([0-9a-fA-F]+);?)/x) do chr_orig = $1 hex = $2.hex if hex chr = hex.chr if chr chr_orig = chr else chr_orig end else chr_orig end # check valid encoding begin if !chr_orig.encode('UTF-8').valid_encoding? chr_orig = '?' end rescue chr_orig = '?' end chr_orig end end string = string.utf8_encode(fallback: :read_as_sanitized_binary) # remove tailing empty spaces string.gsub!(/[[:blank:]]+$/, '') # remove double multiple empty lines string.gsub!(/\n\n\n+/, "\n\n") # add extracted links if link_list != '' string += "\n\n\n#{link_list}" end # remove double multiple empty lines string.gsub!(/\n\n\n+/, "\n\n") string.strip end =begin html = text_string.text2html =end def text2html text = CGI.escapeHTML(self) text.gsub!(/\n/, '
') text.chomp end =begin html = text_string.text2html =end def html2html_strict string = dup string = HtmlSanitizer.cleanup_replace_tags(string) string = HtmlSanitizer.strict(string, true).strip string = HtmlSanitizer.cleanup(string).strip # as fallback, use html2text and text2html if string.blank? string = html2text.text2html string.signature_identify('text') marker_template = '' string.sub!(/######SIGNATURE_MARKER######/, marker_template) string.gsub!(/######SIGNATURE_MARKER######/, '') return string.chomp end string.gsub!(%r{(

[[:space:]]*

([[:space:]]*)){2,}}im, '

 

\2') string.gsub!(%r\
[[:space:]]*(([[:space:]]*)){2,}\im, '

\3') string.gsub!(%r\[[:space:]]*(
[[:space:]]*){3,}[[:space:]]*
\im, '

') string.gsub!(%r\
[[:space:]]*(
[[:space:]]*){1,}[[:space:]]*
\im, '
 
') string.gsub!(%r\
[[:space:]]*(
[[:space:]]*
[[:space:]]*){2,}
\im, '
 
') string.gsub!(%r\

[[:space:]]*

([[:space:]]*){2,}[[:space:]]*\im, '


') string.gsub!(%r{

[[:space:]]*

([[:space:]]*)+

[[:space:]]*

}im, '

') string.gsub!(%r\(
[[:space:]]*
[[:space:]]*){2,}\im, '
') string.gsub!(%r{
 
[[:space:]]*(
 
){1,}}im, '
 
') string.gsub!(/(
[[:space:]]*){3,}/im, '

') string.gsub!(%r\([[:space:]]*){3,}\im, '

') string.gsub!(%r{

[[:space:]]+

}im, '

 

') string.gsub!(%r{\A([[:space:]]*)*}i, '') string.gsub!(%r{[[:space:]]*([[:space:]]*)*\Z}i, '') string.gsub!(%r{(

){1,10}\Z}i, '') string.signature_identify('html') marker_template = '' string.sub!(/######SIGNATURE_MARKER######/, marker_template) string.gsub!(/######SIGNATURE_MARKER######/, '') string.chomp end def signature_identify(type = 'text', force = false) string = self marker = '######SIGNATURE_MARKER######' if type == 'html' map = [ '[[:space:]]*(--|__)', '<\/div>[[:space:]]*(--|__)', '

[[:space:]]*(--|__)', '(|

|

)[[:space:]]*(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]*', '(
|
)[[:space:]]*
[[:space:]]*(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]+', '[[:space:]]*
[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]]', '[[:space:]]*
[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]].{1,500}: map['apple-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:' # Thunderbird # Am 04.03.2015 um 12:47 schrieb Alf Aardvark: map['thunderbird-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:' # Thunderbird default - http://kb.mozillazine.org/Reply_header_settings # On 01-01-2007 11:00 AM, Alf Aardvark wrote: map['thunderbird-en-default'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10},[[:space:]].{1,250}(wrote):' # http://kb.mozillazine.org/Reply_header_settings # Alf Aardvark wrote, on 01-01-2007 11:00 AM: map['thunderbird-en'] = '^.{1,250}[[:space:]](wrote),[[:space:]]on[[:space:]].{3,20}:' # otrs # 25.02.2015 10:26 - edv hotline wrote: # 25.02.2015 10:26 - edv hotline schrieb: map['otrs-en-de'] = '^.{6,10}[[:space:]].{3,10}[[:space:]]-[[:space:]].{1,250}[[:space:]](wrote|schrieb):' # Ms # rubocop:disable Style/AsciiComments # From: Martin Edenhofer via Znuny Support [mailto:support@znuny.inc] # Send: Donnerstag, 2. April 2015 10:00 # To/Cc/Bcc: xxx # Subject: xxx # - or - # From: xxx # To/Cc/Bcc: xxx # Date: 01.04.2015 12:41 # Subject: xxx # - or - # De : xxx # À/?/?: xxx # Envoyé : mercredi 29 avril 2015 17:31 # Objet : xxx # rubocop:enable Style/AsciiComments # en/de/fr | sometimes ms adds a space to "xx : value" map['ms-en-de-fr_from'] = '^(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ)( ?):[[:space:]].+?' map['ms-en-de-fr_from_html'] = "\n######b######(From|Von|De)([[:space:]]?):([[:space:]]?)(######\/b######)[[:space:]].+?" # word 14 # edv hotline wrote: # edv hotline schrieb: #map['word-en-de'] = "[^#{marker}].{1,250}\s(wrote|schrieb):" map.each_value do |regexp| string.sub!(/#{regexp}/) do |placeholder| "#{marker}#{placeholder}" rescue # regexp was not possible because of some string encoding issue, use next Rails.logger.debug { "Invalid string/charset combination with regexp #{regexp} in string" } end end string end # Returns a copied string whose encoding is UTF-8. # If both the provided and current encodings are invalid, # an auto-detected encoding is tried. # # Supports some fallback strategies if a valid encoding cannot be found. # # Options: # # * from: An encoding to try first. # Takes precedence over the current and auto-detected encodings. # # * fallback: The strategy to follow if no valid encoding can be found. # * `:output_to_binary` returns an ASCII-8BIT-encoded string. # * `:read_as_sanitized_binary` returns a UTF-8-encoded string with all # invalid byte sequences replaced with "?" characters. def utf8_encode(**options) dup.utf8_encode!(options) end def utf8_encode!(**options) return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding? # convert string to given charset, if valid_encoding? is true if options[:from].present? begin encoding = Encoding.find(options[:from]) if encoding.present? && dup.force_encoding(encoding).valid_encoding? force_encoding(encoding) return encode!('utf-8', encoding) end rescue ArgumentError, EncodingError => e Rails.logger.error { e.inspect } end end # try to find valid encodings of string viable_encodings.each do |enc| return encode!('utf-8', enc) rescue EncodingError => e Rails.logger.error { e.inspect } end case options[:fallback] when :output_to_binary force_encoding('ascii-8bit') when :read_as_sanitized_binary encode!('utf-8', 'ascii-8bit', invalid: :replace, undef: :replace, replace: '?') else raise EncodingError, 'could not find a valid input encoding' end end private def viable_encodings(try_first: nil) return dup.viable_encodings(try_first: try_first) if frozen? provided = Encoding.find(try_first) if try_first.present? original = encoding detected = CharDet.detect(self)['encoding'] [provided, original, detected] .compact .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT } .reject { |e| Encoding.find(e) == Encoding::UTF_8 } .select { |e| force_encoding(e).valid_encoding? } .tap { force_encoding(original) } # clean up changes from previous line # if `try_first` is not a valid encoding, try_first again without it rescue ArgumentError try_first.present? ? viable_encodings : raise end end