From 5a8f37dc6f800c985f4a0571eb674ce7b56d70f4 Mon Sep 17 00:00:00 2001 From: Martin Edenhofer Date: Wed, 5 Dec 2012 02:27:56 +0100 Subject: [PATCH] Moved to own encode.rb for utf8 input checks/validation. --- app/models/channel/email_parser.rb | 46 ++++-------------------------- lib/encode.rb | 31 ++++++++++++++++++++ lib/rss.rb | 4 +-- 3 files changed, 39 insertions(+), 42 deletions(-) create mode 100644 lib/encode.rb diff --git a/app/models/channel/email_parser.rb b/app/models/channel/email_parser.rb index 08a3b02b4..812767dd9 100644 --- a/app/models/channel/email_parser.rb +++ b/app/models/channel/email_parser.rb @@ -1,42 +1,8 @@ # encoding: utf-8 require 'mail' -#require 'iconv' + class Channel::EmailParser - def conv (charset, string) - - # if no charset is given, use LATIN1 as default - if !charset || charset == 'US-ASCII' || charset == 'ASCII-8BIT' - charset = 'LATIN1' - end - - # return if string is false - return string if !string - - # validate already existing utf8 strings - if charset.downcase == 'utf8' || charset.downcase == 'utf-8' - begin - - # return if encoding is valid - utf8 = string.force_encoding('UTF-8') - return utf8 if utf8.valid_encoding? - - # try to encode from Windows-1252 to utf8 - string.encode!( 'UTF-8', 'Windows-1252' ) - - rescue EncodingError => e - puts "Bad encoding: #{new_value.inspect}" - string.encode!( 'UTF-8', invalid: :replace, undef: :replace, replace: '?' 
#require 'iconv'

# Normalises text received from external sources (mail headers and bodies,
# RSS fields) to UTF-8.
class Encode

  # Charset labels treated as "no usable charset declared".
  UNSPECIFIED_CHARSETS = [ 'US-ASCII', 'ASCII-8BIT' ].freeze

  # Encoding assumed when no usable charset is declared (Latin-1).
  DEFAULT_CHARSET = 'ISO-8859-1'.freeze

  # Labels that declare the input to be UTF-8 already.
  UTF8_CHARSETS = [ 'utf8', 'utf-8' ].freeze

  # Convert +string+ from +charset+ to UTF-8.
  #
  # charset - declared charset label of +string+ (String or nil). A missing,
  #           empty, US-ASCII or ASCII-8BIT label is treated as Latin-1.
  # string  - text to convert; nil/false is returned untouched.
  #
  # Returns the converted string. Unfrozen input is converted in place and
  # returned; frozen input is copied first. Bytes that cannot be converted
  # are replaced by '?'.
  def self.conv(charset, string)

    # return if string is false
    return string if !string

    # force_encoding/encode! work in place, so never touch a frozen string
    string = string.dup if string.frozen?

    # if no charset is given, use Latin-1 as default
    charset = charset ? charset.to_s : ''
    if charset.empty? || UNSPECIFIED_CHARSETS.include?( charset.upcase )
      charset = DEFAULT_CHARSET
    end

    # validate already existing utf8 strings
    return utf8_or_windows1252( string ) if UTF8_CHARSETS.include?( charset.downcase )

    # convert string from the declared charset
    begin
      string.encode!( 'UTF-8', charset.upcase, invalid: :replace, undef: :replace, replace: '?' )
    rescue Encoding::ConverterNotFoundError

      # unknown charset label, fall back to the lenient utf8 handling
      utf8_or_windows1252( string )
    end
  end

  # Return +string+ tagged as UTF-8 if its bytes are valid UTF-8, otherwise
  # reinterpret the bytes as Windows-1252 and transcode them to UTF-8.
  def self.utf8_or_windows1252(string)

    # return if encoding is valid
    return string if string.force_encoding('UTF-8').valid_encoding?

    begin

      # try to encode from Windows-1252 to utf8
      string.encode!( 'UTF-8', 'Windows-1252' )
    rescue EncodingError
      puts "Bad encoding: #{string.inspect}"

      # some bytes are undefined in Windows-1252, replace exactly those
      string.encode!( 'UTF-8', 'Windows-1252', invalid: :replace, undef: :replace, replace: '?' )
    end
  end
  private_class_method :utf8_or_windows1252
end