Moved the charset conversion (conv) into its own lib/encode.rb for UTF-8 input checks/validation.

This commit is contained in:
Martin Edenhofer 2012-12-05 02:27:56 +01:00
parent ccbf586ec7
commit 5a8f37dc6f
3 changed files with 39 additions and 42 deletions

View file

@ -1,42 +1,8 @@
# encoding: utf-8 # encoding: utf-8
require 'mail' require 'mail'
#require 'iconv'
class Channel::EmailParser class Channel::EmailParser
# Converts +string+ to UTF-8, interpreting its bytes as +charset+.
#
# charset - name of the source encoding (String or nil). nil, an empty
#           string, 'US-ASCII' and 'ASCII-8BIT' (any letter case) fall back
#           to ISO-8859-1.
# string  - the String to convert; it is modified in place. A nil/false
#           value is returned untouched.
#
# Returns the converted string (the same object that was passed in).
# Raises an EncodingError subclass when a non-UTF-8 +charset+ is unknown to
# Ruby or the bytes cannot be converted from it.
def conv(charset, string)
  # nothing to convert
  return string if !string

  # Normalise the charset name. Callers may hand in nil or "" (e.g. via
  # charset.to_s) when the mail declares no charset.
  charset = charset.to_s.strip.upcase

  # If no usable charset is given, assume Latin-1. 'ISO-8859-1' is the
  # canonical Ruby name; the iconv-style 'LATIN1' is not among Ruby's
  # registered encoding names.
  charset = 'ISO-8859-1' if charset.empty? || %w[US-ASCII ASCII-8BIT].include?(charset)

  # validate strings that already claim to be utf8
  if %w[UTF8 UTF-8].include?(charset)
    begin
      # return if encoding is valid
      string.force_encoding('UTF-8')
      return string if string.valid_encoding?

      # invalid utf8 is most likely Windows-1252, try to convert from that
      string.encode!('UTF-8', 'Windows-1252')
    rescue EncodingError
      # Some bytes are undefined in Windows-1252 as well: log the offending
      # string and replace every invalid byte with '?'.
      puts "Bad encoding: #{string.inspect}"
      string.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
    end
    return string
  end

  # convert string from the declared charset
  string.encode!('UTF-8', charset)
end
=begin =begin
@ -97,7 +63,7 @@ class Channel::EmailParser
# set all headers # set all headers
mail.header.fields.each { |field| mail.header.fields.each { |field|
data[field.name.downcase.to_sym] = conv( 'utf8', field.to_s ) data[field.name.downcase.to_sym] = Encode.conv( 'utf8', field.to_s )
} }
# set extra headers # set extra headers
@ -124,7 +90,7 @@ class Channel::EmailParser
# text attachment/body exists # text attachment/body exists
if mail.text_part if mail.text_part
data[:body] = mail.text_part.body.decoded data[:body] = mail.text_part.body.decoded
data[:body] = conv( mail.text_part.charset, data[:body] ) data[:body] = Encode.conv( mail.text_part.charset, data[:body] )
# html attachment/body may exist and will be converted to text # html attachment/body may exist and will be converted to text
else else
@ -132,7 +98,7 @@ class Channel::EmailParser
if mail.html_part.body if mail.html_part.body
filename = 'html-email' filename = 'html-email'
data[:body] = mail.html_part.body.to_s data[:body] = mail.html_part.body.to_s
data[:body] = conv( mail.html_part.charset.to_s, data[:body] ) data[:body] = Encode.conv( mail.html_part.charset.to_s, data[:body] )
data[:body] = html2ascii( data[:body] ) data[:body] = html2ascii( data[:body] )
# any other attachments # any other attachments
@ -219,7 +185,7 @@ class Channel::EmailParser
# text part # text part
if !mail.mime_type || mail.mime_type.to_s == '' || mail.mime_type.to_s.downcase == 'text/plain' if !mail.mime_type || mail.mime_type.to_s == '' || mail.mime_type.to_s.downcase == 'text/plain'
data[:body] = mail.body.decoded data[:body] = mail.body.decoded
data[:body] = conv( mail.charset, data[:body] ) data[:body] = Encode.conv( mail.charset, data[:body] )
# html part # html part
else else
@ -227,7 +193,7 @@ class Channel::EmailParser
if mail.mime_type.to_s.downcase == 'text/html' if mail.mime_type.to_s.downcase == 'text/html'
filename = 'html-email' filename = 'html-email'
data[:body] = mail.body.decoded data[:body] = mail.body.decoded
data[:body] = conv( mail.charset, data[:body] ) data[:body] = Encode.conv( mail.charset, data[:body] )
data[:body] = html2ascii( data[:body] ) data[:body] = html2ascii( data[:body] )
# any other attachments # any other attachments

31
lib/encode.rb Normal file
View file

@ -0,0 +1,31 @@
#require 'iconv'
# Helpers for bringing externally supplied text (mail headers/bodies, feed
# items, ...) into valid UTF-8.
class Encode
  # Converts +string+ to UTF-8, interpreting its bytes as +charset+.
  #
  # charset - name of the source encoding (String or nil). nil, an empty
  #           string, 'US-ASCII' and 'ASCII-8BIT' (any letter case) fall
  #           back to ISO-8859-1.
  # string  - the String to convert; it is modified in place. A nil/false
  #           value is returned untouched.
  #
  # Returns the converted string (the same object that was passed in).
  # Raises an EncodingError subclass when a non-UTF-8 +charset+ is unknown
  # to Ruby or the bytes cannot be converted from it.
  def self.conv(charset, string)
    # nothing to convert
    return string if !string

    # Normalise the charset name. Callers may hand in nil or "" (e.g. via
    # charset.to_s) when the source declares no charset.
    charset = charset.to_s.strip.upcase

    # If no usable charset is given, assume Latin-1. 'ISO-8859-1' is the
    # canonical Ruby name; the iconv-style 'LATIN1' is not among Ruby's
    # registered encoding names.
    charset = 'ISO-8859-1' if charset.empty? || %w[US-ASCII ASCII-8BIT].include?(charset)

    # validate strings that already claim to be utf8
    if %w[UTF8 UTF-8].include?(charset)
      begin
        # return if encoding is valid
        string.force_encoding('UTF-8')
        return string if string.valid_encoding?

        # invalid utf8 is most likely Windows-1252, try to convert from that
        string.encode!('UTF-8', 'Windows-1252')
      rescue EncodingError
        # Some bytes are undefined in Windows-1252 as well: log the
        # offending string and replace every invalid byte with '?'.
        puts "Bad encoding: #{string.inspect}"
        string.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
      end
      return string
    end

    # Convert from the declared charset. Without this step every non-UTF-8
    # charset would fall through and return nil.
    string.encode!('UTF-8', charset)
  end
end

View file

@ -16,8 +16,8 @@ module RSS
rss.items.each { |item| rss.items.each { |item|
record = { record = {
:id => item.id, :id => item.id,
:title => item.title, :title => Encode.conv( 'utf8', item.title ),
:summary => item.summary, :summary => Encode.conv( 'utf8', item.summary ),
:link => item.link, :link => item.link,
:published => item.published :published => item.published
} }