Refactor Channel::EmailParser#parse method

This commit is contained in:
Ryan Lue 2018-06-05 19:08:52 +08:00
parent 8951d3f1bd
commit c17a29cd0f
3 changed files with 317 additions and 441 deletions

View file

@ -3,6 +3,9 @@
# encoding: utf-8 # encoding: utf-8
class Channel::EmailParser class Channel::EmailParser
EMAIL_REGEX = /.+@.+/
RECIPIENT_FIELDS = %w[to cc delivered-to x-original-to envelope-to].freeze
SENDER_FIELDS = %w[from reply-to return-path].freeze
=begin =begin
@ -67,398 +70,16 @@ class Channel::EmailParser
=end =end
def parse(msg) def parse(msg)
data = {}.with_indifferent_access
mail = Mail.new(msg.utf8_encode) mail = Mail.new(msg.utf8_encode)
# set all headers message_attributes = [
mail.header.fields.each do |field| { mail_instance: mail },
message_header_hash(mail),
message_body_hash(mail),
self.class.sender_attributes(mail),
]
# full line, encode, ready for storage message_attributes.reduce({}.with_indifferent_access, &:merge)
begin
value = field.to_utf8
if value.blank?
value = field.raw_value
end
data[field.name.to_s.downcase.to_sym] = value
rescue => e
data[field.name.to_s.downcase.to_sym] = field.raw_value
end
# if we need to access the lines by objects later again
data["raw-#{field.name.downcase}".to_sym] = field
end
# verify content, ignore recipients with non email address
['to', 'cc', 'delivered-to', 'x-original-to', 'envelope-to'].each do |field|
next if data[field.to_sym].blank?
next if data[field.to_sym].match?(/@/)
data[field.to_sym] = ''
end
# get sender with @ / email address
from = nil
['from', 'reply-to', 'return-path'].each do |item|
next if data[item.to_sym].blank?
next if data[item.to_sym] !~ /@/
from = data[item.to_sym]
break if from
end
# in case of no sender with email address - get sender
if !from
['from', 'reply-to', 'return-path'].each do |item|
next if data[item.to_sym].blank?
from = data[item.to_sym]
break if from
end
end
# set x-any-recipient
data['x-any-recipient'.to_sym] = ''
['to', 'cc', 'delivered-to', 'x-original-to', 'envelope-to'].each do |item|
next if data[item.to_sym].blank?
if data['x-any-recipient'.to_sym] != ''
data['x-any-recipient'.to_sym] += ', '
end
data['x-any-recipient'.to_sym] += mail[item.to_sym].to_s
end
# set extra headers
data = data.merge(Channel::EmailParser.sender_properties(from))
# do extra encoding (see issue#1045)
if data[:subject].present?
data[:subject].sub!(/^=\?us-ascii\?Q\?(.+)\?=$/, '\1')
end
# compat headers
data[:message_id] = data['message-id'.to_sym]
# body
# plain_part = mail.multipart? ? (mail.text_part ? mail.text_part.body.decoded : nil) : mail.body.decoded
# html_part = message.html_part ? message.html_part.body.decoded : nil
data[:attachments] = []
# multi part email
if mail.multipart?
# html attachment/body may exists and will be converted to strict html
if mail.html_part&.body
data[:body] = mail.html_part.body.to_s
data[:body] = data[:body].utf8_encode(from: mail.html_part.charset, fallback: :read_as_sanitized_binary)
data[:body] = data[:body].html2html_strict
data[:content_type] = 'text/html'
end
# text attachment/body exists
if data[:body].blank? && mail.text_part
data[:body] = mail.text_part.body.decoded
data[:body] = data[:body].utf8_encode(from: mail.text_part.charset, fallback: :read_as_sanitized_binary)
data[:content_type] = 'text/plain'
end
# any other attachments
if data[:body].blank?
data[:body] = 'no visible content'
data[:content_type] = 'text/plain'
end
# add html attachment/body as real attachment
if mail.html_part
filename = 'message.html'
headers_store = {
'content-alternative' => true,
'original-format' => true,
}
if mail.mime_type
headers_store['Mime-Type'] = mail.html_part.mime_type
end
if mail.charset
headers_store['Charset'] = mail.html_part.charset
end
attachment = {
data: mail.html_part.body.to_s,
filename: mail.html_part.filename || filename,
preferences: headers_store
}
data[:attachments].push attachment
end
# get attachments
mail.parts&.each do |part|
# protect process to work fine with spam emails, see test/data/mail/mail015.box
begin
attachs = _get_attachment(part, data[:attachments], mail)
data[:attachments].concat(attachs)
rescue
attachs = _get_attachment(part, data[:attachments], mail)
data[:attachments].concat(attachs)
end
end
# not multipart email
# html part only, convert to text and add it as attachment
elsif mail.mime_type && mail.mime_type.to_s.casecmp('text/html').zero?
filename = 'message.html'
data[:body] = mail.body.decoded
data[:body] = data[:body].utf8_encode(from: mail.charset, fallback: :read_as_sanitized_binary)
data[:body] = data[:body].html2html_strict
data[:content_type] = 'text/html'
# add body as attachment
headers_store = {
'content-alternative' => true,
'original-format' => true,
}
if mail.mime_type
headers_store['Mime-Type'] = mail.mime_type
end
if mail.charset
headers_store['Charset'] = mail.charset
end
attachment = {
data: mail.body.decoded,
filename: mail.filename || filename,
preferences: headers_store
}
data[:attachments].push attachment
# text part only
elsif !mail.mime_type || mail.mime_type.to_s == '' || mail.mime_type.to_s.casecmp('text/plain').zero?
data[:body] = mail.body.decoded
data[:body] = data[:body].utf8_encode(from: mail.charset, fallback: :read_as_sanitized_binary)
data[:content_type] = 'text/plain'
else
filename = '-no name-'
data[:body] = 'no visible content'
data[:content_type] = 'text/plain'
# add body as attachment
headers_store = {
'content-alternative' => true,
}
if mail.mime_type
headers_store['Mime-Type'] = mail.mime_type
end
if mail.charset
headers_store['Charset'] = mail.charset
end
attachment = {
data: mail.body.decoded,
filename: mail.filename || filename,
preferences: headers_store
}
data[:attachments].push attachment
end
# strip not wanted chars
data[:body].gsub!(/\n\r/, "\n")
data[:body].gsub!(/\r\n/, "\n")
data[:body].tr!("\r", "\n")
# get mail date
begin
if mail.date
data[:date] = Time.zone.parse(mail.date.to_s)
end
rescue
data[:date] = nil
end
# remember original mail instance
data[:mail_instance] = mail
data
end
def _get_attachment(file, attachments, mail)
# check if sub parts are available
if file.parts.present?
list = []
file.parts.each do |p|
attachment = _get_attachment(p, attachments, mail)
list.concat(attachment)
end
return list
end
# ignore text/plain attachments - already shown in view
return [] if mail.text_part&.body.to_s == file.body.to_s
# ignore text/html - html part, already shown in view
return [] if mail.html_part&.body.to_s == file.body.to_s
# get file preferences
headers_store = {}
file.header.fields.each do |field|
# full line, encode, ready for storage
begin
value = field.to_utf8
if value.blank?
value = field.raw_value
end
headers_store[field.name.to_s] = value
rescue => e
headers_store[field.name.to_s] = field.raw_value
end
end
# cleanup content id, <> will be added automatically later
headers_store['Content-ID']&.gsub!(/(^<|>$)/, '')
# get filename from content-disposition
filename = nil
# workaround for: NoMethodError: undefined method `filename' for #<Mail::UnstructuredField:0x007ff109e80678>
begin
filename = file.header[:content_disposition].filename
rescue
begin
if file.header[:content_disposition].to_s =~ /filename="(.+?)"/i
filename = $1
elsif file.header[:content_disposition].to_s =~ /filename='(.+?)'/i
filename = $1
elsif file.header[:content_disposition].to_s =~ /filename=(.+?);/i
filename = $1
end
rescue
Rails.logger.debug { 'Unable to get filename' }
end
end
# as fallback, use raw values
if filename.blank?
if headers_store['Content-Disposition'].to_s =~ /filename="(.+?)"/i
filename = $1
elsif headers_store['Content-Disposition'].to_s =~ /filename='(.+?)'/i
filename = $1
elsif headers_store['Content-Disposition'].to_s =~ /filename=(.+?);/i
filename = $1
end
end
# for some broken sm mail clients (X-MimeOLE: Produced By Microsoft Exchange V6.5)
filename ||= file.header[:content_location].to_s
# generate file name based on content-id
if filename.blank? && headers_store['Content-ID'].present?
if headers_store['Content-ID'] =~ /(.+?)@.+?/i
filename = $1
end
end
# generate file name based on content type
if filename.blank? && headers_store['Content-Type'].present?
if headers_store['Content-Type'].match?(%r{^message/rfc822}i)
begin
parser = Channel::EmailParser.new
mail_local = parser.parse(file.body.to_s)
filename = if mail_local[:subject].present?
"#{mail_local[:subject]}.eml"
elsif headers_store['Content-Description'].present?
"#{headers_store['Content-Description']}.eml".to_s.force_encoding('utf-8')
else
'Mail.eml'
end
rescue
filename = 'Mail.eml'
end
end
# e. g. Content-Type: video/quicktime; name="Video.MOV";
if filename.blank?
['name="(.+?)"(;|$)', "name='(.+?)'(;|$)", 'name=(.+?)(;|$)'].each do |regexp|
if headers_store['Content-Type'] =~ /#{regexp}/i
filename = $1
break
end
end
end
# e. g. Content-Type: video/quicktime
if filename.blank?
map = {
'message/delivery-status': ['txt', 'delivery-status'],
'text/plain': %w[txt document],
'text/html': %w[html document],
'video/quicktime': %w[mov video],
'image/jpeg': %w[jpg image],
'image/jpg': %w[jpg image],
'image/png': %w[png image],
'image/gif': %w[gif image],
}
map.each do |type, ext|
next if headers_store['Content-Type'] !~ /^#{Regexp.quote(type)}/i
filename = if headers_store['Content-Description'].present?
"#{headers_store['Content-Description']}.#{ext[0]}".to_s.force_encoding('utf-8')
else
"#{ext[1]}.#{ext[0]}"
end
break
end
end
end
if filename.blank?
filename = 'file'
end
attachment_count = 0
local_filename = ''
local_extention = ''
if filename =~ /^(.*?)\.(.+?)$/
local_filename = $1
local_extention = $2
end
(1..1000).each do |count|
filename_exists = false
attachments.each do |attachment|
if attachment[:filename] == filename
filename_exists = true
end
end
break if filename_exists == false
filename = if local_extention.present?
"#{local_filename}#{count}.#{local_extention}"
else
"#{local_filename}#{count}"
end
end
# get mime type
if file.header[:content_type]&.string
headers_store['Mime-Type'] = file.header[:content_type].string
end
# get charset
if file.header&.charset
headers_store['Charset'] = file.header.charset
end
# remove not needed header
headers_store.delete('Content-Transfer-Encoding')
headers_store.delete('Content-Disposition')
# workaround for mail gem
# https://github.com/zammad/zammad/issues/928
filename = Mail::Encodings.value_decode(filename)
attach = {
data: file.body.to_s,
filename: filename,
preferences: headers_store,
}
[attach]
end end
=begin =begin
@ -708,45 +329,51 @@ returns
true true
end end
def self.sender_properties(from) def self.sender_attributes(from)
data = {} if from.is_a?(Mail::Message)
return data if from.blank? from = SENDER_FIELDS.map { |f| from.header[f] }.compact
begin .map(&:to_utf8).reject(&:blank?)
list = Mail::AddressList.new(from) .partition { |address| address.match?(EMAIL_REGEX) }
list.addresses.each do |address| .flatten.first
data[:from_email] = address.address
data[:from_local] = address.local
data[:from_domain] = address.domain
data[:from_display_name] = address.display_name ||
(address.comments && address.comments[0])
break if data[:from_email].present? && data[:from_email] =~ /@/
end
rescue => e
if from =~ /<>/ && from =~ /<.+?>/
data = sender_properties(from.gsub(/<>/, ''))
end
end end
if data.blank? || data[:from_email].blank? data = {}.with_indifferent_access
from.strip! return data if from.blank?
if from =~ /^(.+?)<(.+?)@(.+?)>$/
data[:from_email] = "#{$2}@#{$3}" from = from.gsub('<>', '').strip
data[:from_local] = $2 mail_address = begin
data[:from_domain] = $3 Mail::AddressList.new(from).addresses
.select { |a| a.address.present? }
.partition { |a| a.address.match?(EMAIL_REGEX) }
.flatten.first
rescue Mail::Field::ParseError => e
STDOUT.puts e
end
if mail_address&.address.present?
data[:from_email] = mail_address.address
data[:from_local] = mail_address.local
data[:from_domain] = mail_address.domain
data[:from_display_name] = mail_address.display_name || mail_address.comments&.first
elsif from =~ /^(.+?)<((.+?)@(.+?))>$/
data[:from_email] = $2
data[:from_local] = $3
data[:from_domain] = $4
data[:from_display_name] = $1 data[:from_display_name] = $1
else else
data[:from_email] = from data[:from_email] = from
data[:from_local] = from data[:from_local] = from
data[:from_domain] = from data[:from_domain] = from
end data[:from_display_name] = from
end end
# do extra decoding because we needed to use field.value # do extra decoding because we needed to use field.value
data[:from_display_name] = Mail::Field.new('X-From', data[:from_display_name].to_utf8).to_s data[:from_display_name] =
data[:from_display_name].delete!('"') Mail::Field.new('X-From', data[:from_display_name].to_utf8)
data[:from_display_name].strip! .to_s
data[:from_display_name].gsub!(/^'/, '') .delete('"')
data[:from_display_name].gsub!(/'$/, '') .strip
.gsub(/(^'|'$)/, '')
data data
end end
@ -835,6 +462,270 @@ process unprocessable_mails (tmp/unprocessable_mail/*.eml) again
files files
end end
private
# Builds the header portion of the parsed-message hash.
#
# The result (an indifferent-access hash) contains, merged in this order:
#   1. every header, keyed by its downcased name, with its UTF-8 value
#   2. every header field object, keyed by "raw-<downcased name>"
#   3. derived entries: validated recipient fields, 'date', 'message_id',
#      'subject' and 'x-any-recipient'
#
# @param mail [Mail::Message] the parsed message
# @return [ActiveSupport::HashWithIndifferentAccess]
def message_header_hash(mail)
  decoded = {}
  raw = {}

  mail.header.fields.each do |field|
    decoded[field.name.downcase] = begin
      field.to_utf8
    rescue NameError # handle bug #1238 in Mail 2.7.1.rc1
      ''             # drop the rescue once a fixed Mail release is available
    end
  end
  mail.header.fields.each { |field| raw["raw-#{field.name.downcase}"] = field }

  # recipient headers that do not look like an email address are blanked out
  recipients = RECIPIENT_FIELDS.each_with_object({}) do |name, validated|
    next unless decoded.key?(name)

    value = decoded[name]
    validated[name] = value.match?(EMAIL_REGEX) ? value : ''
  end

  derived = recipients.merge(
    # fall back to the raw header text when the date cannot be parsed
    'date'            => Time.zone.parse(mail.date.to_s) || decoded['date'],
    # compat key (underscore instead of dash)
    'message_id'      => decoded['message-id'],
    # strip a left-over us-ascii encoded-word wrapper (see issue #1045)
    'subject'         => decoded['subject']&.sub(/^=\?us-ascii\?Q\?(.+)\?=$/, '\1'),
    'x-any-recipient' => recipients.values.select(&:present?).join(', ')
  )

  {}.with_indifferent_access.merge(decoded).merge(raw).merge(derived)
end
# Builds the body portion of the parsed-message hash.
#
# The first of html part, text part and the message itself that has a
# non-empty body is used as the visible body. Only text/plain and text/html
# (or a missing mime type, treated as text/plain) are rendered; anything else
# results in the placeholder body 'no visible content'.
#
# A message in which no part has a body at all (e.g. a completely empty mail)
# also yields the placeholder instead of raising NoMethodError on nil.
#
# @param mail [Mail::Message] the parsed message
# @return [ActiveSupport::HashWithIndifferentAccess] with the keys
#   :attachments, :content_type ('text/plain' or 'text/html') and :body
def message_body_hash(mail)
  message = [mail.html_part, mail.text_part, mail].find { |m| m&.body.present? }

  # message is nil when every candidate body is empty - skip rendering then
  if message && (message.mime_type.nil? || message.mime_type.match?(%r{^text/(plain|html)$}))
    content_type = message.mime_type || 'text/plain'
    body = body_text(message, strict_html: content_type.eql?('text/html'))
  end

  # the placeholder text below is always plain text
  content_type = 'text/plain' if body.blank?

  {
    attachments:  collect_attachments(mail),
    content_type: content_type || 'text/plain',
    body:         body.presence || 'no visible content'
  }.with_indifferent_access
end
# Returns the body of +message+ as UTF-8 text with LF line endings.
#
# @param message [#body, #charset] a message or message part
# @param options [Hash] pass strict_html: true to additionally run the
#   text through String#html2html_strict
# @return [String]
def body_text(message, **options)
  source = begin
    message.body.to_s
  rescue Mail::UnknownEncodingType # see test/data/mail/mail043.box / issue #348
    # undecodable transfer encoding: fall back to the undecoded source
    message.body.raw_source
  end

  text = Mail::Utilities.to_lf(
    source.utf8_encode(from: message.charset, fallback: :read_as_sanitized_binary)
  )

  options[:strict_html] ? text.html2html_strict : text
end
# Collects all attachments of +mail+.
#
# The message body itself is stored as the first attachment when it is an
# html part, or when the message is not multipart and has a mime type other
# than text/plain. After that, every part of the message is handed to
# get_attachments and the results are appended.
#
# @param mail [Mail::Message] the parsed message
# @return [Array<Hash>] hashes with the keys :data, :filename, :preferences
def collect_attachments(mail)
  collected = []

  body_is_attachment =
    mail.html_part&.body.present? ||
    (!mail.multipart? && mail.mime_type.present? && mail.mime_type != 'text/plain')

  if body_is_attachment
    source = mail.html_part || mail
    html = source.mime_type.eql?('text/html')

    # blank values (including original-format => false) are not stored
    preferences = {
      'content-alternative' => true,
      'original-format'     => html,
      'Mime-Type'           => source.mime_type,
      'Charset'             => source.charset,
    }.reject { |_, value| value.blank? }

    collected << {
      data:        body_text(source),
      filename:    source.filename.presence || (html ? 'message.html' : '-no name-'),
      preferences: preferences,
    }
  end

  mail.parts.each do |part|
    retried = false
    begin
      collected.concat(get_attachments(part, collected, mail).flatten.compact)
    rescue => e # Protect process to work with spam emails (see test/fixtures/mail15.box)
      # each part gets exactly one second attempt before the error is passed on
      raise e if retried

      retried = true
      retry
    end
  end

  collected
end
# Builds the attachment entry for one MIME part of a message.
#
# file        - the part to convert (its parts, header and body are read)
# attachments - attachment hashes collected so far; only their :filename is
#               read, to avoid handing out the same filename twice
# mail        - the enclosing message; its text_part / html_part are skipped
#
# Returns [] for skipped parts, a one-element array holding a hash with the
# keys :data, :filename and :preferences for a leaf part, or - for a part that
# itself has parts - a nested array of the results for each sub-part.
def get_attachments(file, attachments, mail)
# container part: recurse into every sub-part (result is nested, not flattened here)
return file.parts.map { |p| get_attachments(p, attachments, mail) } if file.parts.any?
# the message's own text/html part is not an attachment
return [] if [mail.text_part, mail.html_part].include?(file)
# get file preferences
headers_store = {}
file.header.fields.each do |field|
# full line, encode, ready for storage
begin
value = field.to_utf8
# fall back to the raw header value when the decoded value is blank
if value.blank?
value = field.raw_value
end
headers_store[field.name.to_s] = value
rescue => e
# decoding failed: store the raw header value instead
headers_store[field.name.to_s] = field.raw_value
end
end
# cleanup content id, <> will be added automatically later
if headers_store['Content-ID']
headers_store['Content-ID'].gsub!(/^</, '')
headers_store['Content-ID'].gsub!(/>$/, '')
end
# get filename from content-disposition
# workaround for: NoMethodError: undefined method `filename' for #<Mail::UnstructuredField:0x007ff109e80678>
filename = file.header[:content_disposition].try(:filename)
# NOTE(review): this lookup runs unconditionally, so a regex match replaces a
# filename already returned by .try(:filename) above - confirm this is intended
begin
if file.header[:content_disposition].to_s =~ /filename="(.+?)"/i
filename = $1
elsif file.header[:content_disposition].to_s =~ /filename='(.+?)'/i
filename = $1
elsif file.header[:content_disposition].to_s =~ /filename=(.+?);/i
filename = $1
end
rescue
Rails.logger.debug { 'Unable to get filename' }
end
# as fallback, use raw values
if filename.blank?
if headers_store['Content-Disposition'].to_s =~ /filename="(.+?)"/i
filename = $1
elsif headers_store['Content-Disposition'].to_s =~ /filename='(.+?)'/i
filename = $1
elsif headers_store['Content-Disposition'].to_s =~ /filename=(.+?);/i
filename = $1
end
end
# for some broken sm mail clients (X-MimeOLE: Produced By Microsoft Exchange V6.5)
# (only applied while filename is still nil; may leave filename as an empty string)
filename ||= file.header[:content_location].to_s
# generate file name based on content-id
if filename.blank? && headers_store['Content-ID'].present?
# use the portion of the Content-ID in front of the "@"
if headers_store['Content-ID'] =~ /(.+?)@.+?/i
filename = $1
end
end
# generate file name based on content type
if filename.blank? && headers_store['Content-Type'].present?
# attached email: name it after its subject, else its Content-Description, else 'Mail.eml'
if headers_store['Content-Type'].match?(%r{^message/rfc822}i)
begin
parser = Channel::EmailParser.new
mail_local = parser.parse(file.body.to_s)
filename = if mail_local[:subject].present?
"#{mail_local[:subject]}.eml"
elsif headers_store['Content-Description'].present?
"#{headers_store['Content-Description']}.eml".to_s.force_encoding('utf-8')
else
'Mail.eml'
end
rescue
# the attached email could not be parsed
filename = 'Mail.eml'
end
end
# e. g. Content-Type: video/quicktime; name="Video.MOV";
if filename.blank?
# patterns are tried in order: double-quoted, single-quoted, unquoted
['name="(.+?)"(;|$)', "name='(.+?)'(;|$)", 'name=(.+?)(;|$)'].each do |regexp|
if headers_store['Content-Type'] =~ /#{regexp}/i
filename = $1
break
end
end
end
# e. g. Content-Type: video/quicktime
if filename.blank?
# content type prefix => [file extension, default base name]
map = {
'message/delivery-status': ['txt', 'delivery-status'],
'text/plain': %w[txt document],
'text/html': %w[html document],
'video/quicktime': %w[mov video],
'image/jpeg': %w[jpg image],
'image/jpg': %w[jpg image],
'image/png': %w[png image],
'image/gif': %w[gif image],
}
map.each do |type, ext|
next if headers_store['Content-Type'] !~ /^#{Regexp.quote(type)}/i
# prefer the Content-Description as base name when there is one
filename = if headers_store['Content-Description'].present?
"#{headers_store['Content-Description']}.#{ext[0]}".to_s.force_encoding('utf-8')
else
"#{ext[1]}.#{ext[0]}"
end
break
end
end
end
# last resort when no name could be derived at all
if filename.blank?
filename = 'file'
end
# split into the text before the first dot and everything after it
local_filename = ''
local_extention = ''
if filename =~ /^(.*?)\.(.+?)$/
local_filename = $1
local_extention = $2
end
# make the filename unique among the attachments collected so far by adding a
# counter (at most 1000 candidates are tried)
# NOTE(review): for a filename without a dot local_filename stays '', so the
# replacement name is just the counter ("1", "2", ...) - confirm this is intended
1.upto(1000) do |i|
filename_exists = false
attachments.each do |attachment|
if attachment[:filename] == filename
filename_exists = true
end
end
break if filename_exists == false
filename = if local_extention.present?
"#{local_filename}#{i}.#{local_extention}"
else
"#{local_filename}#{i}"
end
end
# get mime type
if file.header[:content_type]&.string
headers_store['Mime-Type'] = file.header[:content_type].string
end
# get charset
if file.header&.charset
headers_store['Charset'] = file.header.charset
end
# remove not needed header
headers_store.delete('Content-Transfer-Encoding')
headers_store.delete('Content-Disposition')
# workaround for mail gem
# https://github.com/zammad/zammad/issues/928
filename = Mail::Encodings.value_decode(filename)
attach = {
data: file.body.to_s,
filename: filename,
preferences: headers_store,
}
# always an array, so callers can treat leaf and container parts alike
[attach]
end
end end
module Mail module Mail
@ -848,7 +739,7 @@ module Mail
end end
end end
# workaround to parse subjects with 2 different encodings correctly (e. g. quoted-printable see test/data/mail/mail009.box) # workaround to parse subjects with 2 different encodings correctly (e. g. quoted-printable see test/fixtures/mail9.box)
module Encodings module Encodings
def self.value_decode(str) def self.value_decode(str)
# Optimization: If there's no encoded-words in the string, just return it # Optimization: If there's no encoded-words in the string, just return it
@ -883,19 +774,4 @@ module Mail
end.join('') end.join('')
end end
end end
# issue#348 - IMAP mail fetching stops because of broken spam email (e. g. broken Content-Transfer-Encoding value see test/data/mail/mail043.box)
# https://github.com/zammad/zammad/issues/348
class Body
def decoded
if !Encodings.defined?(encoding)
#raise UnknownEncodingType, "Don't know how to decode #{encoding}, please call #encoded and decode it yourself."
Rails.logger.info "UnknownEncodingType: Don't know how to decode #{encoding}!"
raw_source
else
Encodings.get_encoding(encoding).decode(raw_source)
end
end
end
end end

View file

@ -19,7 +19,7 @@ module Channel::Filter::ReplyToBasedSender
mail['origin_from_display_name'.to_sym] = mail[:from_display_name] mail['origin_from_display_name'.to_sym] = mail[:from_display_name]
# get properties of reply-to header # get properties of reply-to header
result = Channel::EmailParser.sender_properties(reply_to) result = Channel::EmailParser.sender_attributes(reply_to)
if setting == 'as_sender_of_email' if setting == 'as_sender_of_email'

View file

@ -2,4 +2,4 @@
from: marketingmanager@nthcpghana.com from: marketingmanager@nthcpghana.com
from_email: marketingmanager@nthcpghana.com from_email: marketingmanager@nthcpghana.com
from_display_name: '' from_display_name: ''
to: to: ''