Improved email parsing.

This commit is contained in:
Martin Edenhofer 2012-05-08 00:37:07 +02:00
parent 63feba3e57
commit 21b244bc08
5 changed files with 303 additions and 6 deletions

View file

@ -2,7 +2,7 @@ require 'mail'
require 'iconv'
class Channel::EmailParser
def conv (charset, string)
if charset == 'US-ASCII' || charset == 'ASCII-8BIT'
if !charset || charset == 'US-ASCII' || charset == 'ASCII-8BIT'
charset = 'LATIN1'
end
return string if charset.downcase == 'utf8' || charset.downcase == 'utf-8'
@ -25,7 +25,8 @@ class Channel::EmailParser
data[:from_email] = Mail::Address.new( mail[:from].value ).address
data[:from_local] = Mail::Address.new( mail[:from].value ).local
data[:from_domain] = Mail::Address.new( mail[:from].value ).domain
data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name
data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name ||
( Mail::Address.new( mail[:from].value ).comments && Mail::Address.new( mail[:from].value ).comments[0] )
# do extra decoding because we needed to use field.value
data[:from_display_name] = Mail::Field.new( 'X-From', data[:from_display_name] ).to_s
@ -36,17 +37,50 @@ class Channel::EmailParser
# body
# plain_part = mail.multipart? ? (mail.text_part ? mail.text_part.body.decoded : nil) : mail.body.decoded
# html_part = message.html_part ? message.html_part.body.decoded : nil
data[:attachments] = []
if mail.multipart?
data[:plain_part] = mail.text_part.body.decoded
data[:plain_part] = conv( mail.text_part.charset || 'LATIN1', data[:plain_part] )
data[:plain_part] = conv( mail.text_part.charset, data[:plain_part] )
else
# text part
if !mail.mime_type || mail.mime_type.to_s == '' || mail.mime_type.to_s.downcase == 'text/plain'
data[:plain_part] = mail.body.decoded
data[:plain_part] = conv( mail.body.charset || 'LATIN1', data[:plain_part] )
data[:plain_part] = conv( mail.charset, data[:plain_part] )
else
# html part
filename = '-no name-'
if mail.mime_type.to_s.downcase == 'text/html'
filename = 'html-email'
data[:plain_part] = mail.body.decoded
data[:plain_part] = conv( mail.charset, data[:plain_part] )
data[:plain_part] = html2ascii( data[:plain_part] )
# any other attachments
else
data[:plain_part] = 'no visible content'
end
# add body as attachment
headers_store = {}
if mail.mime_type
headers_store['Mime-Type'] = mail.mime_type
end
if mail.charset
headers_store['Charset'] = mail.charset
end
attachment = {
:data => mail.body.decoded,
:filename => mail.filename || filename,
:preferences => headers_store
}
data[:attachments].push attachment
end
end
# attachments
if mail.attachments
data[:attachments] = []
mail.attachments.each do |attachment|
# get file preferences
@ -232,4 +266,57 @@ class Channel::EmailParser
# return new objects
return ticket, article, user
end
# Convert an HTML fragment into plain text.
#
# * <a ... href="..."> tags are replaced by numbered markers ("[1]", "[2]", ...)
#   and the link targets are appended as a list at the end of the text.
# * All remaining tags are removed, including tags spanning several lines.
# * Numeric character references, decimal ("&#8211;") and hexadecimal
#   ("&#x3d;"), are decoded. Named entities such as "&amp;" are left as is.
#
# string - the HTML to convert (nil is treated as an empty string).
#
# Returns a new String; the argument itself is never modified.
def html2ascii(string)
  text  = string.to_s.dup
  links = []

  # Turn a code point into a character in the encoding of the text. If the
  # code point is out of range, not a valid character, or cannot be expressed
  # in that encoding, the original entity text is kept instead of raising.
  decode = lambda do |codepoint, entity|
    begin
      char = [codepoint].pack('U').encode(text.encoding)
      char.valid_encoding? ? char : entity
    rescue RangeError, EncodingError
      entity
    end
  end

  # find <a href=....> and replace it with [x]; the closing quote has to be
  # the same character as the opening one
  text.gsub!(/<a\s.*?href=(["'])(.+?)\1.*?>/i) do
    links << Regexp.last_match(2)
    "[#{links.size}]"
  end

  # remove leading whitespace and empty lines
  text.gsub!(/^\s*/, '')

  # fix some bad stuff from opera and others
  text.gsub!(/\n\r|\r\r\n|\r\n/, "\n")

  # strip all other tags; /m lets "." match newlines (in Ruby the /s flag
  # selects the Windows-31J encoding and is not a dot-all switch)
  text.gsub!(/<.+?>/m, '')

  # decode numeric entities like "&#8211;" and "&#x3d;" in a single pass, so
  # that the output of one replacement is never decoded a second time
  text.gsub!(/&#(?:[xX]([0-9a-fA-F]+)|(\d+));?/) do
    match     = Regexp.last_match
    codepoint = match[1] ? match[1].hex : match[2].to_i
    decode.call(codepoint, match[0])
  end

  # remove empty lines
  text.gsub!(/^\s*\n\s*\n/, "\n")

  # add extracted links, only if there are any
  unless links.empty?
    text << "\n\n"
    links.each_with_index do |link, index|
      text << "[#{index + 1}] #{link}\n"
    end
  end

  text
end
end

38
test/fixtures/mail4.box vendored Normal file
View file

@ -0,0 +1,38 @@
From k.guenther@example.com Mon May 7 15:08:10 2012
Return-Path: <k.guenther@example.com>
X-Original-To: support@example.com
Delivered-To: box@samba.example.com
X-Greylist: delayed 355 seconds by postgrey-1.32 at samba; Mon, 07 May 2012 15:08:09 BST
Received: from smtprelay05.example.com (smtprelay05.example.com [8.6.3.9])
by samba.example.com (Postfix) with ESMTP id 011F9500D3D
for <support@example.com>; Mon, 7 May 2012 15:08:09 +0100 (BST)
Received: from [1.1.0.7] (helo=exchange.df.eu)
by smtprelay05.example.com with esmtps (TLSv1:RC4-MD5:128)
(Exim 4.68)
(envelope-from <k.guenther@example.com>)
id 1SROW2-0007tk-QP
for support@example.com; Mon, 07 May 2012 16:02:18 +0200
Received: from ECCR04PUBLIC.exchange.local ([1.1.2.4]) by
efe04.exchange.local ([1.1.0.7]) with mapi; Mon, 7 May 2012 15:58:33 +0200
From: =?utf-8?B?R8O8bnRoZXIgS2F0amEgfCBFeGFtcGxlIEdtYkg=?=
<k.guenther@example.com>
To: Martin Edenhofer via Znuny Team <support@example.com>
Date: Mon, 7 May 2012 15:58:32 +0200
Subject: AW: Ticket Templates [Ticket#11168]
Thread-Topic: Ticket Templates [Ticket#11168]
Thread-Index: Ac0sGqTnvktNHx1lQoaTDcVI7lUxJQAPqvXA
Message-ID: <F799DA4E63A20B4EBE9D5A412196D71D3CADBEA04E@ECCR04PUBLIC.exchange.local>
References: <F799DA4E63A20B4EBE9D5A412196D71D3CADBE9DF6@ECCR04PUBLIC.exchange.local>
<20120507062840.265.107538@portal.example.com>
In-Reply-To: <20120507062840.265.107538@portal.example.com>
Accept-Language: de-DE
Content-Language: de-DE
X-MS-Has-Attach:
X-MS-TNEF-Correlator:
acceptlanguage: de-DE
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
MIME-Version: 1.0
SGFsbG8gS2F0amEsCgpzdXBlciEgSWNoIGZyZXUgbWljaCEKCldpciB3w7xyZGVuIGdlcm5lIGRpZSBQcsOkc2VudGF0aW9uL0VpbmbDvGhydW5nIGluIGRpZSBUaWNrZXQgVGVtcGxhdGVzIHBlciBTY3JlZW5zaGFyaW5nIG9kZXIgenVtaW5kZXN0IHBlciBUZWxlZm9uIG1hY2hlbi4KCk3DtmdsaWNoZSBUZXJtaW5lOgpvIERvLCAxMC4wNS4yMDEyIDE1OjAwLTE2OjAwCm8gRnIsICAxMS4wNS4yMDEyIDEzOjAwLTE0OjAwCm8gRGksICAxNS4wNS4yMDEyIDE3OjAwLTE4OjAwCgrDnGJlciBGZWVkYmFjayB3w7xyZGUgaWNoIG1pY2ggZnJldWVuIQoKUFM6IFp1ciBiZXNzZXJlbiDDnGJlcnNpY2h0IGhhYmUgaWNoIGVpbiBUaWNrZXQgZXJzdGVsbHQuIDopIEltIEZvb3RlciBzaW5kIHVuc2VyZSBnZXNjaMOkZnRsaWNoZW4gS29udGFrdGRhdGVuIChmYWxscyBkaWVzZSBpcmdlbmR3YW5uIGVpbm1hbCBiZW7DtnRpZ3Qgd2VyZGVuIHNvbGx0ZW4pLCBtZWhyIGRhenUgaW4gZWluIHBhYXIgVGFnZW4uCgpMaWViZSBHcsO8w59lIQoKIC1NYXJ0aW4KCgo

76
test/fixtures/mail5.box vendored Normal file
View file

@ -0,0 +1,76 @@
From marc.smith@example.com Mon May 7 07:45:48 2012
Return-Path: <marc.smith@example.com>
X-Original-To: support@znuny.com
Delivered-To: box@samba.example.com
Received: from mailout-de.example.com (mailout-de.example.com [2.1.6.2])
by samba.example.com (Postfix) with SMTP id F1C9E500D3D
for <support@znuny.com>; Mon, 7 May 2012 07:45:47 +0100 (BST)
Received: (qmail invoked by alias); 07 May 2012 06:45:48 -0000
Received: from unknown (EHLO [1.2.1.2]) [7.3.2.1]
by mail.example.com (mp072) with SMTP; 07 May 2012 08:45:48 +0200
X-Authenticated: #69078992
X-Provags-ID: V01U2FsdGVkX1+IkUVPK6GIbZ2ezhmZfpCU0OVlFkuyPGDNsL0V5H
FxvJdecWb4ibKL
Message-ID: <4FA76F9A.3060602@example.com>
Date: Mon, 07 May 2012 08:45:46 +0200
From: marc.smith@example.com (Marc Smith)
User-Agent: Mozilla/5.0 (Windows NT 6.0; WOW64; rv:12.0) Gecko/20120428 Thunderbird/12.0.1
MIME-Version: 1.0
To: Martin Edenhofer via Znuny Team <support@znuny.com>
Subject: Re: XXXX Betatest Ticket Templates [Ticket#11162]
References: <20120507061007.259.822311@portal.znuny.com>
In-Reply-To: <20120507061007.259.822311@portal.znuny.com>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 8bit
X-Y-GMX-Trusted: 0
Status: RO
Content-Length: 1418
Lines: 46
Am 07.05.2012 08:10, schrieb Martin Edenhofer via Znuny Team:
> Hallo Marc,
>
> super! Ich freu mich!
>
> Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen.
>
> Mögliche Termine:
> o Do, 10.05.2012 11:00-12:00
> o Fr, 11.05.2012 09:00-10:00
> o Di, 15.05.2012 14:00-15:00
>
> Über Feedback würde ich mich freuen!
>
> PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen.
>
> Liebe Grüße!
>
> -Martin
>
> --
> Martin Edenhofer
>
> Znuny GmbH // Marienstraße 11 // 10117 Berlin // Germany
>
> P: +49 (0) 30 60 98 54 18-0
> F: +49 (0) 30 60 98 54 18-8
>
> Location: Berlin - HRB 139852 B Amtsgericht Berlin-Charlottenburg
> Managing Director: Martin Edenhofer
Hallo Martin,
John und ich könnten leider nur am Freitag, da wir Donnerstag und nächste
Woche bereits Termine haben.
Wir würden uns dann den Freitag vormerken...;-)
N Screensharing ist bei uns leider nicht so ohne Probleme möglich, bzw.
wir könnten einen PC aufsetzen mit nem seperaten Internetzugang auf dem
wir ne VM vorbereiten könnten, da wir von dem "Internet PC" nicht auf
unser XXXX zugreifen können. Falls ihr sonst noch irgendwas benötigt
einfach kurz ne Rückmeldung...;-)
Grüße aus Bonn
John & Marc

32
test/fixtures/mail6.box vendored Normal file
View file

@ -0,0 +1,32 @@
From me@bogen.net Sat Sep 13 16:50:43 2003
Return-Path: <me@bogen.net>
Received: from airoma.example (law10-f30.law10.airoma.example [4.4.4.4]) by esanta.edenhofer.de (Postfix) with ESMTP id 2307484296 for <demo@exampel.com>; Sat, 13 Sep 2003 16:50:43 +0200 (CEST)
Received: from mail pickup service by airoma.example with Mc SMTPSVC; Sat, 13 Sep 2003 07:37:26 -0700
Received: from 11.11.11.11 by lw10fd.law10.com with HTTP; Sat, 13 Sep 2003 14:37:26 GMT
X-Originating-Ip: [5.5.5.5]
X-Originating-Email: [me@example.com]
From: =?Windows-1252?Q?Hans_B=C4KO?= =?iso-8859-15?q?Sch=F6nland?= <me@bogen.net>
To: =?iso-8859-2?Q?Namedy=F1ski?= (hans@example.com)
Subject: utf8: =?UTF-8?Q?=E4=BD=BF=E3=81=A3=E3=81=A6?= / ISO-8859-1: =?iso-8859-1?Q?Priorit=E4t=22_?= / cp-1251: =?windows-1251?B?0eXw4+XpINPj6+j26uj1?=
Date: Sat, 13 Sep 2003 10:37:26 -0400
MIME-Version: 1.0
Content-Type: text/html; charset="iso-8859-15"; format=flowed
Message-Id: <Law10-F30dRmhKuTqtA00018823@coolair.example>
X-Originalarrivaltime: 13 Sep 2003 14:37:26.0630 (UTC) FILETIME=[8D57B860:01C37A04]
<html><div style='background-color:'><P>this is a test</P></div><br clear=all><hr> <a href="http://localhost/8HMZENUS/2737??PS=">Compare Cable, DSL or Satellite plans: As low as $2.95. </a>
<br>
<br>
Test1:&#8211;
<br>
Test2:&amp;
<br>
Test3:&ni;
<br>
Test4:&amp;
<br>
Test5:&#x3d;
</html>

View file

@ -35,6 +35,70 @@ class EmailParserTest < ActiveSupport::TestCase
:subject => 'Ticket Templates',
},
},
{
:data => IO.read('test/fixtures/mail4.box'),
:body_md5 => '2f2c3a5c233dbd9658ab37d39469b7d0',
:params => {
:from => '"Günther Katja | Example GmbH" <k.guenther@example.com>',
:from_email => 'k.guenther@example.com',
:from_display_name => 'Günther Katja | Example GmbH',
:subject => 'AW: Ticket Templates [Ticket#11168]',
:plain_part => "Hallo Katja,
super! Ich freu mich!
Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen.
Mögliche Termine:
o Do, 10.05.2012 15:00-16:00
o Fr, 11.05.2012 13:00-14:00
o Di, 15.05.2012 17:00-18:00
Über Feedback würde ich mich freuen!
PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen.
Liebe Grüße!
-Martin
",
},
},
{
:data => IO.read('test/fixtures/mail5.box'),
:body_md5 => '51364a306362f513f53f2bbea7820f37',
:params => {
:from => 'marc.smith@example.com (Marc Smith)',
:from_email => 'marc.smith@example.com',
:from_display_name => 'Marc Smith',
:subject => 'Re: XXXX Betatest Ticket Templates [Ticket#11162]',
},
},
{
:data => IO.read('test/fixtures/mail6.box'),
:body_md5 => '1fc492b8d762d82f861dbb70b7cf7610',
:params => {
:from => '"Hans BÄKOSchönland" <me@bogen.net>',
:from_email => 'me@bogen.net',
:from_display_name => 'Hans BÄKOSchönland',
:subject => 'utf8: 使って / ISO-8859-1: Priorität" / cp-1251: Сергей Углицких',
:plain_part => "this is a test [1]Compare Cable, DSL or Satellite plans: As low as $2.95.
Test1:8
Test2:&amp;
Test3:&ni;
Test4:&amp;
Test5:=
[1] http://localhost/8HMZENUS/2737??PS=
"
},
},
]
files.each { |file|