From 21b244bc084fb45097d0baf02207d0e8a95e6b0c Mon Sep 17 00:00:00 2001 From: Martin Edenhofer Date: Tue, 8 May 2012 00:37:07 +0200 Subject: [PATCH] Improved email parsing. --- app/models/channel/email_parser.rb | 99 ++++++++++++++++++++++++++++-- test/fixtures/mail4.box | 38 ++++++++++++ test/fixtures/mail5.box | 76 +++++++++++++++++++++++ test/fixtures/mail6.box | 32 ++++++++++ test/unit/email_parser_test.rb | 64 +++++++++++++++++++ 5 files changed, 303 insertions(+), 6 deletions(-) create mode 100644 test/fixtures/mail4.box create mode 100644 test/fixtures/mail5.box create mode 100644 test/fixtures/mail6.box diff --git a/app/models/channel/email_parser.rb b/app/models/channel/email_parser.rb index ee9bf35b2..1f3cfa228 100644 --- a/app/models/channel/email_parser.rb +++ b/app/models/channel/email_parser.rb @@ -2,7 +2,7 @@ require 'mail' require 'iconv' class Channel::EmailParser def conv (charset, string) - if charset == 'US-ASCII' || charset == 'ASCII-8BIT' + if !charset || charset == 'US-ASCII' || charset == 'ASCII-8BIT' charset = 'LATIN1' end return string if charset.downcase == 'utf8' || charset.downcase == 'utf-8' @@ -25,7 +25,8 @@ class Channel::EmailParser data[:from_email] = Mail::Address.new( mail[:from].value ).address data[:from_local] = Mail::Address.new( mail[:from].value ).local data[:from_domain] = Mail::Address.new( mail[:from].value ).domain - data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name + data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name || + ( Mail::Address.new( mail[:from].value ).comments && Mail::Address.new( mail[:from].value ).comments[0] ) # do extra decoding because we needed to use field.value data[:from_display_name] = Mail::Field.new( 'X-From', data[:from_display_name] ).to_s @@ -36,17 +37,50 @@ class Channel::EmailParser # body # plain_part = mail.multipart? ? (mail.text_part ? mail.text_part.body.decoded : nil) : mail.body.decoded # html_part = message.html_part ? message.html_part.body.decoded : nil + data[:attachments] = [] if mail.multipart? data[:plain_part] = mail.text_part.body.decoded - data[:plain_part] = conv( mail.text_part.charset || 'LATIN1', data[:plain_part] ) + data[:plain_part] = conv( mail.text_part.charset, data[:plain_part] ) else - data[:plain_part] = mail.body.decoded - data[:plain_part] = conv( mail.body.charset || 'LATIN1', data[:plain_part] ) + + # text part + if !mail.mime_type || mail.mime_type.to_s == '' || mail.mime_type.to_s.downcase == 'text/plain' + data[:plain_part] = mail.body.decoded + data[:plain_part] = conv( mail.charset, data[:plain_part] ) + else + + # html part + filename = '-no name-' + if mail.mime_type.to_s.downcase == 'text/html' + filename = 'html-email' + data[:plain_part] = mail.body.decoded + data[:plain_part] = conv( mail.charset, data[:plain_part] ) + data[:plain_part] = html2ascii( data[:plain_part] ) + + # any other attachments + else + data[:plain_part] = 'no visible content' + end + + # add body as attachment + headers_store = {} + if mail.mime_type + headers_store['Mime-Type'] = mail.mime_type + end + if mail.charset + headers_store['Charset'] = mail.charset + end + attachment = { + :data => mail.body.decoded, + :filename => mail.filename || filename, + :preferences => headers_store + } + data[:attachments].push attachment + end end # attachments if mail.attachments - data[:attachments] = [] mail.attachments.each do |attachment| # get file preferences @@ -232,4 +266,57 @@ class Channel::EmailParser # return new objects return ticket, article, user end + + def html2ascii(string) + + # find and replace it with [x] + link_list = '' + counter = 0 + string.gsub!( //ix ) { |item| + link = $2 + counter = counter + 1 + link_list += "[#{counter}] #{link}\n" + "[#{counter}]" + } + + # remove empty lines + string.gsub!( /^\s*/m, '' ) + + # fix some bad stuff from opera and others + string.gsub!( /(\n\r|\r\r\n|\r\n)/s, "\n" ) + + # strip all other tags + string.gsub!( /\<.+?\>/s, '' ) + + # encode html entities like "–" + string.gsub!( /(&\#(\d+);?)/x ) { |item| + $2.chr + } + + # encode html entities like "d;" + string.gsub!( /(&\#[xX]([0-9a-fA-F]+);?)/x ) { |item| + chr_orig = $1 + hex = $2.hex + if hex + chr = hex.chr + if chr + chr + else + chr_orig + end + else + chr_orig + end + } + + # remove empty lines + string.gsub!( /^\s*\n\s*\n/m, "\n" ) + + # add extracted links + if link_list + string += "\n\n" + link_list + end + + return string + end end \ No newline at end of file diff --git a/test/fixtures/mail4.box b/test/fixtures/mail4.box new file mode 100644 index 000000000..7e316f4d8 --- /dev/null +++ b/test/fixtures/mail4.box @@ -0,0 +1,38 @@ +From k.guenther@example.com Mon May 7 15:08:10 2012 +Return-Path: +X-Original-To: support@example.com +Delivered-To: box@samba.example.com +X-Greylist: delayed 355 seconds by postgrey-1.32 at samba; Mon, 07 May 2012 15:08:09 BST +Received: from smtprelay05.example.com (smtprelay05.example.com [8.6.3.9]) + by samba.example.com (Postfix) with ESMTP id 011F9500D3D + for ; Mon, 7 May 2012 15:08:09 +0100 (BST) +Received: from [1.1.0.7] (helo=exchange.df.eu) + by smtprelay05.example.com with esmtps (TLSv1:RC4-MD5:128) + (Exim 4.68) + (envelope-from ) + id 1SROW2-0007tk-QP + for support@example.com; Mon, 07 May 2012 16:02:18 +0200 +Received: from ECCR04PUBLIC.exchange.local ([1.1.2.4]) by + efe04.exchange.local ([1.1.0.7]) with mapi; Mon, 7 May 2012 15:58:33 +0200 +From: =?utf-8?B?R8O8bnRoZXIgS2F0amEgfCBFeGFtcGxlIEdtYkg=?= + +To: Martin Edenhofer via Znuny Team +Date: Mon, 7 May 2012 15:58:32 +0200 +Subject: AW: Ticket Templates [Ticket#11168] +Thread-Topic: Ticket Templates [Ticket#11168] +Thread-Index: Ac0sGqTnvktNHx1lQoaTDcVI7lUxJQAPqvXA +Message-ID: +References: + <20120507062840.265.107538@portal.example.com> +In-Reply-To: <20120507062840.265.107538@portal.example.com> +Accept-Language: de-DE +Content-Language: de-DE +X-MS-Has-Attach: +X-MS-TNEF-Correlator: +acceptlanguage: de-DE +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 +MIME-Version: 1.0 + +SGFsbG8gS2F0amEsCgpzdXBlciEgSWNoIGZyZXUgbWljaCEKCldpciB3w7xyZGVuIGdlcm5lIGRpZSBQcsOkc2VudGF0aW9uL0VpbmbDvGhydW5nIGluIGRpZSBUaWNrZXQgVGVtcGxhdGVzIHBlciBTY3JlZW5zaGFyaW5nIG9kZXIgenVtaW5kZXN0IHBlciBUZWxlZm9uIG1hY2hlbi4KCk3DtmdsaWNoZSBUZXJtaW5lOgpvIERvLCAxMC4wNS4yMDEyIDE1OjAwLTE2OjAwCm8gRnIsICAxMS4wNS4yMDEyIDEzOjAwLTE0OjAwCm8gRGksICAxNS4wNS4yMDEyIDE3OjAwLTE4OjAwCgrDnGJlciBGZWVkYmFjayB3w7xyZGUgaWNoIG1pY2ggZnJldWVuIQoKUFM6IFp1ciBiZXNzZXJlbiDDnGJlcnNpY2h0IGhhYmUgaWNoIGVpbiBUaWNrZXQgZXJzdGVsbHQuIDopIEltIEZvb3RlciBzaW5kIHVuc2VyZSBnZXNjaMOkZnRsaWNoZW4gS29udGFrdGRhdGVuIChmYWxscyBkaWVzZSBpcmdlbmR3YW5uIGVpbm1hbCBiZW7DtnRpZ3Qgd2VyZGVuIHNvbGx0ZW4pLCBtZWhyIGRhenUgaW4gZWluIHBhYXIgVGFnZW4uCgpMaWViZSBHcsO8w59lIQoKIC1NYXJ0aW4KCgo + diff --git a/test/fixtures/mail5.box b/test/fixtures/mail5.box new file mode 100644 index 000000000..5679179e6 --- /dev/null +++ b/test/fixtures/mail5.box @@ -0,0 +1,76 @@ +From marc.smith@example.com Mon May 7 07:45:48 2012 +Return-Path: +X-Original-To: support@znuny.com +Delivered-To: box@samba.example.com +Received: from mailout-de.example.com (mailout-de.example.com [2.1.6.2]) + by samba.example.com (Postfix) with SMTP id F1C9E500D3D + for ; Mon, 7 May 2012 07:45:47 +0100 (BST) +Received: (qmail invoked by alias); 07 May 2012 06:45:48 -0000 +Received: from unknown (EHLO [1.2.1.2]) [7.3.2.1] + by mail.example.com (mp072) with SMTP; 07 May 2012 08:45:48 +0200 +X-Authenticated: #69078992 +X-Provags-ID: V01U2FsdGVkX1+IkUVPK6GIbZ2ezhmZfpCU0OVlFkuyPGDNsL0V5H + FxvJdecWb4ibKL +Message-ID: <4FA76F9A.3060602@example.com> +Date: Mon, 07 May 2012 08:45:46 +0200 +From: marc.smith@example.com (Marc Smith) +User-Agent: Mozilla/5.0 (Windows NT 6.0; WOW64; rv:12.0) Gecko/20120428 Thunderbird/12.0.1 +MIME-Version: 1.0 +To: Martin Edenhofer via Znuny Team +Subject: Re: XXXX Betatest Ticket Templates [Ticket#11162] +References: <20120507061007.259.822311@portal.znuny.com> +In-Reply-To: <20120507061007.259.822311@portal.znuny.com> +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 8bit +X-Y-GMX-Trusted: 0 +Status: RO +Content-Length: 1418 +Lines: 46 + +Am 07.05.2012 08:10, schrieb Martin Edenhofer via Znuny Team: +> Hallo Marc, +> +> super! Ich freu mich! +> +> Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen. +> +> Mögliche Termine: +> o Do, 10.05.2012 11:00-12:00 +> o Fr, 11.05.2012 09:00-10:00 +> o Di, 15.05.2012 14:00-15:00 +> +> Über Feedback würde ich mich freuen! +> +> PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen. +> +> Liebe Grüße! +> +> -Martin +> +> -- +> Martin Edenhofer +> +> Znuny GmbH // Marienstraße 11 // 10117 Berlin // Germany +> +> P: +49 (0) 30 60 98 54 18-0 +> F: +49 (0) 30 60 98 54 18-8 +> +> Location: Berlin - HRB 139852 B Amtsgericht Berlin-Charlottenburg +> Managing Director: Martin Edenhofer +Hallo Martin, + +John und ich könnten leider nur am Freitag, da wir Donnerstag und nächste +Woche bereits Termine haben. + +Wir würden uns dann den Freitag vormerken...;-) + +N Screensharing ist bei uns leider nicht so ohne Probleme möglich, bzw. +wir könnten einen PC aufsetzen mit nem seperaten Internetzugang auf dem +wir ne VM vorbereiten könnten, da wir von dem "Internet PC" nicht auf +unser XXXX zugreifen können. Falls ihr sonst noch irgendwas benötigt +einfach kurz ne Rückmeldung...;-) + +Grüße aus Bonn + +John & Marc + diff --git a/test/fixtures/mail6.box b/test/fixtures/mail6.box new file mode 100644 index 000000000..46b6a8101 --- /dev/null +++ b/test/fixtures/mail6.box @@ -0,0 +1,32 @@ +From me@bogen.net Sat Sep 13 16:50:43 2003 +Return-Path: +Received: from airoma.example (law10-f30.law10.airoma.example [4.4.4.4]) by esanta.edenhofer.de (Postfix) with ESMTP id 2307484296 for ; Sat, 13 Sep 2003 16:50:43 +0200 (CEST) +Received: from mail pickup service by airoma.example with Mc SMTPSVC; Sat, 13 Sep 2003 07:37:26 -0700 +Received: from 11.11.11.11 by lw10fd.law10.com with HTTP; Sat, 13 Sep 2003 14:37:26 GMT +X-Originating-Ip: [5.5.5.5] +X-Originating-Email: [me@example.com] +From: =?Windows-1252?Q?Hans_B=C4KO?= =?iso-8859-15?q?Sch=F6nland?= +To: =?iso-8859-2?Q?Namedy=F1ski?= (hans@example.com) +Subject: utf8: =?UTF-8?Q?=E4=BD=BF=E3=81=A3=E3=81=A6?= / ISO-8859-1: =?iso-8859-1?Q?Priorit=E4t=22_?= / cp-1251: =?windows-1251?B?0eXw4+XpINPj6+j26uj1?= +Date: Sat, 13 Sep 2003 10:37:26 -0400 +MIME-Version: 1.0 +Content-Type: text/html; charset="iso-8859-15"; format=flowed +Message-Id: +X-Originalarrivaltime: 13 Sep 2003 14:37:26.0630 (UTC) FILETIME=[8D57B860:01C37A04] + +

this is a test



Compare Cable, DSL or Satellite plans: As low as $2.95. + +
+ +
+Test1:– +
+Test2:& +
+Test3:∋ +
+Test4:& +
+Test5:= + + diff --git a/test/unit/email_parser_test.rb b/test/unit/email_parser_test.rb index 1b950ebb8..39530709b 100644 --- a/test/unit/email_parser_test.rb +++ b/test/unit/email_parser_test.rb @@ -35,6 +35,70 @@ class EmailParserTest < ActiveSupport::TestCase :subject => 'Ticket Templates', }, }, + { + :data => IO.read('test/fixtures/mail4.box'), + :body_md5 => '2f2c3a5c233dbd9658ab37d39469b7d0', + :params => { + :from => '"Günther Katja | Example GmbH" ', + :from_email => 'k.guenther@example.com', + :from_display_name => 'Günther Katja | Example GmbH', + :subject => 'AW: Ticket Templates [Ticket#11168]', + :plain_part => "Hallo Katja, + +super! Ich freu mich! + +Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen. + +Mögliche Termine: +o Do, 10.05.2012 15:00-16:00 +o Fr, 11.05.2012 13:00-14:00 +o Di, 15.05.2012 17:00-18:00 + +Über Feedback würde ich mich freuen! + +PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen. + +Liebe Grüße! + + -Martin +", + }, + }, + { + :data => IO.read('test/fixtures/mail5.box'), + :body_md5 => '51364a306362f513f53f2bbea7820f37', + :params => { + :from => 'marc.smith@example.com (Marc Smith)', + :from_email => 'marc.smith@example.com', + :from_display_name => 'Marc Smith', + :subject => 'Re: XXXX Betatest Ticket Templates [Ticket#11162]', + }, + }, + { + :data => IO.read('test/fixtures/mail6.box'), + :body_md5 => '1fc492b8d762d82f861dbb70b7cf7610', + :params => { + :from => '"Hans BÄKOSchönland" ', + :from_email => 'me@bogen.net', + :from_display_name => 'Hans BÄKOSchönland', + :subject => 'utf8: 使って / ISO-8859-1: Priorität" / cp-1251: Сергей Углицких', + :plain_part => "this is a test [1]Compare Cable, DSL or Satellite plans: As low as $2.95. + +Test1:8 + +Test2:& + +Test3:∋ + +Test4:& + +Test5:= + + +[1] http://localhost/8HMZENUS/2737??PS= +" + }, + }, ] files.each { |file|