Improved String.html2text method (strip HTML <style> tags together with their content).

This commit is contained in:
Martin Edenhofer 2015-09-02 09:14:48 +02:00
parent 2563f15238
commit d9574d312c
5 changed files with 245 additions and 33 deletions

View file

@ -1,6 +1,15 @@
# Copyright (C) 2012-2014 Zammad Foundation, http://zammad-foundation.org/ # Copyright (C) 2012-2014 Zammad Foundation, http://zammad-foundation.org/
class Channel::Driver::MailStdin < Channel::EmailParser class Channel::Driver::MailStdin < Channel::EmailParser
=begin
process emails from STDIN
cat /path/to/mail.eml | rails r 'Channel::Driver::MailStdin.new'
=end
def initialize def initialize
Rails.logger.info 'read main from STDIN' Rails.logger.info 'read main from STDIN'

View file

@ -71,11 +71,11 @@ class String
returns returns
'string with text' 'string with text only'
=end =end
def html2text def html2text(string_only = false)
string = "#{self}" string = "#{self}"
# in case of invalid encoding, strip invalid chars # in case of invalid encoding, strip invalid chars
@ -88,12 +88,17 @@ class String
# find <a href=....> and replace it with [x] # find <a href=....> and replace it with [x]
link_list = '' link_list = ''
counter = 0 counter = 0
if !string_only
string.gsub!( /<a\s.*?href=("|')(.+?)("|').*?>/ix ) { string.gsub!( /<a\s.*?href=("|')(.+?)("|').*?>/ix ) {
link = $2 link = $2
counter = counter + 1 counter = counter + 1
link_list += "[#{counter}] #{link}\n" link_list += "[#{counter}] #{link}\n"
"[#{counter}] " "[#{counter}] "
} }
end
# remove style tags with content
string.gsub!(/<style(|\s.+?)>(.+?)<\/style>/im, '')
# remove empty lines # remove empty lines
string.gsub!( /^\s*/m, '' ) string.gsub!( /^\s*/m, '' )
@ -107,28 +112,38 @@ class String
} }
# remove all new lines # remove all new lines
string.gsub!( /(\n\r|\r\r\n|\r\n|\n)/, '' ) string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
# blockquote handling
string.gsub!( %r{<blockquote(| [^>]*)>(.+?)</blockquote>}m ) { |placeholder|
placeholder = "\n" + $2.html2text(true).gsub(/^(.*)$/, "&gt; \\1") + "\n"
}
# pre/code handling 2/2 # pre/code handling 2/2
string.gsub!( /###BR###/, "\n" ) string.gsub!(/###BR###/, "\n" )
# add counting # add counting
string.gsub!(/<li(| [^>]*)>/i, "\n* ") string.gsub!(/<li(| [^>]*)>/i, "\n* ")
# add quoting
string.gsub!(/<blockquote(| [^>]*)>/i, '> ')
# add hr # add hr
string.gsub!(%r{<hr(|/| [^>]*)>}i, "___\n") string.gsub!(%r{<hr(|/| [^>]*)>}i, "\n___\n")
# add h\d
string.gsub!(%r{</h\d>}i, "\n")
# add new lines # add new lines
string.gsub!( %r{<(br|table)(|/| [^>]*)>}i, "\n" ) string.gsub!( %r{</div><div(|\s.+?)>}im, "\n" )
string.gsub!( %r{</(div|p|pre|blockquote|table|tr)(|\s.+?)>}i, "\n" ) string.gsub!( %r{</p><p(|\s.+?)>}im, "\n" )
string.gsub!( %r{<(div|p|pre|br|table|h)(|/| [^>]*)>}i, "\n" )
string.gsub!( %r{</(tr|p|br|div)(|\s.+?)>}i, "\n" )
string.gsub!( %r{</td>}i, ' ' ) string.gsub!( %r{</td>}i, ' ' )
# strip all other tags # strip all other tags
string.gsub!( /\<.+?\>/, '' ) string.gsub!( /\<.+?\>/, '' )
# replace multiple spaces with one
string.gsub!(/ /, ' ')
# strip all &amp; &lt; &gt; &quot; # strip all &amp; &lt; &gt; &quot;
string.gsub!( '&amp;', '&' ) string.gsub!( '&amp;', '&' )
string.gsub!( '&lt;', '<' ) string.gsub!( '&lt;', '<' )
@ -173,9 +188,11 @@ class String
# remove multiple empty lines # remove multiple empty lines
string.gsub!(/\n\n\n/, "\n\n") string.gsub!(/\n\n\n/, "\n\n")
string.strip!
# add extracted links # add extracted links
if link_list != '' if link_list != ''
string += "\n\n" + link_list string += "\n\n\n" + link_list
end end
string.strip string.strip

View file

@ -153,5 +153,182 @@ you
>' >'
assert_equal( should, html.html2text ) assert_equal( should, html.html2text )
html = ' <style type="text/css">
body {
width:90% !important;
-webkit-text-size-adjust:90%;
-ms-text-size-adjust:90%;
font-family:\'helvetica neue\', helvetica, arial, geneva, sans-serif; f=
ont-size: 12px;;
}
img {
outline:none; text-decoration:none; -ms-interpolation-mode: bicubic;
}
a img {
border:none;
}
table td {
border-collapse: collapse;
}
table {
border-collapse: collapse; mso-table-lspace:0pt; mso-table-rspace:0pt;
}
p, table, div, td {
max-width: 600px;
}
p {
margin: 0;
}
blockquote, pre {
margin: 0px;
padding: 8px 12px 8px 12px;
}
</style><p>some other content</p>'
should = 'some other content'
assert_equal( should, html.html2text )
html = ' IT-Infrastruktur</span><br>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="Generator" content="Microsoft Word 14 (filtered
medium)">
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]-->
<style><!--
@font-face
{font-family:calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
p.msonormal, li.msonormal, div.msonormal
{margin:0cm;
margin-bottom:.0001pt;
font-size:11.0pt;
font-family:"calibri","sans-serif";
mso-fareast-language:en-us;}
a:link, span.msohyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.msohyperlinkfollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p.msoacetate, li.msoacetate, div.msoacetate
{mso-style-priority:99;
mso-style-link:"sprechblasentext zchn";
margin:0cm;
margin-bottom:.0001pt;
font-size:8.0pt;
font-family:"tahoma","sans-serif";
mso-fareast-language:en-us;}
span.e-mailformatvorlage17
{mso-style-type:personal;
font-family:"calibri","sans-serif";
color:windowtext;}
span.sprechblasentextzchn
{mso-style-name:"sprechblasentext zchn";
mso-style-priority:99;
mso-style-link:sprechblasentext;
font-family:"tahoma","sans-serif";}
.msochpdefault
{mso-style-type:export-only;
font-family:"calibri","sans-serif";
mso-fareast-language:en-us;}
@page wordsection1
{size:612.0pt 792.0pt;
margin:70.85pt 70.85pt 2.0cm 70.85pt;}
div.wordsection1
{page:wordsection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->'
should = 'IT-Infrastruktur'
assert_equal( should, html.html2text )
html = "<h1>some head</h1>
some content
<blockquote>
<p>line 1</p>
<p>line 2</p>
</blockquote>
<p>some text later</p>"
result = 'some head
some content
> line 1
> line 2
some text later'
assert_equal( result, html.html2text )
html = "<h1>some head</h1>
some content
<blockquote>
line 1<br/>
line 2<br>
</blockquote>
<p>some text later</p>"
result = 'some head
some content
> line 1
> line 2
some text later'
assert_equal( result, html.html2text )
html = "<h1>some head</h1>
some content
<blockquote>
<div><div>line 1</div><br></div>
<div><div>line 2</div><br></div>
</blockquote>
some text later"
result = 'some head
some content
> line 1
>
> line 2
some text later'
assert_equal( result, html.html2text )
html = "<p>Best regards,</p>
<p><i>Your Team Team</i></p>
<p>P.S.: You receive this e-mail because you are listed in our database as person who ordered a Team license. Please click <a href=\"http://www.teamviewer.example/en/company/unsubscribe.aspx?id=1009645&ident=xxx\">here</a> to unsubscribe from further e-mails.</p>
-----------------------------
<br />"
result = 'Best regards,
Your Team Team
P.S.: You receive this e-mail because you are listed in our database as person who ordered a Team license. Please click [1] here to unsubscribe from further e-mails.
-----------------------------
[1] http://www.teamviewer.example/en/company/unsubscribe.aspx?id=1009645&ident=xxx'
assert_equal( result, html.html2text )
html = "<div><br>Dave and leaned her
days adam.</div><span style=\"color:#F7F3FF; font-size:8px\">Maybe we
want any help me that.<br>Next morning charlie saw at their
father.<br>Well as though adam took out here. Melvin will be more money.
Called him into this one last thing.<br>Men-----------------------
<br />"
result = 'Dave and leaned her days adam.
Maybe we want any help me that.
Next morning charlie saw at their father.
Well as though adam took out here. Melvin will be more money. Called him into this one last thing.
Men-----------------------'
assert_equal( result, html.html2text )
end end
end end

View file

@ -86,7 +86,7 @@ Liebe Grüße!
}, },
{ {
data: IO.read('test/fixtures/mail6.box'), data: IO.read('test/fixtures/mail6.box'),
body_md5: 'cc60217317756f45a6e02829c0a8c49c', body_md5: '6229bcc5fc1396445d781daf3c12a285',
params: { params: {
from: '"Hans BÄKOSchönland" <me@bogen.net>', from: '"Hans BÄKOSchönland" <me@bogen.net>',
from_email: 'me@bogen.net', from_email: 'me@bogen.net',
@ -103,6 +103,7 @@ Test3:&ni;
Test4:& Test4:&
Test5:= Test5:=
[1] http://localhost/8HMZENUS/2737??PS=" [1] http://localhost/8HMZENUS/2737??PS="
}, },
}, },
@ -327,7 +328,7 @@ Hof
# spam email # spam email
{ {
data: IO.read('test/fixtures/mail16.box'), data: IO.read('test/fixtures/mail16.box'),
body_md5: 'a2367adfa77857a078dad83826d659e8', body_md5: '5e96cc53e78c0e44523502ee50647808',
params: { params: {
from: nil, from: nil,
from_email: 'vipyimin@126.com', from_email: 'vipyimin@126.com',
@ -361,7 +362,7 @@ Hof
}, },
{ {
data: IO.read('test/fixtures/mail19.box'), data: IO.read('test/fixtures/mail19.box'),
body_md5: '3e42be74f967379a3053f21f4125ca66', body_md5: '0bf7e746158d121bce7e2c46b64b0d39',
params: { params: {
from: '"我" <>', from: '"我" <>',
from_email: '"=?GB2312?B?ztI=?=" <>', from_email: '"=?GB2312?B?ztI=?=" <>',
@ -372,7 +373,7 @@ Hof
}, },
{ {
data: IO.read('test/fixtures/mail20.box'), data: IO.read('test/fixtures/mail20.box'),
body_md5: '65ca1367dfc26abcf49d30f68098f122', body_md5: '646e803f30cddf06db90f426df3672c1',
params: { params: {
from: 'Health and Care-Mall <drugs-cheapest8@sicor.com>', from: 'Health and Care-Mall <drugs-cheapest8@sicor.com>',
from_email: 'drugs-cheapest8@sicor.com', from_email: 'drugs-cheapest8@sicor.com',
@ -407,12 +408,13 @@ x1qJ>mC7f 512y1GA420lCQe09s9u%uks&atilde; &psi;2X5A4g3nu&larr;&Tau;yst72pMh&scar
Both hands through the fear in front. Both hands through the fear in front.
Wade to give it seemed like this. Yeah but one for any longer. Everything you going inside the kids. Wade to give it seemed like this. Yeah but one for any longer. Everything you going inside the kids.
[1] http://pxmzcgy.storeprescription.ru?zz=fkxffti" [1] http://pxmzcgy.storeprescription.ru?zz=fkxffti"
}, },
}, },
{ {
data: IO.read('test/fixtures/mail21.box'), data: IO.read('test/fixtures/mail21.box'),
body_md5: 'f909a17fde261099903f3236f8755249', body_md5: '617017ee0b2d1842f410fceaac696230',
params: { params: {
from: 'Viagra Super Force Online <pharmacy_affordable1@ertelecom.ru>', from: 'Viagra Super Force Online <pharmacy_affordable1@ertelecom.ru>',
from_email: 'pharmacy_affordable1@ertelecom.ru', from_email: 'pharmacy_affordable1@ertelecom.ru',
@ -423,7 +425,7 @@ Wade to give it seemed like this. Yeah but one for any longer. Everything you go
}, },
{ {
data: IO.read('test/fixtures/mail22.box'), data: IO.read('test/fixtures/mail22.box'),
body_md5: '9e79cb133d52afe9e18e8438df539305', body_md5: '7dd64b40dce1aa3053fc7bbdea136612',
params: { params: {
from: 'Gilbertina Suthar <ireoniqla@lipetsk.ru>', from: 'Gilbertina Suthar <ireoniqla@lipetsk.ru>',
from_email: 'ireoniqla@lipetsk.ru', from_email: 'ireoniqla@lipetsk.ru',
@ -434,7 +436,8 @@ Wade to give it seemed like this. Yeah but one for any longer. Everything you go
Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded. Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded.
Freemont and they talked with beppe. Freemont and they talked with beppe.
Thinking of bed and whenever adam. Thinking of bed and whenever adam.
Mike was too tired man to hear.I10PQSHEJl2Nwf&tilde;2113S173 &Icirc;1mEbb5N371L&piv;C7AlFnR1&diams;HG64B242&brvbar;M2242zk&Iota;N&rceil;7&rceil;TBN&ETH; T2xPI&ograve;gI2&Atilde;lL2&Otilde;ML&perp;22Sa&Psi;RBreathed adam gave the master bedroom door. Mike was too tired man to hear.
I10PQSHEJl2Nwf&tilde;2113S173 &Icirc;1mEbb5N371L&piv;C7AlFnR1&diams;HG64B242&brvbar;M2242zk&Iota;N&rceil;7&rceil;TBN&ETH; T2xPI&ograve;gI2&Atilde;lL2&Otilde;ML&perp;22Sa&Psi;RBreathed adam gave the master bedroom door.
Better get charlie took the wall. Better get charlie took the wall.
Charlotte clark smile he saw charlie. Charlotte clark smile he saw charlie.
Dave and leaned her tears adam. Dave and leaned her tears adam.
@ -445,6 +448,7 @@ Men joined the pickup truck pulled away. Chuck could make sure that.[1] &dagger;
Just then returned to believe it here. Just then returned to believe it here.
Freemont and pulling out several minutes. Freemont and pulling out several minutes.
[1] &#104;&#116;&#116;&#112;&#58;&#47;&#47;&#1072;&#1086;&#1089;&#1082;&#46;&#1088;&#1092;?jmlfwnwe&ucwkiyyc", [1] &#104;&#116;&#116;&#112;&#58;&#47;&#47;&#1072;&#1086;&#1089;&#1082;&#46;&#1088;&#1092;?jmlfwnwe&ucwkiyyc",
}, },
@ -571,14 +575,15 @@ gate GmbH * Gladbacher Str. 74 * 40219 Düsseldorf
}, },
{ {
data: IO.read('test/fixtures/mail29.box'), data: IO.read('test/fixtures/mail29.box'),
body_md5: 'b6cc8164ce896046d631ddd44f8c9f6e', body_md5: 'bd34701dd5246b7651f67aeea6dd0fd3',
params: { params: {
from: 'Example Sales <sales@example.com>', from: 'Example Sales <sales@example.com>',
from_email: 'sales@example.com', from_email: 'sales@example.com',
from_display_name: 'Example Sales', from_display_name: 'Example Sales',
subject: 'Example licensing information: No channel available', subject: 'Example licensing information: No channel available',
to: 'info@znuny.inc', to: 'info@znuny.inc',
body: "Dear Mr. Edenhofer,We want to keep you updated on TeamViewer licensing shortages on a regular basis. body: "Dear Mr. Edenhofer,
We want to keep you updated on TeamViewer licensing shortages on a regular basis.
We would like to inform you that since the last message on 25-Nov-2014 there have been temporary session channel exceedances which make it impossible to establish more sessions. Since the last e-mail this has occurred in a total of 1 cases. We would like to inform you that since the last message on 25-Nov-2014 there have been temporary session channel exceedances which make it impossible to establish more sessions. Since the last e-mail this has occurred in a total of 1 cases.
Additional session channels can be added at any time. Please visit our [1] TeamViewer Online Shop for pricing information. Additional session channels can be added at any time. Please visit our [1] TeamViewer Online Shop for pricing information.
Thank you - and again all the best with TeamViewer! Thank you - and again all the best with TeamViewer!
@ -601,7 +606,7 @@ Registration AG Ulm HRB 534075 * General Manager Holger Felgner
}, },
{ {
data: IO.read('test/fixtures/mail30.box'), data: IO.read('test/fixtures/mail30.box'),
body_md5: 'bba63e2dbe29e7b82d893c2554ff466a', body_md5: '23220f9537e59a8febc62705aa1c387c',
params: { params: {
from: 'Manfred Haert <Manfred.Haert@example.com>', from: 'Manfred Haert <Manfred.Haert@example.com>',
from_email: 'Manfred.Haert@example.com', from_email: 'Manfred.Haert@example.com',
@ -634,7 +639,8 @@ JETZT AUCH BEI FACEBOOK !
[3] https://www.facebook.com/test [3] https://www.facebook.com/test
___________________________________ ___________________________________
Test Somewhere GmbH Test Somewhere GmbH
Diesee-Mail ist ausschließlich für den beabsichtigten Empfängerbestimmt. Sollten Sie irrtümlich diese e-Mail erhaltenhaben, unterrichten Sie uns bitte umgehend unter[4] kontakt@example.com und vernichten Sie diese Mitteilungeinschließlich der ggf. beigefügten Dateien.
Diesee-Mail ist ausschließlich für den beabsichtigten Empfängerbestimmt. Sollten Sie irrtümlich diese e-Mail erhaltenhaben, unterrichten Sie uns bitte umgehend unter[4] kontakt@example.com und vernichten Sie diese Mitteilungeinschließlich der ggf. beigefügten Dateien.
Weil wir die Echtheit oder Vollständigkeit der in dieserNachricht enthaltenen Informationen nicht garantierenkönnen, bitten wir um Verständnis, dass wir zu Ihrem undunserem Schutz die rechtliche Verbindlichkeit dervorstehenden Erklärungen ausschließen, soweit wir mitIhnen keine anders lautenden Vereinbarungen getroffenhaben. Weil wir die Echtheit oder Vollständigkeit der in dieserNachricht enthaltenen Informationen nicht garantierenkönnen, bitten wir um Verständnis, dass wir zu Ihrem undunserem Schutz die rechtliche Verbindlichkeit dervorstehenden Erklärungen ausschließen, soweit wir mitIhnen keine anders lautenden Vereinbarungen getroffenhaben.

View file

@ -170,7 +170,8 @@ Homegrown dandelions by herself into her lips. Such an excuse to stop thinking a
___ ___
[2] Это сообщение свободно от вирусов и вредоносного ПО благодаря [3] avast! Antivirus защита активна. [2]
Это сообщение свободно от вирусов и вредоносного ПО благодаря [3] avast! Antivirus защита активна.
[1] http://piufup.medicatingsafemart.ru [1] http://piufup.medicatingsafemart.ru
@ -195,7 +196,8 @@ ___
Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded. Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded.
Freemont and they talked with beppe. Freemont and they talked with beppe.
Thinking of bed and whenever adam. Thinking of bed and whenever adam.
Mike was too tired man to hear.I10PQSHEJl2Nwf&tilde;2113S173 &Icirc;1mEbb5N371L&piv;C7AlFnR1&diams;HG64B242&brvbar;M2242zk&Iota;N&rceil;7&rceil;TBN&ETH; T2xPI&ograve;gI2&Atilde;lL2&Otilde;ML&perp;22Sa&Psi;RBreathed adam gave the master bedroom door. Mike was too tired man to hear.
I10PQSHEJl2Nwf&tilde;2113S173 &Icirc;1mEbb5N371L&piv;C7AlFnR1&diams;HG64B242&brvbar;M2242zk&Iota;N&rceil;7&rceil;TBN&ETH; T2xPI&ograve;gI2&Atilde;lL2&Otilde;ML&perp;22Sa&Psi;RBreathed adam gave the master bedroom door.
Better get charlie took the wall. Better get charlie took the wall.
Charlotte clark smile he saw charlie. Charlotte clark smile he saw charlie.
Dave and leaned her tears adam. Dave and leaned her tears adam.
@ -206,6 +208,7 @@ Men joined the pickup truck pulled away. Chuck could make sure that.[1] &dagger;
Just then returned to believe it here. Just then returned to believe it here.
Freemont and pulling out several minutes. Freemont and pulling out several minutes.
[1] &#104;&#116;&#116;&#112;&#58;&#47;&#47;&#1072;&#1086;&#1089;&#1082;&#46;&#1088;&#1092;?jmlfwnwe&ucwkiyyc", [1] &#104;&#116;&#116;&#112;&#58;&#47;&#47;&#1072;&#1086;&#1089;&#1082;&#46;&#1088;&#1092;?jmlfwnwe&ucwkiyyc",
sender: 'Customer', sender: 'Customer',
type: 'email', type: 'email',