Improved String.html2text method (cut out html style content).
This commit is contained in:
parent
2563f15238
commit
d9574d312c
5 changed files with 245 additions and 33 deletions
|
@ -1,6 +1,15 @@
|
|||
# Copyright (C) 2012-2014 Zammad Foundation, http://zammad-foundation.org/
|
||||
|
||||
class Channel::Driver::MailStdin < Channel::EmailParser
|
||||
|
||||
=begin
|
||||
|
||||
process emails from STDIN
|
||||
|
||||
cat /path/to/mail.eml | rails r 'Channel::Driver::MailStdin.new'
|
||||
|
||||
=end
|
||||
|
||||
def initialize
|
||||
Rails.logger.info 'read main from STDIN'
|
||||
|
||||
|
|
|
@ -71,11 +71,11 @@ class String
|
|||
|
||||
returns
|
||||
|
||||
'string with text'
|
||||
'string with text only'
|
||||
|
||||
=end
|
||||
|
||||
def html2text
|
||||
def html2text(string_only = false)
|
||||
string = "#{self}"
|
||||
|
||||
# in case of invalid encoding, strip invalid chars
|
||||
|
@ -88,12 +88,17 @@ class String
|
|||
# find <a href=....> and replace it with [x]
|
||||
link_list = ''
|
||||
counter = 0
|
||||
if !string_only
|
||||
string.gsub!( /<a\s.*?href=("|')(.+?)("|').*?>/ix ) {
|
||||
link = $2
|
||||
counter = counter + 1
|
||||
link_list += "[#{counter}] #{link}\n"
|
||||
"[#{counter}] "
|
||||
}
|
||||
end
|
||||
|
||||
# remove style tags with content
|
||||
string.gsub!(/<style(|\s.+?)>(.+?)<\/style>/im, '')
|
||||
|
||||
# remove empty lines
|
||||
string.gsub!( /^\s*/m, '' )
|
||||
|
@ -109,26 +114,36 @@ class String
|
|||
# remove all new lines
|
||||
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
|
||||
|
||||
# blockquote handling
|
||||
string.gsub!( %r{<blockquote(| [^>]*)>(.+?)</blockquote>}m ) { |placeholder|
|
||||
placeholder = "\n" + $2.html2text(true).gsub(/^(.*)$/, "> \\1") + "\n"
|
||||
}
|
||||
|
||||
# pre/code handling 2/2
|
||||
string.gsub!(/###BR###/, "\n" )
|
||||
|
||||
# add counting
|
||||
string.gsub!(/<li(| [^>]*)>/i, "\n* ")
|
||||
|
||||
# add quoting
|
||||
string.gsub!(/<blockquote(| [^>]*)>/i, '> ')
|
||||
|
||||
# add hr
|
||||
string.gsub!(%r{<hr(|/| [^>]*)>}i, "___\n")
|
||||
string.gsub!(%r{<hr(|/| [^>]*)>}i, "\n___\n")
|
||||
|
||||
# add h\d
|
||||
string.gsub!(%r{</h\d>}i, "\n")
|
||||
|
||||
# add new lines
|
||||
string.gsub!( %r{<(br|table)(|/| [^>]*)>}i, "\n" )
|
||||
string.gsub!( %r{</(div|p|pre|blockquote|table|tr)(|\s.+?)>}i, "\n" )
|
||||
string.gsub!( %r{</div><div(|\s.+?)>}im, "\n" )
|
||||
string.gsub!( %r{</p><p(|\s.+?)>}im, "\n" )
|
||||
string.gsub!( %r{<(div|p|pre|br|table|h)(|/| [^>]*)>}i, "\n" )
|
||||
string.gsub!( %r{</(tr|p|br|div)(|\s.+?)>}i, "\n" )
|
||||
string.gsub!( %r{</td>}i, ' ' )
|
||||
|
||||
# strip all other tags
|
||||
string.gsub!( /\<.+?\>/, '' )
|
||||
|
||||
# replace multiple spaces with one
|
||||
string.gsub!(/ /, ' ')
|
||||
|
||||
# strip all &amp; &lt; &gt; &quot;
|
||||
string.gsub!( '&', '&' )
|
||||
string.gsub!( '<', '<' )
|
||||
|
@ -173,9 +188,11 @@ class String
|
|||
# remove multiple empty lines
|
||||
string.gsub!(/\n\n\n/, "\n\n")
|
||||
|
||||
string.strip!
|
||||
|
||||
# add extracted links
|
||||
if link_list != ''
|
||||
string += "\n\n" + link_list
|
||||
string += "\n\n\n" + link_list
|
||||
end
|
||||
|
||||
string.strip
|
||||
|
|
|
@ -153,5 +153,182 @@ you
|
|||
>'
|
||||
assert_equal( should, html.html2text )
|
||||
|
||||
html = ' <style type="text/css">
|
||||
body {
|
||||
width:90% !important;
|
||||
-webkit-text-size-adjust:90%;
|
||||
-ms-text-size-adjust:90%;
|
||||
font-family:\'helvetica neue\', helvetica, arial, geneva, sans-serif; f=
|
||||
ont-size: 12px;;
|
||||
}
|
||||
img {
|
||||
outline:none; text-decoration:none; -ms-interpolation-mode: bicubic;
|
||||
}
|
||||
a img {
|
||||
border:none;
|
||||
}
|
||||
table td {
|
||||
border-collapse: collapse;
|
||||
}
|
||||
table {
|
||||
border-collapse: collapse; mso-table-lspace:0pt; mso-table-rspace:0pt;
|
||||
}
|
||||
p, table, div, td {
|
||||
max-width: 600px;
|
||||
}
|
||||
p {
|
||||
margin: 0;
|
||||
}
|
||||
blockquote, pre {
|
||||
margin: 0px;
|
||||
padding: 8px 12px 8px 12px;
|
||||
}
|
||||
|
||||
</style><p>some other content</p>'
|
||||
should = 'some other content'
|
||||
assert_equal( should, html.html2text )
|
||||
|
||||
|
||||
html = ' IT-Infrastruktur</span><br>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="Generator" content="Microsoft Word 14 (filtered
|
||||
medium)">
|
||||
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
|
||||
o\:* {behavior:url(#default#VML);}
|
||||
w\:* {behavior:url(#default#VML);}
|
||||
.shape {behavior:url(#default#VML);}
|
||||
</style><![endif]-->
|
||||
<style><!--
|
||||
|
||||
@font-face
|
||||
{font-family:calibri;
|
||||
panose-1:2 15 5 2 2 2 4 3 2 4;}
|
||||
@font-face
|
||||
{font-family:tahoma;
|
||||
panose-1:2 11 6 4 3 5 4 4 2 4;}
|
||||
|
||||
p.msonormal, li.msonormal, div.msonormal
|
||||
{margin:0cm;
|
||||
margin-bottom:.0001pt;
|
||||
font-size:11.0pt;
|
||||
font-family:"calibri","sans-serif";
|
||||
mso-fareast-language:en-us;}
|
||||
a:link, span.msohyperlink
|
||||
{mso-style-priority:99;
|
||||
color:blue;
|
||||
text-decoration:underline;}
|
||||
a:visited, span.msohyperlinkfollowed
|
||||
{mso-style-priority:99;
|
||||
color:purple;
|
||||
text-decoration:underline;}
|
||||
p.msoacetate, li.msoacetate, div.msoacetate
|
||||
{mso-style-priority:99;
|
||||
mso-style-link:"sprechblasentext zchn";
|
||||
margin:0cm;
|
||||
margin-bottom:.0001pt;
|
||||
font-size:8.0pt;
|
||||
font-family:"tahoma","sans-serif";
|
||||
mso-fareast-language:en-us;}
|
||||
span.e-mailformatvorlage17
|
||||
{mso-style-type:personal;
|
||||
font-family:"calibri","sans-serif";
|
||||
color:windowtext;}
|
||||
span.sprechblasentextzchn
|
||||
{mso-style-name:"sprechblasentext zchn";
|
||||
mso-style-priority:99;
|
||||
mso-style-link:sprechblasentext;
|
||||
font-family:"tahoma","sans-serif";}
|
||||
.msochpdefault
|
||||
{mso-style-type:export-only;
|
||||
font-family:"calibri","sans-serif";
|
||||
mso-fareast-language:en-us;}
|
||||
@page wordsection1
|
||||
{size:612.0pt 792.0pt;
|
||||
margin:70.85pt 70.85pt 2.0cm 70.85pt;}
|
||||
div.wordsection1
|
||||
{page:wordsection1;}
|
||||
--></style><!--[if gte mso 9]><xml>
|
||||
<o:shapedefaults v:ext="edit" spidmax="1026" />
|
||||
</xml><![endif]--><!--[if gte mso 9]><xml>
|
||||
<o:shapelayout v:ext="edit">
|
||||
<o:idmap v:ext="edit" data="1" />
|
||||
</o:shapelayout></xml><![endif]-->'
|
||||
should = 'IT-Infrastruktur'
|
||||
assert_equal( should, html.html2text )
|
||||
|
||||
html = "<h1>some head</h1>
|
||||
some content
|
||||
<blockquote>
|
||||
<p>line 1</p>
|
||||
<p>line 2</p>
|
||||
</blockquote>
|
||||
<p>some text later</p>"
|
||||
result = 'some head
|
||||
some content
|
||||
> line 1
|
||||
> line 2
|
||||
|
||||
some text later'
|
||||
assert_equal( result, html.html2text )
|
||||
|
||||
html = "<h1>some head</h1>
|
||||
some content
|
||||
<blockquote>
|
||||
line 1<br/>
|
||||
line 2<br>
|
||||
</blockquote>
|
||||
<p>some text later</p>"
|
||||
result = 'some head
|
||||
some content
|
||||
> line 1
|
||||
> line 2
|
||||
|
||||
some text later'
|
||||
assert_equal( result, html.html2text )
|
||||
|
||||
|
||||
html = "<h1>some head</h1>
|
||||
some content
|
||||
<blockquote>
|
||||
<div><div>line 1</div><br></div>
|
||||
<div><div>line 2</div><br></div>
|
||||
</blockquote>
|
||||
some text later"
|
||||
result = 'some head
|
||||
some content
|
||||
> line 1
|
||||
>
|
||||
> line 2
|
||||
some text later'
|
||||
assert_equal( result, html.html2text )
|
||||
|
||||
html = "<p>Best regards,</p>
|
||||
<p><i>Your Team Team</i></p>
|
||||
<p>P.S.: You receive this e-mail because you are listed in our database as person who ordered a Team license. Please click <a href=\"http://www.teamviewer.example/en/company/unsubscribe.aspx?id=1009645&ident=xxx\">here</a> to unsubscribe from further e-mails.</p>
|
||||
-----------------------------
|
||||
<br />"
|
||||
result = 'Best regards,
|
||||
Your Team Team
|
||||
P.S.: You receive this e-mail because you are listed in our database as person who ordered a Team license. Please click [1] here to unsubscribe from further e-mails.
|
||||
-----------------------------
|
||||
|
||||
|
||||
[1] http://www.teamviewer.example/en/company/unsubscribe.aspx?id=1009645&ident=xxx'
|
||||
assert_equal( result, html.html2text )
|
||||
|
||||
html = "<div><br>Dave and leaned her
|
||||
days adam.</div><span style=\"color:#F7F3FF; font-size:8px\">Maybe we
|
||||
want any help me that.<br>Next morning charlie saw at their
|
||||
father.<br>Well as though adam took out here. Melvin will be more money.
|
||||
Called him into this one last thing.<br>Men-----------------------
|
||||
<br />"
|
||||
result = 'Dave and leaned her days adam.
|
||||
Maybe we want any help me that.
|
||||
Next morning charlie saw at their father.
|
||||
Well as though adam took out here. Melvin will be more money. Called him into this one last thing.
|
||||
Men-----------------------'
|
||||
assert_equal( result, html.html2text )
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -86,7 +86,7 @@ Liebe Grüße!
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail6.box'),
|
||||
body_md5: 'cc60217317756f45a6e02829c0a8c49c',
|
||||
body_md5: '6229bcc5fc1396445d781daf3c12a285',
|
||||
params: {
|
||||
from: '"Hans BÄKOSchönland" <me@bogen.net>',
|
||||
from_email: 'me@bogen.net',
|
||||
|
@ -103,6 +103,7 @@ Test3:∋
|
|||
Test4:&
|
||||
Test5:=
|
||||
|
||||
|
||||
[1] http://localhost/8HMZENUS/2737??PS="
|
||||
},
|
||||
},
|
||||
|
@ -327,7 +328,7 @@ Hof
|
|||
# spam email
|
||||
{
|
||||
data: IO.read('test/fixtures/mail16.box'),
|
||||
body_md5: 'a2367adfa77857a078dad83826d659e8',
|
||||
body_md5: '5e96cc53e78c0e44523502ee50647808',
|
||||
params: {
|
||||
from: nil,
|
||||
from_email: 'vipyimin@126.com',
|
||||
|
@ -361,7 +362,7 @@ Hof
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail19.box'),
|
||||
body_md5: '3e42be74f967379a3053f21f4125ca66',
|
||||
body_md5: '0bf7e746158d121bce7e2c46b64b0d39',
|
||||
params: {
|
||||
from: '"我" <>',
|
||||
from_email: '"=?GB2312?B?ztI=?=" <>',
|
||||
|
@ -372,7 +373,7 @@ Hof
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail20.box'),
|
||||
body_md5: '65ca1367dfc26abcf49d30f68098f122',
|
||||
body_md5: '646e803f30cddf06db90f426df3672c1',
|
||||
params: {
|
||||
from: 'Health and Care-Mall <drugs-cheapest8@sicor.com>',
|
||||
from_email: 'drugs-cheapest8@sicor.com',
|
||||
|
@ -407,12 +408,13 @@ x1qJ>mC7f 512y1GA420lCQe09s9u%uksã ψ2X5A4g3nu←Τyst72pMh&scar
|
|||
Both hands through the fear in front.
|
||||
Wade to give it seemed like this. Yeah but one for any longer. Everything you going inside the kids.
|
||||
|
||||
|
||||
[1] http://pxmzcgy.storeprescription.ru?zz=fkxffti"
|
||||
},
|
||||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail21.box'),
|
||||
body_md5: 'f909a17fde261099903f3236f8755249',
|
||||
body_md5: '617017ee0b2d1842f410fceaac696230',
|
||||
params: {
|
||||
from: 'Viagra Super Force Online <pharmacy_affordable1@ertelecom.ru>',
|
||||
from_email: 'pharmacy_affordable1@ertelecom.ru',
|
||||
|
@ -423,7 +425,7 @@ Wade to give it seemed like this. Yeah but one for any longer. Everything you go
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail22.box'),
|
||||
body_md5: '9e79cb133d52afe9e18e8438df539305',
|
||||
body_md5: '7dd64b40dce1aa3053fc7bbdea136612',
|
||||
params: {
|
||||
from: 'Gilbertina Suthar <ireoniqla@lipetsk.ru>',
|
||||
from_email: 'ireoniqla@lipetsk.ru',
|
||||
|
@ -434,7 +436,8 @@ Wade to give it seemed like this. Yeah but one for any longer. Everything you go
|
|||
Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded.
|
||||
Freemont and they talked with beppe.
|
||||
Thinking of bed and whenever adam.
|
||||
Mike was too tired man to hear.I10PQSHEJl2Nwf˜2113S173 Î1mEbb5N371LϖC7AlFnR1♦HG64B242¦M2242zkΙN⌉7⌉TBNÐ T2xPIògI2ÃlL2ÕML⊥22SaΨRBreathed adam gave the master bedroom door.
|
||||
Mike was too tired man to hear.
|
||||
I10PQSHEJl2Nwf˜2113S173 Î1mEbb5N371LϖC7AlFnR1♦HG64B242¦M2242zkΙN⌉7⌉TBNÐ T2xPIògI2ÃlL2ÕML⊥22SaΨRBreathed adam gave the master bedroom door.
|
||||
Better get charlie took the wall.
|
||||
Charlotte clark smile he saw charlie.
|
||||
Dave and leaned her tears adam.
|
||||
|
@ -445,6 +448,7 @@ Men joined the pickup truck pulled away. Chuck could make sure that.[1] †
|
|||
Just then returned to believe it here.
|
||||
Freemont and pulling out several minutes.
|
||||
|
||||
|
||||
[1] http://аоск.рф?jmlfwnwe&ucwkiyyc",
|
||||
},
|
||||
|
||||
|
@ -571,14 +575,15 @@ gate GmbH * Gladbacher Str. 74 * 40219 Düsseldorf
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail29.box'),
|
||||
body_md5: 'b6cc8164ce896046d631ddd44f8c9f6e',
|
||||
body_md5: 'bd34701dd5246b7651f67aeea6dd0fd3',
|
||||
params: {
|
||||
from: 'Example Sales <sales@example.com>',
|
||||
from_email: 'sales@example.com',
|
||||
from_display_name: 'Example Sales',
|
||||
subject: 'Example licensing information: No channel available',
|
||||
to: 'info@znuny.inc',
|
||||
body: "Dear Mr. Edenhofer,We want to keep you updated on TeamViewer licensing shortages on a regular basis.
|
||||
body: "Dear Mr. Edenhofer,
|
||||
We want to keep you updated on TeamViewer licensing shortages on a regular basis.
|
||||
We would like to inform you that since the last message on 25-Nov-2014 there have been temporary session channel exceedances which make it impossible to establish more sessions. Since the last e-mail this has occurred in a total of 1 cases.
|
||||
Additional session channels can be added at any time. Please visit our [1] TeamViewer Online Shop for pricing information.
|
||||
Thank you - and again all the best with TeamViewer!
|
||||
|
@ -601,7 +606,7 @@ Registration AG Ulm HRB 534075 * General Manager Holger Felgner
|
|||
},
|
||||
{
|
||||
data: IO.read('test/fixtures/mail30.box'),
|
||||
body_md5: 'bba63e2dbe29e7b82d893c2554ff466a',
|
||||
body_md5: '23220f9537e59a8febc62705aa1c387c',
|
||||
params: {
|
||||
from: 'Manfred Haert <Manfred.Haert@example.com>',
|
||||
from_email: 'Manfred.Haert@example.com',
|
||||
|
@ -634,6 +639,7 @@ JETZT AUCH BEI FACEBOOK !
|
|||
[3] https://www.facebook.com/test
|
||||
___________________________________
|
||||
Test Somewhere GmbH
|
||||
|
||||
Diesee-Mail ist ausschließlich für den beabsichtigten Empfängerbestimmt. Sollten Sie irrtümlich diese e-Mail erhaltenhaben, unterrichten Sie uns bitte umgehend unter[4] kontakt@example.com und vernichten Sie diese Mitteilungeinschließlich der ggf. beigefügten Dateien.
|
||||
Weil wir die Echtheit oder Vollständigkeit der in dieserNachricht enthaltenen Informationen nicht garantierenkönnen, bitten wir um Verständnis, dass wir zu Ihrem undunserem Schutz die rechtliche Verbindlichkeit dervorstehenden Erklärungen ausschließen, soweit wir mitIhnen keine anders lautenden Vereinbarungen getroffenhaben.
|
||||
|
||||
|
|
|
@ -170,7 +170,8 @@ Homegrown dandelions by herself into her lips. Such an excuse to stop thinking a
|
|||
|
||||
___
|
||||
|
||||
[2] Это сообщение свободно от вирусов и вредоносного ПО благодаря [3] avast! Antivirus защита активна.
|
||||
[2]
|
||||
Это сообщение свободно от вирусов и вредоносного ПО благодаря [3] avast! Antivirus защита активна.
|
||||
|
||||
|
||||
[1] http://piufup.medicatingsafemart.ru
|
||||
|
@ -195,7 +196,8 @@ ___
|
|||
Continued adam helped charlie cried. Soon joined the master bathroom. Grinned adam rubbed his arms she nodded.
|
||||
Freemont and they talked with beppe.
|
||||
Thinking of bed and whenever adam.
|
||||
Mike was too tired man to hear.I10PQSHEJl2Nwf˜2113S173 Î1mEbb5N371LϖC7AlFnR1♦HG64B242¦M2242zkΙN⌉7⌉TBNÐ T2xPIògI2ÃlL2ÕML⊥22SaΨRBreathed adam gave the master bedroom door.
|
||||
Mike was too tired man to hear.
|
||||
I10PQSHEJl2Nwf˜2113S173 Î1mEbb5N371LϖC7AlFnR1♦HG64B242¦M2242zkΙN⌉7⌉TBNÐ T2xPIògI2ÃlL2ÕML⊥22SaΨRBreathed adam gave the master bedroom door.
|
||||
Better get charlie took the wall.
|
||||
Charlotte clark smile he saw charlie.
|
||||
Dave and leaned her tears adam.
|
||||
|
@ -206,6 +208,7 @@ Men joined the pickup truck pulled away. Chuck could make sure that.[1] †
|
|||
Just then returned to believe it here.
|
||||
Freemont and pulling out several minutes.
|
||||
|
||||
|
||||
[1] http://аоск.рф?jmlfwnwe&ucwkiyyc",
|
||||
sender: 'Customer',
|
||||
type: 'email',
|
||||
|
|
Loading…
Reference in a new issue