Improved html2text to remove html comments.

This commit is contained in:
Martin Edenhofer 2016-06-22 16:39:20 +02:00
parent a881e21f10
commit 1c1398d60c
2 changed files with 16 additions and 0 deletions

View file

@ -88,6 +88,9 @@ class String
string = string.chars.select(&:valid_encoding?).join
end
# Remove HTML comments. The /m flag lets `.` match newlines so comments that
# span several lines are stripped too. The lazy `.*?` (rather than `.+?`)
# also matches an empty comment (`<!---->`) on its own; with `.+?` an empty
# comment would not match by itself and the pattern would instead run on to
# the next `-->`, deleting any real text in between.
string.gsub!(/<!--.*?-->/m, '')
# find <a href=....> and replace it with [x]
link_list = ''
counter = 0

View file

@ -141,6 +141,19 @@ class AaaStringTest < ActiveSupport::TestCase
result = "test\n\n___"
assert_equal(result, html.html2text)
html = "Ihr RZ-Team<br />
<br />
<!--[if gte mso 9]><xml> <o:DocumentProperties> <o:Author>test</o:Author> =
<o:Template>A75DB76E.dotm</o:Template> <o:LastAuthor>test</o:LastAuthor> =
<o:Revision>5</o:Revision> <o:Created>2011-05-18T07:08:00Z</o:Created> <=
o:LastSaved>2011-07-04T17:59:00Z</o:LastSaved> <o:Pages>1</o:Pages> <o:Wo=
rds>189</o:Words> <o:Characters>1192</o:Characters> <o:Lines>9</o:Lines> =
<o:Paragraphs>2</o:Paragraphs> <o:CharactersWithSpaces>1379</o:Characters=
WithSpaces> <o:Version>11.5606</o:Version> </o:DocumentProperties></xml><!=
[endif]-->"
result = 'Ihr RZ-Team'
assert_equal(result, html.html2text)
html = ' line&nbsp;1<br>
you<br/>
-----&amp;'