Improved html2text.

This commit is contained in:
Martin Edenhofer 2016-06-22 14:55:23 +02:00
parent 2824280a30
commit e9eae11262
4 changed files with 43 additions and 33 deletions

View file

@ -124,7 +124,7 @@ class String
string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '')
# remove empty lines
string.gsub!(/^\s*/m, '')
string.gsub!(/^[[:space:]]*/m, '')
if strict
string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######')
end
@ -138,7 +138,7 @@ class String
}
# insert spaces on [A-z]\n[A-z]
string.gsub!(/([A-z])\n([A-z])/m, '\1 \2')
string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2')
# remove all new lines
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
@ -219,18 +219,16 @@ class String
end
# remove trailing empty spaces
string.gsub!(/\s+\n$/, "\n")
# remove multiple empty lines
string.gsub!(/\n\n\n/, "\n\n")
string.strip!
string.gsub!(/[[:blank:]]+$/, '')
# add extracted links
if link_list != ''
string += "\n\n\n" + link_list
end
# collapse triple newlines into double newlines
string.gsub!(/\n\n\n/, "\n\n")
string.strip
end

View file

@ -93,6 +93,18 @@ class AaaStringTest < ActiveSupport::TestCase
result = 'test'
assert_equal(result, html.html2text)
html = "<div>test<br><br> <br> \n<br> \n<br> \n</div>"
result = 'test'
assert_equal(result, html.html2text)
html = "<div>test<br><br>&nbsp;<br>&nbsp;\n<br>&nbsp;\n<br>&nbsp;\n</div>"
result = 'test'
assert_equal(result, html.html2text)
html = "<div>test<br><br>&nbsp;<br>&nbsp;\n<br>&nbsp;\n<br>&nbsp;\n</div>&nbsp;"
result = 'test'
assert_equal(result, html.html2text)
html = "<pre>test\n\ntest</pre>"
result = "test\ntest"
assert_equal(result, html.html2text)
@ -102,7 +114,7 @@ class AaaStringTest < ActiveSupport::TestCase
assert_equal(result, html.html2text)
html = '<table><tr><td>test</td><td>col</td></td></tr><tr><td>test</td><td>4711</td></tr></table>'
result = "test col \ntest 4711"
result = "test col\ntest 4711"
assert_equal(result, html.html2text)
html = "<p><span>Was\nsoll verbessert werden:</span></p>"
@ -299,7 +311,7 @@ some text later'
result = 'some head
some content
> line 1
>
>
> line 2
some text later'
assert_equal(result, html.html2text)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long