Improved html2text.

This commit is contained in:
Martin Edenhofer 2016-06-22 14:55:23 +02:00
parent 2824280a30
commit e9eae11262
4 changed files with 43 additions and 33 deletions

View file

@ -124,7 +124,7 @@ class String
string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '') string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '')
# remove empty lines # remove empty lines
string.gsub!(/^\s*/m, '') string.gsub!(/^[[:space:]]*/m, '')
if strict if strict
string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######') string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######')
end end
@ -138,7 +138,7 @@ class String
} }
# insert spaces on [A-z]\n[A-z] # insert spaces on [A-z]\n[A-z]
string.gsub!(/([A-z])\n([A-z])/m, '\1 \2') string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2')
# remove all new lines # remove all new lines
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '') string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
@ -219,18 +219,16 @@ class String
end end
# remove trailing empty spaces # remove trailing empty spaces
string.gsub!(/\s+\n$/, "\n") string.gsub!(/[[:blank:]]+$/, '')
# remove multiple empty lines
string.gsub!(/\n\n\n/, "\n\n")
string.strip!
# add extracted links # add extracted links
if link_list != '' if link_list != ''
string += "\n\n\n" + link_list string += "\n\n\n" + link_list
end end
# collapse three consecutive newlines into two
string.gsub!(/\n\n\n/, "\n\n")
string.strip string.strip
end end

View file

@ -93,6 +93,18 @@ class AaaStringTest < ActiveSupport::TestCase
result = 'test' result = 'test'
assert_equal(result, html.html2text) assert_equal(result, html.html2text)
html = "<div>test<br><br> <br> \n<br> \n<br> \n</div>"
result = 'test'
assert_equal(result, html.html2text)
html = "<div>test<br><br>&nbsp;<br>&nbsp;\n<br>&nbsp;\n<br>&nbsp;\n</div>"
result = 'test'
assert_equal(result, html.html2text)
html = "<div>test<br><br>&nbsp;<br>&nbsp;\n<br>&nbsp;\n<br>&nbsp;\n</div>&nbsp;"
result = 'test'
assert_equal(result, html.html2text)
html = "<pre>test\n\ntest</pre>" html = "<pre>test\n\ntest</pre>"
result = "test\ntest" result = "test\ntest"
assert_equal(result, html.html2text) assert_equal(result, html.html2text)

File diff suppressed because one or more lines are too long