Improved html2html_strict (allow hyperlinks, but sanitize them first).

This commit is contained in:
Martin Edenhofer 2016-06-28 01:32:17 +02:00
parent f170a8a870
commit 249fba71c0
4 changed files with 153 additions and 46 deletions

View file

@ -95,41 +95,55 @@ class String
link_list = '' link_list = ''
counter = 0 counter = 0
if !string_only if !string_only
string.gsub!(/<a\s.*?href=("|')(.+?)("|').*?>/ix) { string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) {
link = $2 link = $2
counter = counter + 1 counter = counter + 1
link_list += "[#{counter}] #{link}\n" link_list += "[#{counter}] #{link}\n"
"[#{counter}] " "[#{counter}] "
} }
else else
string.gsub!(%r{<a\s+href=("|')(.+?)("|')(\s*|\s+[^>]*)>(.+?)<\s*/a\s*>}mxi) {|_placeholder| string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) {|_placeholder|
link = $2 link = $3
if !link.empty? text = $6
text.gsub!(/\<.+?\>/, '')
link_compare = link.dup
if !link_compare.empty?
link.strip! link.strip!
link_compare.strip!
link_compare.downcase!
link_compare.sub!(%r{/$}, '')
end end
text = $5 text_compare = text.dup
if !text.empty? if !text_compare.empty?
text.strip! text.strip!
text_compare.strip!
text_compare.downcase!
text_compare.sub!(%r{/$}, '')
end end
placeholder = if !link.empty? && text.empty? placeholder = if !link_compare.empty? && text_compare.empty?
link link
elsif link.empty? && !text.empty? elsif link_compare.empty? && !text_compare.empty?
text text
elsif !link.empty? && !text.empty? && (link.downcase == text.downcase || link.downcase == "mailto:#{text}".downcase || link.downcase == "http://#{text}".downcase) elsif link_compare && link_compare =~ /^mailto/i
text text
elsif !link_compare.empty? && !text_compare.empty? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
"######LINKEXT:#{link}/TEXT:#{text}######"
elsif text !~ /^http/
"#{text} (######LINKRAW:#{link}######)"
else else
"#{text} (#{link})" "#{link} (######LINKRAW:#{text}######)"
end end
} }
end end
# remove style tags with content # remove style tags with content
string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '') string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')
# remove empty lines # remove empty lines
string.gsub!(/^[[:space:]]*/m, '') string.gsub!(/^[[:space:]]*/m, '')
if strict if strict
string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######') string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
end end
# pre/code handling 1/2 # pre/code handling 1/2
@ -164,10 +178,10 @@ class String
string.gsub!(%r{</h\d>}i, "\n") string.gsub!(%r{</h\d>}i, "\n")
# add new lines # add new lines
string.gsub!(%r{</div><div(|\s.+?)>}im, "\n") string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
string.gsub!(%r{</p><p(|\s.+?)>}im, "\n") string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n") string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
string.gsub!(%r{</(p|br|div)(|\s.+?)>}i, "\n") string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
string.gsub!(%r{</td>}i, ' ') string.gsub!(%r{</td>}i, ' ')
# strip all other tags # strip all other tags
@ -176,6 +190,23 @@ class String
# replace multiple spaces with one # replace multiple spaces with one
string.gsub!(/ /, ' ') string.gsub!(/ /, ' ')
# add hyperlinks
if strict
string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) {|_placeholder|
pre = $1
content = $2
post = $5
if content =~ /^www/i
content = "http://#{content}"
end
placeholder = if content =~ /^(http|https|ftp|tel)/i
"#{pre}######LINKRAW:#{content}#######{post}"
else
"#{pre}#{content}#{post}"
end
}
end
# try HTMLEntities; if it fails on invalid characters, fall back to the manual way # try HTMLEntities; if it fails on invalid characters, fall back to the manual way
begin begin
coder = HTMLEntities.new coder = HTMLEntities.new
@ -259,6 +290,8 @@ class String
def html2html_strict def html2html_strict
string = html2text(true, true) string = html2text(true, true)
string = string.text2html string = string.text2html
string.gsub!(%r{######LINKEXT:(.+?)/TEXT:(.+?)######}, '<a href="\1" target="_blank">\2</a>')
string.gsub!(/######LINKRAW:(.+?)######/, '<a href="\1" target="_blank">\1</a>')
string.gsub!(/######(.+?)######/, '<\1>') string.gsub!(/######(.+?)######/, '<\1>')
string.chomp string.chomp
end end

View file

@ -450,8 +450,81 @@ Men-----------------------'
result = '<h3>test</h3>' result = '<h3>test</h3>'
assert_equal(result, html.html2html_strict) assert_equal(result, html.html2html_strict)
html = "<b\n>test</b>" html = '<a href="http://example.com">http://example.com</a>'
result = '<b>test</b>' result = '<a href="http://example.com" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = '<A href="http://example.com?a=1;">http://example.com?a=1;</A>'
result = '<a href="http://example.com?a=1;" target="_blank">http://example.com?a=1;</a>'
assert_equal(result, html.html2html_strict)
html = '<a href="http://web.de">web.de</a>'
result = '<a href="http://web.de" target="_blank">web.de</a>'
assert_equal(result, html.html2html_strict)
html = '<a id="123" href="http://web.de">web.de</a>'
result = '<a href="http://web.de" target="_blank">web.de</a>'
assert_equal(result, html.html2html_strict)
html = '<br>https://www.facebook.com/test<br>'
result = '<a href="https://www.facebook.com/test" target="_blank">https://www.facebook.com/test</a>'
assert_equal(result, html.html2html_strict)
html = 'some text http://example.com some other text'
result = 'some text <a href="http://example.com" target="_blank">http://example.com</a> some other text'
assert_equal(result, html.html2html_strict)
html = 'some text www.example.com some other text'
result = 'some text <a href="http://www.example.com" target="_blank">http://www.example.com</a> some other text'
assert_equal(result, html.html2html_strict)
html = '<a href="http://example.com">http://what-different.example.com</a>'
result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
assert_equal(result, html.html2html_strict)
html = '<a href="http://example.com">http://EXAMPLE.com</a>'
result = '<a href="http://example.com" target="_blank">http://EXAMPLE.com</a>'
assert_equal(result, html.html2html_strict)
html = '<a href="http://example.com" class="abc">http://example.com</a>'
result = '<a href="http://example.com" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = '<a href="http://example.com/" class="abc">http://example.com</a>'
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = "<a href=\"http://example.com/\n\" class=\"abc\">http://example.com</a>"
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = "<a href=\"http://example.com/\n \" class=\"abc\n\"\n>http://example.com</a>"
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = "<div>http://example.com</div>"
result = '<a href="http://example.com" target="_blank">http://example.com</a>'
assert_equal(result, html.html2html_strict)
html = "<div>http://example.com.</div>"
result = '<a href="http://example.com" target="_blank">http://example.com</a>.'
assert_equal(result, html.html2html_strict)
html = "<div>http://example.com, and so on</div>"
result = '<a href="http://example.com" target="_blank">http://example.com</a>, and so on'
assert_equal(result, html.html2html_strict)
html = "<div>http://example.com?lala=me, and so on</div>"
result = '<a href="http://example.com?lala=me" target="_blank">http://example.com?lala=me</a>, and so on'
assert_equal(result, html.html2html_strict)
html = "<a href=\"http://facebook.de/examplesrbog\"><span lang=\"EN-US\" style='color:blue'>http://facebook.de/examplesrbog</span></a>"
result = "<a href=\"http://facebook.de/examplesrbog\" target=\"_blank\">http://facebook.de/examplesrbog</a>"
assert_equal(result, html.html2html_strict)
html = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den <a href=\"http://newsletters.cylex.de/\" class=\"\">Link des Adventkalenders</a> in<br class=\"\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Ihrer Lesezeichen-Symbolleiste zu ergänzen.</p><div class=\"\">&nbsp;"
result = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den Link des Adventkalenders (<a href=\"http://newsletters.cylex.de/\" target=\"_blank\">http://newsletters.cylex.de/</a>) in<br>      Ihrer Lesezeichen-Symbolleiste zu ergänzen."
assert_equal(result, html.html2html_strict) assert_equal(result, html.html2html_strict)
html = '<b >test</b>' html = '<b >test</b>'
@ -495,7 +568,8 @@ Men-----------------------'
assert_equal(result, html.html2html_strict) assert_equal(result, html.html2html_strict)
html = '<a href="mailto:john.smith2@example.com" style="color: blue; text-decoration: underline; ">john.smith@example.com</a>' html = '<a href="mailto:john.smith2@example.com" style="color: blue; text-decoration: underline; ">john.smith@example.com</a>'
result = 'john.smith@example.com (mailto:john.smith2@example.com)' #result = 'john.smith@example.com (mailto:john.smith2@example.com)'
result = 'john.smith@example.com'
assert_equal(result, html.html2html_strict) assert_equal(result, html.html2html_strict)
end end

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long