Improved html2html_strict (allow hyperlinks, but clean them first).
This commit is contained in:
parent
f170a8a870
commit
249fba71c0
4 changed files with 153 additions and 46 deletions
|
@ -95,41 +95,55 @@ class String
|
|||
link_list = ''
|
||||
counter = 0
|
||||
if !string_only
|
||||
string.gsub!(/<a\s.*?href=("|')(.+?)("|').*?>/ix) {
|
||||
string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) {
|
||||
link = $2
|
||||
counter = counter + 1
|
||||
link_list += "[#{counter}] #{link}\n"
|
||||
"[#{counter}] "
|
||||
}
|
||||
else
|
||||
string.gsub!(%r{<a\s+href=("|')(.+?)("|')(\s*|\s+[^>]*)>(.+?)<\s*/a\s*>}mxi) {|_placeholder|
|
||||
link = $2
|
||||
if !link.empty?
|
||||
string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) {|_placeholder|
|
||||
link = $3
|
||||
text = $6
|
||||
text.gsub!(/\<.+?\>/, '')
|
||||
|
||||
link_compare = link.dup
|
||||
if !link_compare.empty?
|
||||
link.strip!
|
||||
link_compare.strip!
|
||||
link_compare.downcase!
|
||||
link_compare.sub!(%r{/$}, '')
|
||||
end
|
||||
text = $5
|
||||
if !text.empty?
|
||||
text_compare = text.dup
|
||||
if !text_compare.empty?
|
||||
text.strip!
|
||||
text_compare.strip!
|
||||
text_compare.downcase!
|
||||
text_compare.sub!(%r{/$}, '')
|
||||
end
|
||||
placeholder = if !link.empty? && text.empty?
|
||||
placeholder = if !link_compare.empty? && text_compare.empty?
|
||||
link
|
||||
elsif link.empty? && !text.empty?
|
||||
elsif link_compare.empty? && !text_compare.empty?
|
||||
text
|
||||
elsif !link.empty? && !text.empty? && (link.downcase == text.downcase || link.downcase == "mailto:#{text}".downcase || link.downcase == "http://#{text}".downcase)
|
||||
elsif link_compare && link_compare =~ /^mailto/i
|
||||
text
|
||||
elsif !link_compare.empty? && !text_compare.empty? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
|
||||
"######LINKEXT:#{link}/TEXT:#{text}######"
|
||||
elsif text !~ /^http/
|
||||
"#{text} (######LINKRAW:#{link}######)"
|
||||
else
|
||||
"#{text} (#{link})"
|
||||
"#{link} (######LINKRAW:#{text}######)"
|
||||
end
|
||||
}
|
||||
end
|
||||
|
||||
# remove style tags with content
|
||||
string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '')
|
||||
string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')
|
||||
|
||||
# remove empty lines
|
||||
string.gsub!(/^[[:space:]]*/m, '')
|
||||
if strict
|
||||
string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######')
|
||||
string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
|
||||
end
|
||||
|
||||
# pre/code handling 1/2
|
||||
|
@ -164,10 +178,10 @@ class String
|
|||
string.gsub!(%r{</h\d>}i, "\n")
|
||||
|
||||
# add new lines
|
||||
string.gsub!(%r{</div><div(|\s.+?)>}im, "\n")
|
||||
string.gsub!(%r{</p><p(|\s.+?)>}im, "\n")
|
||||
string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
|
||||
string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
|
||||
string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
|
||||
string.gsub!(%r{</(p|br|div)(|\s.+?)>}i, "\n")
|
||||
string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
|
||||
string.gsub!(%r{</td>}i, ' ')
|
||||
|
||||
# strip all other tags
|
||||
|
@ -176,6 +190,23 @@ class String
|
|||
# replace multiple spaces with one
|
||||
string.gsub!(/ /, ' ')
|
||||
|
||||
# add hyperlinks
|
||||
if strict
|
||||
string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) {|_placeholder|
|
||||
pre = $1
|
||||
content = $2
|
||||
post = $5
|
||||
if content =~ /^www/i
|
||||
content = "http://#{content}"
|
||||
end
|
||||
placeholder = if content =~ /^(http|https|ftp|tel)/i
|
||||
"#{pre}######LINKRAW:#{content}#######{post}"
|
||||
else
|
||||
"#{pre}#{content}#{post}"
|
||||
end
|
||||
}
|
||||
end
|
||||
|
||||
# try HTMLEntities; if it fails on invalid characters, fall back to the manual way
|
||||
begin
|
||||
coder = HTMLEntities.new
|
||||
|
@ -259,6 +290,8 @@ class String
|
|||
# Returns a cleaned-up HTML rendering of the receiver: the string is first
# reduced to text via html2text, converted back with text2html, and then the
# ######...###### placeholder markers are expanded into real tags.
#
# NOTE(review): the generic marker expansion at the end rewrites *any*
# ######x###### sequence to <x>, and the link expansions copy the captured
# URL into href as-is. Whether marker-like text or unsafe URL schemes coming
# from the input are neutralized beforehand cannot be seen from this block —
# confirm against html2text / text2html.
def html2html_strict
|
||||
# presumably the two flags are string_only and strict — TODO confirm against the html2text signature
string = html2text(true, true)
|
||||
string = string.text2html
|
||||
# link whose text matched its target: keep the original text as the anchor label
string.gsub!(%r{######LINKEXT:(.+?)/TEXT:(.+?)######}, '<a href="\1" target="_blank">\2</a>')
|
||||
# bare URL marker: the URL is used both as href and as the visible label
string.gsub!(/######LINKRAW:(.+?)######/, '<a href="\1" target="_blank">\1</a>')
|
||||
# generic marker -> tag; must run after the two link substitutions above,
# because this pattern would also match the LINKEXT/LINKRAW markers
string.gsub!(/######(.+?)######/, '<\1>')
|
||||
# chomp returns a new string without the trailing line break
string.chomp
|
||||
end
|
||||
|
|
|
@ -450,8 +450,81 @@ Men-----------------------'
|
|||
result = '<h3>test</h3>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<b\n>test</b>"
|
||||
result = '<b>test</b>'
|
||||
html = '<a href="http://example.com">http://example.com</a>'
|
||||
result = '<a href="http://example.com" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<A href="http://example.com?a=1;">http://example.com?a=1;</A>'
|
||||
result = '<a href="http://example.com?a=1;" target="_blank">http://example.com?a=1;</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="http://web.de">web.de</a>'
|
||||
result = '<a href="http://web.de" target="_blank">web.de</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a id="123" href="http://web.de">web.de</a>'
|
||||
result = '<a href="http://web.de" target="_blank">web.de</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<br>https://www.facebook.com/test<br>'
|
||||
result = '<a href="https://www.facebook.com/test" target="_blank">https://www.facebook.com/test</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = 'some text http://example.com some other text'
|
||||
result = 'some text <a href="http://example.com" target="_blank">http://example.com</a> some other text'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = 'some text www.example.com some other text'
|
||||
result = 'some text <a href="http://www.example.com" target="_blank">http://www.example.com</a> some other text'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="http://example.com">http://what-different.example.com</a>'
|
||||
result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
|
||||
result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="http://example.com">http://EXAMPLE.com</a>'
|
||||
result = '<a href="http://example.com" target="_blank">http://EXAMPLE.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="http://example.com" class="abc">http://example.com</a>'
|
||||
result = '<a href="http://example.com" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="http://example.com/" class="abc">http://example.com</a>'
|
||||
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<a href=\"http://example.com/\n\" class=\"abc\">http://example.com</a>"
|
||||
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<a href=\"http://example.com/\n \" class=\"abc\n\"\n>http://example.com</a>"
|
||||
result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<div>http://example.com</div>"
|
||||
result = '<a href="http://example.com" target="_blank">http://example.com</a>'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<div>http://example.com.</div>"
|
||||
result = '<a href="http://example.com" target="_blank">http://example.com</a>.'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<div>http://example.com, and so on</div>"
|
||||
result = '<a href="http://example.com" target="_blank">http://example.com</a>, and so on'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<div>http://example.com?lala=me, and so on</div>"
|
||||
result = '<a href="http://example.com?lala=me" target="_blank">http://example.com?lala=me</a>, and so on'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "<a href=\"http://facebook.de/examplesrbog\"><span lang=\"EN-US\" style='color:blue'>http://facebook.de/examplesrbog</span></a>"
|
||||
result = "<a href=\"http://facebook.de/examplesrbog\" target=\"_blank\">http://facebook.de/examplesrbog</a>"
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den <a href=\"http://newsletters.cylex.de/\" class=\"\">Link des Adventkalenders</a> in<br class=\"\"> Ihrer Lesezeichen-Symbolleiste zu ergänzen.</p><div class=\"\"> "
|
||||
result = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den Link des Adventkalenders (<a href=\"http://newsletters.cylex.de/\" target=\"_blank\">http://newsletters.cylex.de/</a>) in<br> Ihrer Lesezeichen-Symbolleiste zu ergänzen."
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<b >test</b>'
|
||||
|
@ -495,7 +568,8 @@ Men-----------------------'
|
|||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
html = '<a href="mailto:john.smith2@example.com" style="color: blue; text-decoration: underline; ">john.smith@example.com</a>'
|
||||
result = 'john.smith@example.com (mailto:john.smith2@example.com)'
|
||||
#result = 'john.smith@example.com (mailto:john.smith2@example.com)'
|
||||
result = 'john.smith@example.com'
|
||||
assert_equal(result, html.html2html_strict)
|
||||
|
||||
end
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue