Fixed issue #951 - Auto detection of www urls fails.

2017-04-13 14:47:12 +02:00 · 2017-04-13 14:47:12 +02:00 · 5868fdd19f
commit 5868fdd19f
parent fb1d20c5e4
3 changed files with 34 additions and 3 deletions
--- a/lib/html_sanitizer.rb
+++ b/lib/html_sanitizer.rb
@@ -158,8 +158,15 @@ satinize html string based on whiltelist
      if node && node.name != 'a' && node.parent && node.parent.name != 'a' && (!node.parent.parent || node.parent.parent.name != 'a')
        if node.class == Nokogiri::XML::Text
          urls = []
-          node.content.scan(%r{((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)}mxi).each { |match|
-            urls.push match[0]
+          node.content.scan(%r{((http|https|ftp|tel)://.+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)}mxi).each { |match|
+            if match[0]
+              urls.push match[0].to_s.strip
+            end
+          }
+          node.content.scan(/(^|:|;|\s)(www\..+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)/mxi).each { |match|
+            if match[1]
+              urls.push match[1].to_s.strip
+            end
          }
          next if urls.empty?
          add_link(node.content, urls, node)
--- a/test/unit/aaa_string_test.rb
+++ b/test/unit/aaa_string_test.rb
@@ -570,6 +570,30 @@ Men-----------------------'
    result = 'some text <a href="http://www.example.com" rel="nofollow" target="_blank">http://www.example.com</a> some other text'
    assert_equal(result, html.html2html_strict)

+    html   = 'some textwwwsome other text'
+    result = 'some textwwwsome other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'some text wwwsome other text'
+    result = 'some text wwwsome other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'some text www.some.dom other text'
+    result = 'some text <a href="http://www.some.dom" rel="nofollow" target="_blank">http://www.some.dom</a> other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'www.some.dom other text'
+    result = '<a href="http://www.some.dom" rel="nofollow" target="_blank">http://www.some.dom</a> other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'www.some.dom'
+    result = '<a href="http://www.some.dom" rel="nofollow" target="_blank">http://www.some.dom</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'web:www.some.dom other text'
+    result = 'web:<a href="http://www.some.dom" rel="nofollow" target="_blank">http://www.some.dom</a> other text'
+    assert_equal(result, html.html2html_strict)
+
    html   = '<a href="http://example.com">http://what-different.example.com</a>'
    #result = 'http://example.com (<a href="http://what-different.example.com" rel="nofollow" target="_blank">http://what-different.example.com</a>)'
    result = '<a href="http://what-different.example.com" rel="nofollow" target="_blank">http://what-different.example.com</a> (<a href="http://example.com" rel="nofollow" target="_blank">http://example.com</a>)'
--- a/test/unit/email_parser_test.rb
+++ b/test/unit/email_parser_test.rb
@@ -552,7 +552,7 @@ Newsletter abbestellen (<a href="http://newsletters.cylex.de/ref/www.cylex.de/si
      },
      {
        data: IO.binread('test/fixtures/mail19.box'),
-        body_md5: '4355c52fdfd2adea0cda6814adb78ae3',
+        body_md5: '40bf3f7f830c6ba7947deb9a2acfc5bc',
        params: {
          from: '"我" <>',
          from_email: '"我" <>',