Improved performance of the HTML utils (do the replacements in a copy of the string, then write it back once at the end); added Word markup removal.

This commit is contained in:
Martin Edenhofer 2015-08-05 11:40:50 +02:00
parent 1741bc1bd1
commit 9643e2f214
2 changed files with 102 additions and 72 deletions

View file

@ -100,47 +100,62 @@ class App.Utils
# textWithoutTags = App.Utils.htmlRemoveTags( html ) # textWithoutTags = App.Utils.htmlRemoveTags( html )
@htmlRemoveTags: (html) -> @htmlRemoveTags: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments
@_removeComments( htmlTmp )
# remove Word markup
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content # remove tags, keep content
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( -> htmlTmp.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
$(@).contents() $(@).contents()
) )
# remove tags & content # remove tags & content
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, br, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove() htmlTmp.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, br, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html html.html(htmlTmp)
# htmlOnlyWithRichtext = App.Utils.htmlRemoveRichtext( html ) # htmlOnlyWithRichtext = App.Utils.htmlRemoveRichtext( html )
@htmlRemoveRichtext: (html) -> @htmlRemoveRichtext: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments # remove comments
@_removeComments( html ) @_removeComments( htmlTmp )
# remove style and class # remove style and class
@_removeAttributes( html ) @_removeAttributes( htmlTmp )
# remove Word markup
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content # remove tags, keep content
html.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( -> htmlTmp.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
$(@).contents() $(@).contents()
) )
# remove tags & content # remove tags & content
html.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove() htmlTmp.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html html.html(htmlTmp)
# cleanHtmlWithRichText = App.Utils.htmlCleanup( html ) # cleanHtmlWithRichText = App.Utils.htmlCleanup( html )
@htmlCleanup: (html) -> @htmlCleanup: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments # remove comments
@_removeComments( html ) @_removeComments( htmlTmp )
# remove style and class # remove style and class
@_removeAttributes( html ) @_removeAttributes( htmlTmp )
# remove Word markup
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content # remove tags, keep content
html.find('a, font, small, time').replaceWith( -> htmlTmp.find('a, font, small, time').replaceWith( ->
$(@).contents() $(@).contents()
) )
@ -148,31 +163,32 @@ class App.Utils
# New type of the tag # New type of the tag
replacementTag = 'div'; replacementTag = 'div';
# Replace all a tags with the type of replacementTag # Replace all x tags with the type of replacementTag
html.find('h1, h2, h3, h4, h5, h6, textarea').each( -> htmlTmp.find('h1, h2, h3, h4, h5, h6, textarea').each( ->
outer = this.outerHTML; outer = this.outerHTML;
# Replace opening tag # Replace opening tag
regex = new RegExp('<' + this.tagName, 'i'); regex = new RegExp('<' + this.tagName, 'i')
newTag = outer.replace(regex, '<' + replacementTag); newTag = outer.replace(regex, '<' + replacementTag)
# Replace closing tag # Replace closing tag
regex = new RegExp('</' + this.tagName, 'i'); regex = new RegExp('</' + this.tagName, 'i')
newTag = newTag.replace(regex, '</' + replacementTag); newTag = newTag.replace(regex, '</' + replacementTag)
$(@).replaceWith(newTag); $(@).replaceWith(newTag)
) )
# remove tags & content # remove tags & content
html.find('form, font, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove() htmlTmp.find('form, font, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html html.html(htmlTmp)
@_removeAttributes: (html) -> @_removeAttributes: (html) ->
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, h1, h2, h3, h4, h5, h6') html.find('*')
.removeAttr( 'style' ) .removeAttr( 'style' )
.removeAttr( 'class' ) .removeAttr( 'class' )
.removeAttr( 'title' ) .removeAttr( 'title' )
.removeAttr( 'lang' )
html html
@_removeComments: (html) -> @_removeComments: (html) ->
@ -182,6 +198,14 @@ class App.Utils
) )
html html
# Strips Microsoft Word specific tags from the given markup while keeping
# the content between them.
#
# html    - jQuery object; only its first element is processed
#           (serialized via outerHTML).
#
# Removes opening and closing tags of the form <w:xxx> (three-letter name)
# and <o:x> (one-letter name), e.g. <w:sdt>…</w:sdt> and <o:p></o:p>.
# Tags that carry attributes or have other name lengths are left untouched.
#
# Returns a NEW jQuery object built from the cleaned markup; the object
# passed in is not modified.
@_removeWordMarkup: (html) ->
  markup = html.get(0).outerHTML

  # The global flag is required: without it String::replace only removes
  # the first matching tag and leaves every later Word tag (including the
  # matching closing tags) in the markup.
  markup = markup.replace(/<\/?w:[A-Za-z]{3}>/g, '')
  markup = markup.replace(/<\/?o:[A-Za-z]{1}>/g, '')

  $(markup)
# signatureNeeded = App.Utils.signatureCheck( message, signature ) # signatureNeeded = App.Utils.signatureCheck( message, signature )
@signatureCheck: (message, signature) -> @signatureCheck: (message, signature) ->
messageText = $( '<div>' + message + '</div>' ).text().trim() messageText = $( '<div>' + message + '</div>' ).text().trim()

View file

@ -212,43 +212,47 @@ test( "htmlEscape", function() {
test( "htmlRemoveTags", function() { test( "htmlRemoveTags", function() {
var source = "<div>test</div>" var source = "<div>test</div>"
var should = "test" var should = "<div>test</div>"
var result = App.Utils.htmlRemoveTags( $(source) ) var result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div>test<!-- some comment --></div>"
should = "<div>test</div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>" source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere" should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) ) result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a></div>" source = "<div><a href=\"some_link\">some link to somewhere</a></div>"
should = "some link to somewhere" should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) ) result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a><input value=\"should not be shown\"></div>" source = "<div><a href=\"some_link\">some link to somewhere</a><input value=\"should not be shown\"></div>"
should = "some link to somewhere" should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) ) result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a> <div><hr></div> <span>123</span> <img src=\"some_image\"/></div>" source = "<div><a href=\"some_link\">some link to somewhere</a> <div><hr></div> <span>123</span> <img src=\"some_image\"/></div>"
should = "some link to somewhere 123 " should = "<div>some link to somewhere 123 </div>"
result = App.Utils.htmlRemoveTags( $(source) ) result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form></div>" source = "<div><form class=\"xxx\">test 123</form></div>"
should = "test 123" should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><textarea class=\"xxx\">test 123</textarea></div>" source = "<div><textarea class=\"xxx\">test 123</textarea></div>"
should = "test 123" should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>" source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>" should = "<div>This is some text!</div>"
should = "This is some text!"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
}); });
@ -257,145 +261,147 @@ test( "htmlRemoveTags", function() {
test( "htmlRemoveRichtext", function() { test( "htmlRemoveRichtext", function() {
var source = "<div><!--test comment--><a href=\"test\">test</a></div>" var source = "<div><!--test comment--><a href=\"test\">test</a></div>"
var should = "test" var should = "<div>test</div>"
var result = App.Utils.htmlRemoveRichtext( $(source) ) var result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><!--[if !supportLists]--><span lang=\"DE\">1.1.1<span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span></span><!--[endif]--><span lang=\"DE\">Description</span></div>"
should = "<div><span>1.1.1<span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span></span><span>Description</span></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>" source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere" should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><a href=\"some_link\"></a> test </div>" source = "<div><a href=\"some_link\"></a> test </div>"
should = " test " should = "<div> test </div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><b></b> test </div>" source = "<div><b></b> test </div>"
should = " test " should = "<div> test </div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div><b></b> test </div></div>" source = "<div><div><b></b> test </div></div>"
should = "<div> test </div>" should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div><b></b> test <input value=\"should not be shown\"></div></div>" source = "<div><div><b></b> test <input value=\"should not be shown\"></div></div>"
should = "<div> test </div>" should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div><b></b> test </div><span>123</span></div>" source = "<div><div><b></b> test </div><span>123</span></div>"
should = "<div> test </div><span>123</span>" should = "<div><div> test </div><span>123</span></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><b></b> test </div></div>" source = "<div><div class=\"xxx\" title=\"some title\" lang=\"en\"><b></b> test </div></div>"
should = "<div> test </div>" should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><textarea class=\"xxx\"> test </textarea></div>" source = "<div><textarea class=\"xxx\"> test </textarea></div>"
//should = "<div> test </div>" should = "<div> test </div>"
should = " test "
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><br></div>" source = "<div><br></div>"
should = "<br>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><br></div>" should = "<div><br></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><div><br></div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form></div>" source = "<div><form class=\"xxx\">test 123</form></div>"
//should = "<div>test 123</div>" should = "<div>test 123</div>"
should = "test 123"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>" source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>" should = "<div>This is some text!</div>"
should = "This is some text!"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
}); });
// htmlCleanup // htmlCleanup
test( "htmlCleanup", function() { test( "htmlCleanup", function() {
var source = "<div><!--test comment--><a href=\"test\">test</a></div>" var source = "<div><!--test comment--><a href=\"test\">test</a></div>"
var should = "test" var should = "<div>test</div>"
var result = App.Utils.htmlCleanup( $(source) ) var result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>" source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere" //should = "some link to somewhere"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><h1>some link to somewhere</h1></a>"
should = "<div>some link to somewhere</div>" should = "<div>some link to somewhere</div>"
result = App.Utils.htmlCleanup( $(source) ) result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><h1>some link to somewhere</h1></div>"
should = "<div><div>some link to somewhere</div></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><small>some link to somewhere</small></a>" source = "<div><small>some link to somewhere</small></a>"
//should = "<div>some link to somewhere</div>" should = "<div>some link to somewhere</div>"
should = "some link to somewhere"
result = App.Utils.htmlCleanup( $(source) ) result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><time>some link to somewhere</time></a>" source = "<div><time>some link to somewhere</time></a>"
//should = "<div>some link to somewhere</div>" should = "<div>some link to somewhere</div>"
should = "some link to somewhere"
result = App.Utils.htmlCleanup( $(source) ) result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><h1>some link to somewhere</h1><p><hr></p></div>" source = "<div><h1>some h1 for somewhere</h1><p><hr></p></div>"
should = "<div>some link to somewhere</div><p></p><p></p>" should = "<div><div>some h1 for somewhere</div><p></p><p></p></div>"
result = App.Utils.htmlCleanup( $(source) ) result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><br></div>" source = "<div><br></div>"
should = "<br>" should = "<div><br></div>"
result = App.Utils.htmlCleanup( $(source) ) result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><br></div></div>" source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><br></div>" should = "<div><div><br></div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form></div>" source = "<div><form class=\"xxx\">test 123</form></div>"
//should = "<div>test 123<br></div>" //should = "<div>test 123<br></div>"
should = "test 123" should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form> some other value</div>" source = "<div><form class=\"xxx\">test 123</form> some other value</div>"
//should = "<div>ttest 123 some other value</div>" should = "<div>test 123 some other value</div>"
should = "test 123 some other value"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form> some other value<input value=\"should not be shown\"></div>" source = "<div><form class=\"xxx\">test 123</form> some other value<input value=\"should not be shown\"></div>"
//should = "<div>test 123 some other value</div>" should = "<div>test 123 some other value</div>"
should = "test 123 some other value"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>" source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>" should = "<div>This is some text!</div>"
should = "This is some text!"
result = App.Utils.htmlRemoveRichtext( $(source) ) result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source ) equal( result.html(), should, source )
source = "<div><p>some link to somewhere from word<w:sdt>abc</w:sdt></p><o:p></o:p></a>"
should = "<div><p>some link to somewhere from wordabc</p></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
}); });
// wrap // wrap