Improved performance of the HTML utils (do the replacements in a copy of the string and write it back once at the end); added removal of Word markup.

This commit is contained in:
Martin Edenhofer 2015-08-05 11:40:50 +02:00
parent 1741bc1bd1
commit 9643e2f214
2 changed files with 102 additions and 72 deletions

View file

@ -100,47 +100,62 @@ class App.Utils
# textWithoutTags = App.Utils.htmlRemoveTags( html )
# Removes markup from `html`, which is used as a jQuery object (.html() and .find() are called on it).
# The work is done on `htmlTmp`, a copy of html.html() wrapped in a fresh <div>, and the
# result is written back with a single html.html( htmlTmp ) call at the end.
# NOTE(review): this span comes from a rendered diff; each line operating on `html` directly
# looks like the pre-change version of the `htmlTmp` line that follows it — confirm before
# treating both as live code.
@htmlRemoveTags: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments
@_removeComments( htmlTmp )
# remove Word markup (_removeWordMarkup returns a new object, hence the reassignment)
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
htmlTmp.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
$(@).contents()
)
# remove tags & content
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, br, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove()
htmlTmp.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, br, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html
# write the cleaned copy back into the caller's element in one step
html.html(htmlTmp)
# htmlOnlyWithRichtext = App.Utils.htmlRemoveRichtext( html )
# Removes rich-text markup from `html` (used as a jQuery object). Compared with the selector
# lists in htmlRemoveTags, the lists here do not contain div, span, p or br.
# The work is done on `htmlTmp`, a copy of html.html() wrapped in a fresh <div>, and the
# result is written back with a single html.html( htmlTmp ) call at the end.
# NOTE(review): this span comes from a rendered diff; each line operating on `html` directly
# looks like the pre-change version of the `htmlTmp` line that follows it — confirm.
@htmlRemoveRichtext: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments
@_removeComments( html )
@_removeComments( htmlTmp )
# remove style and class
@_removeAttributes( html )
@_removeAttributes( htmlTmp )
# remove Word markup (_removeWordMarkup returns a new object, hence the reassignment)
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content
html.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
htmlTmp.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6').replaceWith( ->
$(@).contents()
)
# remove tags & content
html.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove()
htmlTmp.find('li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, form, textarea, font, address, table, thead, tbody, tr, td, h1, h2, h3, h4, h5, h6, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html
# write the cleaned copy back into the caller's element in one step
html.html(htmlTmp)
# cleanHtmlWithRichText = App.Utils.htmlCleanup( html )
# Cleans up `html` (used as a jQuery object): comments, attributes and Word markup are removed,
# a/font/small/time are replaced by their contents, h1-h6 and textarea are re-tagged as
# <div>, and the elements in the final selector list are removed.
# The work is done on `htmlTmp`, a copy of html.html() wrapped in a fresh <div>, and the
# result is written back with a single html.html( htmlTmp ) call at the end.
# NOTE(review): this span comes from a rendered diff; each line operating on `html` directly
# looks like the pre-change version of the `htmlTmp` line that follows it, and the `@ -148…`
# line is a hunk header, not code — confirm.
@htmlCleanup: (html) ->
htmlTmp = $( '<div>' + html.html() + '</div>' )
# remove comments
@_removeComments( html )
@_removeComments( htmlTmp )
# remove style and class
@_removeAttributes( html )
@_removeAttributes( htmlTmp )
# remove Word markup (_removeWordMarkup returns a new object, hence the reassignment)
htmlTmp = @_removeWordMarkup( htmlTmp )
# remove tags, keep content
html.find('a, font, small, time').replaceWith( ->
htmlTmp.find('a, font, small, time').replaceWith( ->
$(@).contents()
)
@ -148,31 +163,32 @@ class App.Utils
# New type of the tag
replacementTag = 'div';
# Replace all a tags with the type of replacementTag
html.find('h1, h2, h3, h4, h5, h6, textarea').each( ->
# Replace all x tags with the type of replacementTag
htmlTmp.find('h1, h2, h3, h4, h5, h6, textarea').each( ->
# the element is re-tagged by rewriting its serialised outerHTML
outer = this.outerHTML;
# Replace opening tag
regex = new RegExp('<' + this.tagName, 'i');
newTag = outer.replace(regex, '<' + replacementTag);
regex = new RegExp('<' + this.tagName, 'i')
newTag = outer.replace(regex, '<' + replacementTag)
# Replace closing tag
regex = new RegExp('</' + this.tagName, 'i');
newTag = newTag.replace(regex, '</' + replacementTag);
regex = new RegExp('</' + this.tagName, 'i')
newTag = newTag.replace(regex, '</' + replacementTag)
$(@).replaceWith(newTag);
$(@).replaceWith(newTag)
)
# remove tags & content
html.find('form, font, hr, img, input, select, button, style, applet, canvas, script, frame, iframe').remove()
htmlTmp.find('form, font, hr, img, input, select, button, style, applet, embed, noframes, canvas, script, frame, iframe').remove()
html
# write the cleaned copy back into the caller's element in one step
html.html(htmlTmp)
# Removes the style, class, title and lang attributes from the elements found below `html`
# (a jQuery object) and returns `html` itself.
# NOTE(review): this span comes from a rendered diff; the explicit tag-list selector looks like
# the pre-change line and '*' like its replacement — confirm.
@_removeAttributes: (html) ->
html.find('div, span, p, li, ul, ol, a, b, u, i, label, small, strong, strike, pre, code, center, blockquote, h1, h2, h3, h4, h5, h6')
html.find('*')
.removeAttr( 'style' )
.removeAttr( 'class' )
.removeAttr( 'title' )
.removeAttr( 'lang' )
html
@_removeComments: (html) ->
@ -182,6 +198,14 @@ class App.Utils
)
html
# Strips Microsoft Word namespace tags from the first element of `html`.
#
# html.get(0) is serialised via outerHTML, every opening or closing `<w:xxx>` tag
# (three-letter local name, e.g. <w:sdt>) and `<o:x>` tag (one-letter local name,
# e.g. <o:p>) is deleted, and the string is parsed again with $().
# Only the tags themselves are dropped; text between them is kept. Tags with
# attributes or with local names of another length are not matched.
#
# Returns a new jQuery object built from the cleaned string, so callers have to
# use the return value instead of relying on `html` being modified.
@_removeWordMarkup: (html) ->
  markup = html.get(0).outerHTML
  # The global flag matters: without it String.replace only removes the first
  # matching tag and later <w:xxx>/<o:x> tags would survive.
  markup = markup.replace(/<\/?w:[A-Za-z]{3}>/g, '')
  markup = markup.replace(/<\/?o:[A-Za-z]>/g, '')
  $(markup)
# signatureNeeded = App.Utils.signatureCheck( message, signature )
@signatureCheck: (message, signature) ->
messageText = $( '<div>' + message + '</div>' ).text().trim()

View file

@ -212,43 +212,47 @@ test( "htmlEscape", function() {
// Cases for App.Utils.htmlRemoveTags: each case builds a jQuery object from `source`,
// runs the util and compares result.html() with `should`.
// Where `should` is assigned twice in a row, the later assignment is the value compared.
// NOTE(review): this span comes from a rendered diff; the first of two consecutive `should`
// assignments looks like the pre-change expectation — confirm.
test( "htmlRemoveTags", function() {
var source = "<div>test</div>"
var should = "test"
var should = "<div>test</div>"
var result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<div>test<!-- some comment --></div>"
should = "<div>test</div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a></div>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a><input value=\"should not be shown\"></div>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
source = "<div><a href=\"some_link\">some link to somewhere</a> <div><hr></div> <span>123</span> <img src=\"some_image\"/></div>"
should = "some link to somewhere 123 "
should = "<div>some link to somewhere 123 </div>"
result = App.Utils.htmlRemoveTags( $(source) )
equal( result.html(), should, source )
// NOTE(review): the remaining cases call htmlRemoveRichtext although this test is named
// "htmlRemoveTags" — confirm whether that is intended.
source = "<div><form class=\"xxx\">test 123</form></div>"
should = "test 123"
should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><textarea class=\"xxx\">test 123</textarea></div>"
should = "test 123"
should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>"
should = "This is some text!"
should = "<div>This is some text!</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
});
@ -257,145 +261,147 @@ test( "htmlRemoveTags", function() {
// Cases for App.Utils.htmlRemoveRichtext: each case builds a jQuery object from `source`,
// runs the util and compares result.html() with `should`.
// Where `should` (or `source`) is assigned twice in a row, the later assignment wins.
// NOTE(review): this span comes from a rendered diff; the first of two consecutive
// assignments looks like the pre-change line — confirm.
test( "htmlRemoveRichtext", function() {
var source = "<div><!--test comment--><a href=\"test\">test</a></div>"
var should = "test"
var should = "<div>test</div>"
var result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
// conditional comments are dropped and the lang attribute is stripped, spans stay
source = "<div><!--[if !supportLists]--><span lang=\"DE\">1.1.1<span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span></span><!--[endif]--><span lang=\"DE\">Description</span></div>"
should = "<div><span>1.1.1<span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span></span><span>Description</span></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><a href=\"some_link\"></a> test </div>"
should = " test "
should = "<div> test </div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><b></b> test </div>"
should = " test "
should = "<div> test </div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div><b></b> test </div></div>"
should = "<div> test </div>"
should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div><b></b> test <input value=\"should not be shown\"></div></div>"
should = "<div> test </div>"
should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div><b></b> test </div><span>123</span></div>"
should = "<div> test </div><span>123</span>"
should = "<div><div> test </div><span>123</span></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
// class, title and lang attributes are expected to be stripped
source = "<div><div class=\"xxx\"><b></b> test </div></div>"
should = "<div> test </div>"
source = "<div><div class=\"xxx\" title=\"some title\" lang=\"en\"><b></b> test </div></div>"
should = "<div><div> test </div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><textarea class=\"xxx\"> test </textarea></div>"
//should = "<div> test </div>"
should = " test "
should = "<div> test </div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><br></div>"
should = "<br>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><br></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><div><br></div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form></div>"
//should = "<div>test 123</div>"
should = "test 123"
should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>"
should = "This is some text!"
should = "<div>This is some text!</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
});
// htmlCleanup
// Cases for App.Utils.htmlCleanup: each case builds a jQuery object from `source`,
// runs the util and compares result.html() with `should`.
// Where `should` (or `source`) is assigned twice in a row, the later assignment wins.
// NOTE(review): this span comes from a rendered diff; the first of two consecutive
// assignments looks like the pre-change line — confirm.
test( "htmlCleanup", function() {
var source = "<div><!--test comment--><a href=\"test\">test</a></div>"
var should = "test"
var should = "<div>test</div>"
var result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<a href=\"some_link\">some link to somewhere</a>"
should = "some link to somewhere"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
// NOTE(review): several fixtures in this test close with </a> instead of </div>;
// presumably the HTML parser drops the stray end tag — confirm that this is deliberate.
source = "<div><h1>some link to somewhere</h1></a>"
//should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
// h1 is expected to be re-tagged as div
source = "<div><h1>some link to somewhere</h1></div>"
should = "<div><div>some link to somewhere</div></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><small>some link to somewhere</small></a>"
//should = "<div>some link to somewhere</div>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><time>some link to somewhere</time></a>"
//should = "<div>some link to somewhere</div>"
should = "some link to somewhere"
should = "<div>some link to somewhere</div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><h1>some link to somewhere</h1><p><hr></p></div>"
should = "<div>some link to somewhere</div><p></p><p></p>"
source = "<div><h1>some h1 for somewhere</h1><p><hr></p></div>"
should = "<div><div>some h1 for somewhere</div><p></p><p></p></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
source = "<div><br></div>"
should = "<br>"
should = "<div><br></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
// NOTE(review): the following cases call htmlRemoveRichtext although this test is named
// "htmlCleanup" — confirm whether that is intended.
source = "<div><div class=\"xxx\"><br></div></div>"
should = "<div><br></div>"
should = "<div><div><br></div></div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form></div>"
//should = "<div>test 123<br></div>"
should = "test 123"
should = "<div>test 123</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form> some other value</div>"
//should = "<div>ttest 123 some other value</div>"
should = "test 123 some other value"
should = "<div>test 123 some other value</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><form class=\"xxx\">test 123</form> some other value<input value=\"should not be shown\"></div>"
//should = "<div>test 123 some other value</div>"
should = "test 123 some other value"
should = "<div>test 123 some other value</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
source = "<div><font size=\"3\" color=\"red\">This is some text!</font></div>"
//should = "<div>This is some text!</div>"
should = "This is some text!"
should = "<div>This is some text!</div>"
result = App.Utils.htmlRemoveRichtext( $(source) )
equal( result.html(), should, source )
// Word markup: the <w:sdt> and <o:p> tags are expected to disappear, the text "abc" stays
source = "<div><p>some link to somewhere from word<w:sdt>abc</w:sdt></p><o:p></o:p></a>"
should = "<div><p>some link to somewhere from wordabc</p></div>"
result = App.Utils.htmlCleanup( $(source) )
equal( result.html(), should, source )
});
// wrap