Improved performance of utf8_encode!() and html2text() (issue #2374).

This commit is contained in:
Martin Edenhofer 2018-11-26 20:47:35 +01:00 committed by Thorsten Eckel
parent af1781ff2a
commit 6be530bb1c
7 changed files with 53289 additions and 18 deletions

View file

@ -125,11 +125,13 @@ class String
link_list = ''
counter = 0
if !string_only
string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
link = $2
counter = counter + 1
link_list += "[#{counter}] #{link}\n"
"[#{counter}] "
if string.scan(/<a[[:space:]]/i).count < 5_000
string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
link = $2
counter = counter + 1
link_list += "[#{counter}] #{link}\n"
"[#{counter}] "
end
end
else
string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder|
@ -477,7 +479,21 @@ class String
def utf8_encode!(**options)
return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
viable_encodings(try_first: options[:from]).each do |enc|
# convert string to given charset, if valid_encoding? is true
if options[:from].present?
begin
encoding = Encoding.find(options[:from])
if encoding.present? && dup.force_encoding(encoding).valid_encoding?
force_encoding(encoding)
return encode!('utf-8', encoding)
end
rescue ArgumentError, EncodingError => e
Rails.logger.error { e.inspect }
end
end
# try to find valid encodings of string
viable_encodings.each do |enc|
begin
return encode!('utf-8', enc)
rescue EncodingError => e

View file

@ -62,5 +62,42 @@ RSpec.describe String do
end
end
end
# Performance guards for String#utf8_encode: every example builds a large
# string, encodes it into `input_encoding`, and requires the conversion back
# to UTF-8 to finish inside a Timeout window while still reproducing the
# original text.
context 'performance' do
  # The string under test: the original text transcoded to the input encoding.
  subject { original_string.encode(input_encoding) }

  # The :from hint matches the actual encoding of the input.
  context 'with utf8_encode in iso-8859-1' do
    let(:original_string) { 'äöü0' * 999_999 }
    let(:input_encoding) { Encoding::ISO_8859_1 }

    it 'detects the input encoding' do
      Timeout.timeout(1) do
        expect(subject.utf8_encode(from: 'iso-8859-1')).to eq(original_string)
      end
    end
  end

  # The input is already UTF-8 and the :from hint says so.
  context 'with utf8_encode in utf-8' do
    let(:original_string) { 'äöü0' * 999_999 }
    let(:input_encoding) { Encoding::UTF_8 }

    it 'detects the input encoding' do
      Timeout.timeout(1) do
        expect(subject.utf8_encode(from: 'utf-8')).to eq(original_string)
      end
    end
  end

  # The :from hint ('utf-8') does not match the actual input encoding
  # (ISO-8859-1), so the hint alone cannot produce the expected result.
  # This example uses a smaller input and a longer timeout than the others.
  context 'with utf8_encode in iso-8859-1 and charset detection' do
    let(:original_string) { 'äöü0' * 199_999 }
    let(:input_encoding) { Encoding::ISO_8859_1 }

    it 'detects the input encoding' do
      Timeout.timeout(8) do
        expect(subject.utf8_encode(from: 'utf-8')).to eq(original_string)
      end
    end
  end
end
end
end

View file

@ -0,0 +1,20 @@
<html>
<title>some title</title>
<body>
<div>hello</div>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
</body>
</html>

View file

@ -0,0 +1,26 @@
some title
hello
some word [1] some url and the end.
some word [2] some url and the end.
some word [3] some url and the end.
some word [4] some url and the end.
some word [5] some url and the end.
some word [6] some url and the end.
some word [7] some url and the end.
some word [8] some url and the end.
some word [9] some url and the end.
some word [10] some url and the end.
some word [11] some url and the end.
[1] http://example.com?domain?example.com
[2] http://example.com?domain?example.com
[3] http://example.com?domain?example.com
[4] http://example.com?domain?example.com
[5] http://example.com?domain?example.com
[6] http://example.com?domain?example.com
[7] http://example.com?domain?example.com
[8] http://example.com?domain?example.com
[9] http://example.com?domain?example.com
[10] http://example.com?domain?example.com
[11] http://example.com?domain?example.com

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -56,64 +56,64 @@ class AaaStringTest < ActiveSupport::TestCase
modul = 'test'
result = 'test'
modul.to_filename
assert_equal(result, modul)
assert_equal(result, modul)
modul = 'Some::File'
result = 'Some::File'
modul.to_filename
assert_equal(result, modul)
assert_equal(result, modul)
end
test 'to_filename function' do
modul = 'test'
result = 'test'
assert_equal(result, modul.to_filename)
assert_equal(result, modul.to_filename)
modul = 'Some::File'
result = 'some/file'
assert_equal(result, modul.to_filename)
assert_equal(result, modul.to_filename)
end
test 'to_classname ref' do
modul = 'test'
result = 'test'
modul.to_filename
assert_equal(result, modul)
assert_equal(result, modul)
modul = 'some/file'
result = 'some/file'
modul.to_filename
assert_equal(result, modul)
assert_equal(result, modul)
end
test 'to_classname function' do
modul = 'test'
result = 'Test'
assert_equal(result, modul.to_classname)
assert_equal(result, modul.to_classname)
modul = 'some/file'
result = 'Some::File'
assert_equal(result, modul.to_classname)
assert_equal(result, modul.to_classname)
modul = 'some/files'
result = 'Some::Files'
assert_equal(result, modul.to_classname)
assert_equal(result, modul.to_classname)
modul = 'some_test/files'
result = 'SomeTest::Files'
assert_equal(result, modul.to_classname)
assert_equal(result, modul.to_classname)
end
test 'html2text ref' do
html = 'test'
result = 'test'
html.html2text
assert_equal(result, html)
assert_equal(result, html)
html = '<div>test</div>'
result = '<div>test</div>'
html.html2text
assert_equal(result, html)
assert_equal(result, html)
end
test 'html2text function' do
@ -458,6 +458,17 @@ Well as though adam took out here. Melvin will be more money. Called him into th
Men-----------------------'
assert_equal(result, html.html2text)
Timeout::timeout(2) do
html = File.read(Rails.root.join('test', 'data', 'string', 'html2text1.html'))
result = File.read(Rails.root.join('test', 'data', 'string', 'html2text1.txt'))
assert_equal(result, html.html2text)
end
Timeout::timeout(2) do
html = File.read(Rails.root.join('test', 'data', 'string', 'html2text2.html'))
result = File.read(Rails.root.join('test', 'data', 'string', 'html2text2.txt'))
assert_equal(result, html.html2text)
end
end
test 'html2html_strict function' do