Improved performance of utf8_encode!() and html2text() (issue #2374).
This commit is contained in:
parent
af1781ff2a
commit
6be530bb1c
7 changed files with 53289 additions and 18 deletions
|
@ -125,11 +125,13 @@ class String
|
|||
link_list = ''
|
||||
counter = 0
|
||||
if !string_only
|
||||
string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
|
||||
link = $2
|
||||
counter = counter + 1
|
||||
link_list += "[#{counter}] #{link}\n"
|
||||
"[#{counter}] "
|
||||
if string.scan(/<a[[:space:]]/i).count < 5_000
|
||||
string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
|
||||
link = $2
|
||||
counter = counter + 1
|
||||
link_list += "[#{counter}] #{link}\n"
|
||||
"[#{counter}] "
|
||||
end
|
||||
end
|
||||
else
|
||||
string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder|
|
||||
|
@ -477,7 +479,21 @@ class String
|
|||
def utf8_encode!(**options)
|
||||
return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
|
||||
|
||||
viable_encodings(try_first: options[:from]).each do |enc|
|
||||
# convert string to given charset, if valid_encoding? is true
|
||||
if options[:from].present?
|
||||
begin
|
||||
encoding = Encoding.find(options[:from])
|
||||
if encoding.present? && dup.force_encoding(encoding).valid_encoding?
|
||||
force_encoding(encoding)
|
||||
return encode!('utf-8', encoding)
|
||||
end
|
||||
rescue ArgumentError, EncodingError => e
|
||||
Rails.logger.error { e.inspect }
|
||||
end
|
||||
end
|
||||
|
||||
# try to find valid encodings of string
|
||||
viable_encodings.each do |enc|
|
||||
begin
|
||||
return encode!('utf-8', enc)
|
||||
rescue EncodingError => e
|
||||
|
|
|
@ -62,5 +62,42 @@ RSpec.describe String do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'performance' do
|
||||
let(:subject) { original_string.encode(input_encoding) }
|
||||
|
||||
context 'with utf8_encode in iso-8859-1' do
|
||||
let(:original_string) { 'äöü0' * 999_999 }
|
||||
let(:input_encoding) { Encoding::ISO_8859_1 }
|
||||
|
||||
it 'detects the input encoding' do
|
||||
Timeout.timeout(1) do
|
||||
expect(subject.utf8_encode(from: 'iso-8859-1')).to eq(original_string)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'with utf8_encode in utf-8' do
|
||||
let(:original_string) { 'äöü0' * 999_999 }
|
||||
let(:input_encoding) { Encoding::UTF_8 }
|
||||
|
||||
it 'detects the input encoding' do
|
||||
Timeout.timeout(1) do
|
||||
expect(subject.utf8_encode(from: 'utf-8')).to eq(original_string)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'with utf8_encode in iso-8859-1 and charset detection' do
|
||||
let(:original_string) { 'äöü0' * 199_999 }
|
||||
let(:input_encoding) { Encoding::ISO_8859_1 }
|
||||
|
||||
it 'detects the input encoding' do
|
||||
Timeout.timeout(8) do
|
||||
expect(subject.utf8_encode(from: 'utf-8')).to eq(original_string)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
20
test/data/string/html2text1.html
Normal file
20
test/data/string/html2text1.html
Normal file
|
@ -0,0 +1,20 @@
|
|||
<html>
|
||||
<title>some title</title>
|
||||
<body>
|
||||
|
||||
<div>hello</div>
|
||||
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
<p>some word <a href="http://example.com?domain?example.com">some url</a> and the end.</p>
|
||||
|
||||
</body>
|
||||
</html>
|
26
test/data/string/html2text1.txt
Normal file
26
test/data/string/html2text1.txt
Normal file
|
@ -0,0 +1,26 @@
|
|||
some title
|
||||
hello
|
||||
|
||||
some word [1] some url and the end.
|
||||
some word [2] some url and the end.
|
||||
some word [3] some url and the end.
|
||||
some word [4] some url and the end.
|
||||
some word [5] some url and the end.
|
||||
some word [6] some url and the end.
|
||||
some word [7] some url and the end.
|
||||
some word [8] some url and the end.
|
||||
some word [9] some url and the end.
|
||||
some word [10] some url and the end.
|
||||
some word [11] some url and the end.
|
||||
|
||||
[1] http://example.com?domain?example.com
|
||||
[2] http://example.com?domain?example.com
|
||||
[3] http://example.com?domain?example.com
|
||||
[4] http://example.com?domain?example.com
|
||||
[5] http://example.com?domain?example.com
|
||||
[6] http://example.com?domain?example.com
|
||||
[7] http://example.com?domain?example.com
|
||||
[8] http://example.com?domain?example.com
|
||||
[9] http://example.com?domain?example.com
|
||||
[10] http://example.com?domain?example.com
|
||||
[11] http://example.com?domain?example.com
|
27748
test/data/string/html2text2.html
Normal file
27748
test/data/string/html2text2.html
Normal file
File diff suppressed because it is too large
Load diff
25413
test/data/string/html2text2.txt
Normal file
25413
test/data/string/html2text2.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -56,64 +56,64 @@ class AaaStringTest < ActiveSupport::TestCase
|
|||
modul = 'test'
|
||||
result = 'test'
|
||||
modul.to_filename
|
||||
assert_equal(result, modul)
|
||||
assert_equal(result, modul)
|
||||
|
||||
modul = 'Some::File'
|
||||
result = 'Some::File'
|
||||
modul.to_filename
|
||||
assert_equal(result, modul)
|
||||
assert_equal(result, modul)
|
||||
end
|
||||
|
||||
test 'to_filename function' do
|
||||
modul = 'test'
|
||||
result = 'test'
|
||||
assert_equal(result, modul.to_filename)
|
||||
assert_equal(result, modul.to_filename)
|
||||
|
||||
modul = 'Some::File'
|
||||
result = 'some/file'
|
||||
assert_equal(result, modul.to_filename)
|
||||
assert_equal(result, modul.to_filename)
|
||||
end
|
||||
|
||||
test 'to_classname ref' do
|
||||
modul = 'test'
|
||||
result = 'test'
|
||||
modul.to_filename
|
||||
assert_equal(result, modul)
|
||||
assert_equal(result, modul)
|
||||
|
||||
modul = 'some/file'
|
||||
result = 'some/file'
|
||||
modul.to_filename
|
||||
assert_equal(result, modul)
|
||||
assert_equal(result, modul)
|
||||
end
|
||||
|
||||
test 'to_classname function' do
|
||||
modul = 'test'
|
||||
result = 'Test'
|
||||
assert_equal(result, modul.to_classname)
|
||||
assert_equal(result, modul.to_classname)
|
||||
|
||||
modul = 'some/file'
|
||||
result = 'Some::File'
|
||||
assert_equal(result, modul.to_classname)
|
||||
assert_equal(result, modul.to_classname)
|
||||
|
||||
modul = 'some/files'
|
||||
result = 'Some::Files'
|
||||
assert_equal(result, modul.to_classname)
|
||||
assert_equal(result, modul.to_classname)
|
||||
|
||||
modul = 'some_test/files'
|
||||
result = 'SomeTest::Files'
|
||||
assert_equal(result, modul.to_classname)
|
||||
assert_equal(result, modul.to_classname)
|
||||
end
|
||||
|
||||
test 'html2text ref' do
|
||||
html = 'test'
|
||||
result = 'test'
|
||||
html.html2text
|
||||
assert_equal(result, html)
|
||||
assert_equal(result, html)
|
||||
|
||||
html = '<div>test</div>'
|
||||
result = '<div>test</div>'
|
||||
html.html2text
|
||||
assert_equal(result, html)
|
||||
assert_equal(result, html)
|
||||
end
|
||||
|
||||
test 'html2text function' do
|
||||
|
@ -458,6 +458,17 @@ Well as though adam took out here. Melvin will be more money. Called him into th
|
|||
Men-----------------------'
|
||||
assert_equal(result, html.html2text)
|
||||
|
||||
Timeout::timeout(2) do
|
||||
html = File.read(Rails.root.join('test', 'data', 'string', 'html2text1.html'))
|
||||
result = File.read(Rails.root.join('test', 'data', 'string', 'html2text1.txt'))
|
||||
assert_equal(result, html.html2text)
|
||||
end
|
||||
|
||||
Timeout::timeout(2) do
|
||||
html = File.read(Rails.root.join('test', 'data', 'string', 'html2text2.html'))
|
||||
result = File.read(Rails.root.join('test', 'data', 'string', 'html2text2.txt'))
|
||||
assert_equal(result, html.html2text)
|
||||
end
|
||||
end
|
||||
|
||||
test 'html2html_strict function' do
|
||||
|
|
Loading…
Reference in a new issue