Make String#utf8_encode more robust (fixes #2176)
This commit is contained in:
parent
be11e9a1d5
commit
a93f81e20b
2 changed files with 19 additions and 5 deletions
|
@ -474,10 +474,15 @@ class String
|
||||||
end
|
end
|
||||||
|
|
||||||
def utf8_encode!(**options)
|
def utf8_encode!(**options)
|
||||||
return self if (encoding == Encoding::UTF_8) && valid_encoding?
|
return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
|
||||||
|
|
||||||
input_encoding = viable_encodings(try_first: options[:from]).first
|
viable_encodings(try_first: options[:from]).each do |e|
|
||||||
return encode!('utf-8', input_encoding) if input_encoding.present?
|
begin
|
||||||
|
return encode!('utf-8', e)
|
||||||
|
rescue Encoding::UndefinedConversionError
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
case options[:fallback]
|
case options[:fallback]
|
||||||
when :output_to_binary
|
when :output_to_binary
|
||||||
|
@ -501,6 +506,7 @@ class String
|
||||||
[provided, original, detected]
|
[provided, original, detected]
|
||||||
.compact
|
.compact
|
||||||
.reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
|
.reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
|
||||||
|
.reject { |e| Encoding.find(e) == Encoding::UTF_8 }
|
||||||
.select { |e| force_encoding(e).valid_encoding? }
|
.select { |e| force_encoding(e).valid_encoding? }
|
||||||
.tap { force_encoding(original) } # clean up changes from previous line
|
.tap { force_encoding(original) } # clean up changes from previous line
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ require 'rails_helper'
|
||||||
|
|
||||||
RSpec.describe String do
|
RSpec.describe String do
|
||||||
describe '#utf8_encode' do
|
describe '#utf8_encode' do
|
||||||
context 'for valid, UTF-8-encoded strings' do
|
context 'on valid, UTF-8-encoded strings' do
|
||||||
let(:subject) { 'hello' }
|
let(:subject) { 'hello' }
|
||||||
|
|
||||||
it 'returns an identical copy' do
|
it 'returns an identical copy' do
|
||||||
|
@ -10,9 +10,17 @@ RSpec.describe String do
|
||||||
expect(subject.utf8_encode.encoding).to be(subject.encoding)
|
expect(subject.utf8_encode.encoding).to be(subject.encoding)
|
||||||
expect(subject.utf8_encode).not_to be(subject)
|
expect(subject.utf8_encode).not_to be(subject)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'which are incorrectly set to other, technically valid encodings' do
|
||||||
|
let(:subject) { 'ö'.force_encoding('tis-620') }
|
||||||
|
|
||||||
|
it 'sets input encoding to UTF-8 instead of attempting conversion' do
|
||||||
|
expect(subject.utf8_encode).to eq(subject.force_encoding('utf-8'))
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'for strings in other encodings' do
|
context 'on strings in other encodings' do
|
||||||
let(:subject) { original_string.encode(input_encoding) }
|
let(:subject) { original_string.encode(input_encoding) }
|
||||||
|
|
||||||
context 'with no from: option' do
|
context 'with no from: option' do
|
||||||
|
|
Loading…
Reference in a new issue