Make String#utf8_encode more robust (fixes #2176)

This commit is contained in:
Ryan Lue 2018-08-09 12:05:09 +08:00
parent be11e9a1d5
commit a93f81e20b
2 changed files with 19 additions and 5 deletions

View file

@@ -474,10 +474,15 @@ class String
end
def utf8_encode!(**options)
return self if (encoding == Encoding::UTF_8) && valid_encoding?
return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
input_encoding = viable_encodings(try_first: options[:from]).first
return encode!('utf-8', input_encoding) if input_encoding.present?
viable_encodings(try_first: options[:from]).each do |e|
begin
return encode!('utf-8', e)
rescue Encoding::UndefinedConversionError
nil
end
end
case options[:fallback]
when :output_to_binary
@@ -501,6 +506,7 @@ class String
[provided, original, detected]
.compact
.reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
.reject { |e| Encoding.find(e) == Encoding::UTF_8 }
.select { |e| force_encoding(e).valid_encoding? }
.tap { force_encoding(original) } # clean up changes from previous line

View file

@@ -2,7 +2,7 @@ require 'rails_helper'
RSpec.describe String do
describe '#utf8_encode' do
context 'for valid, UTF-8-encoded strings' do
context 'on valid, UTF-8-encoded strings' do
let(:subject) { 'hello' }
it 'returns an identical copy' do
@@ -10,9 +10,17 @@ RSpec.describe String do
expect(subject.utf8_encode.encoding).to be(subject.encoding)
expect(subject.utf8_encode).not_to be(subject)
end
context 'which are incorrectly set to other, technically valid encodings' do
let(:subject) { 'ö'.force_encoding('tis-620') }
it 'sets input encoding to UTF-8 instead of attempting conversion' do
expect(subject.utf8_encode).to eq(subject.force_encoding('utf-8'))
end
end
end
context 'for strings in other encodings' do
context 'on strings in other encodings' do
let(:subject) { original_string.encode(input_encoding) }
context 'with no from: option' do