From a93f81e20beedb38d7d726d303ce5034b598da7a Mon Sep 17 00:00:00 2001 From: Ryan Lue Date: Thu, 9 Aug 2018 12:05:09 +0800 Subject: [PATCH] Make String#utf8_encode more robust (fixes #2176) --- lib/core_ext/string.rb | 12 +++++++++--- spec/lib/core_ext/string_spec.rb | 12 ++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/lib/core_ext/string.rb b/lib/core_ext/string.rb index dd522da16..f8699cd52 100644 --- a/lib/core_ext/string.rb +++ b/lib/core_ext/string.rb @@ -474,10 +474,15 @@ class String end def utf8_encode!(**options) - return self if (encoding == Encoding::UTF_8) && valid_encoding? + return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding? - input_encoding = viable_encodings(try_first: options[:from]).first - return encode!('utf-8', input_encoding) if input_encoding.present? + viable_encodings(try_first: options[:from]).each do |e| + begin + return encode!('utf-8', e) + rescue Encoding::UndefinedConversionError + nil + end + end case options[:fallback] when :output_to_binary @@ -501,6 +506,7 @@ class String [provided, original, detected] .compact .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT } + .reject { |e| Encoding.find(e) == Encoding::UTF_8 } .select { |e| force_encoding(e).valid_encoding? } .tap { force_encoding(original) } # clean up changes from previous line diff --git a/spec/lib/core_ext/string_spec.rb b/spec/lib/core_ext/string_spec.rb index 52bd0ba8f..3981db7c6 100644 --- a/spec/lib/core_ext/string_spec.rb +++ b/spec/lib/core_ext/string_spec.rb @@ -2,7 +2,7 @@ require 'rails_helper' RSpec.describe String do describe '#utf8_encode' do - context 'for valid, UTF-8-encoded strings' do + context 'on valid, UTF-8-encoded strings' do let(:subject) { 'hello' } it 'returns an identical copy' do @@ -10,9 +10,17 @@ RSpec.describe String do expect(subject.utf8_encode.encoding).to be(subject.encoding) expect(subject.utf8_encode).not_to be(subject) end + + context 'which are incorrectly set to other, technically valid encodings' do + let(:subject) { 'รถ'.force_encoding('tis-620') } + + it 'sets input encoding to UTF-8 instead of attempting conversion' do + expect(subject.utf8_encode).to eq(subject.force_encoding('utf-8')) + end + end end - context 'for strings in other encodings' do + context 'on strings in other encodings' do let(:subject) { original_string.encode(input_encoding) } context 'with no from: option' do