From f2dd21acbe611cd378be1a95008fae9223f43983 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Mon, 24 Jun 2024 07:31:38 -0500 Subject: [PATCH] Improve String#encode :fallback option New specs here were upstreamed in ruby/spec#1164 and ruby/spec#1165. --- spec/core/string/shared/encode.rb | 184 ++++++++++++++++++++++++++++++ src/encoding_object.cpp | 34 ++++-- 2 files changed, 209 insertions(+), 9 deletions(-) diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index 096e5a30e..040bc88e7 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -224,6 +224,190 @@ end end + describe "given the fallback option" do + context "given a hash" do + it "looks up the replacement value from the hash" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "bar" }) + encoded.should == "Bbar" + end + + it "calls to_str on the returned value" do + obj = Object.new + obj.should_receive(:to_str).and_return("bar") + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj }) + encoded.should == "Bbar" + end + + it "does not call to_s on the returned value" do + obj = Object.new + obj.should_not_receive(:to_s) + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj }) + }.should raise_error(TypeError, "no implicit conversion of Object into String") + end + + it "raises an error if the key is not present in the hash" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: { "foo" => "bar" }) + }.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII") + end + + it "raises an error if the value is itself invalid" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "\uffee" }) + }.should raise_error(ArgumentError, "too big fallback string") + end + + it "uses the hash's default value if set" do + hash = {} + hash.default = "bar" + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash) + encoded.should == "Bbar" + end + + it "uses the result of calling default_proc if set" do + hash = {} + hash.default_proc = -> _, _ { "bar" } + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash) + encoded.should == "Bbar" + end + end + + context "given an object inheriting from Hash" do + before do + klass = Class.new(Hash) + @hash_like = klass.new + @hash_like["\ufffd"] = "bar" + end + + it "looks up the replacement value from the object" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like) + encoded.should == "Bbar" + end + end + + context "given an object responding to []" do + before do + klass = Class.new do + def [](c) = c.bytes.inspect + end + @hash_like = klass.new + end + + it "calls [] on the object, passing the invalid character" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like) + encoded.should == "B[239, 191, 189]" + end + end + + context "given an object not responding to []" do + before do + @non_hash_like = Object.new + end + + it "raises an error" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: @non_hash_like) + }.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII") + end + end + + context "given a proc" do + it "calls the proc to get the replacement value, passing in the invalid character" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| c.bytes.inspect }) + encoded.should == "B[239, 191, 189]" + end + + it "calls to_str on the returned value" do + obj = Object.new + obj.should_receive(:to_str).and_return("bar") + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj }) + encoded.should == "Bbar" + end + + it "does not call to_s on the returned value" do + obj = Object.new + obj.should_not_receive(:to_s) + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj }) + }.should raise_error(TypeError, "no implicit conversion of Object into String") + end + + it "raises an error if the returned value is itself invalid" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" }) + }.should raise_error(ArgumentError, "too big fallback string") + end + end + + context "given a lambda" do + it "calls the lambda to get the replacement value, passing in the invalid character" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { c.bytes.inspect }) + encoded.should == "B[239, 191, 189]" + end + + it "calls to_str on the returned value" do + obj = Object.new + obj.should_receive(:to_str).and_return("bar") + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj }) + encoded.should == "Bbar" + end + + it "does not call to_s on the returned value" do + obj = Object.new + obj.should_not_receive(:to_s) + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj }) + }.should raise_error(TypeError, "no implicit conversion of Object into String") + end + + it "raises an error if the returned value is itself invalid" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" }) + }.should raise_error(ArgumentError, "too big fallback string") + end + end + + context "given a method" do + def replace(c) = c.bytes.inspect + def replace_bad(c) = "\uffee" + + def replace_to_str(c) + obj = Object.new + obj.should_receive(:to_str).and_return("bar") + obj + end + + def replace_to_s(c) + obj = Object.new + obj.should_not_receive(:to_s) + obj + end + + it "calls the method to get the replacement value, passing in the invalid character" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace)) + encoded.should == "B[239, 191, 189]" + end + + it "calls to_str on the returned value" do + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_str)) + encoded.should == "Bbar" + end + + it "does not call to_s on the returned value" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_s)) + }.should raise_error(TypeError, "no implicit conversion of Object into String") + end + + it "raises an error if the returned value is itself invalid" do + -> { + "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_bad)) + }.should raise_error(ArgumentError, "too big fallback string") + end + end + end + describe "given the xml: :text option" do it "replaces all instances of '&' with '&'" do NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 9c2439649..eaf1a7063 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -53,13 +53,34 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje } auto handle_fallback = [&](nat_int_t cpt) { - Value result; + auto ch = new StringObject { orig_encoding->encode_codepoint(cpt) }; + Value result = NilObject::the(); if (options.fallback_option->respond_to(env, "[]"_s)) { - result = options.fallback_option->send(env, "[]"_s, { Value::integer(cpt) }); + result = options.fallback_option->send(env, "[]"_s, { ch }); } else if (options.fallback_option->respond_to(env, "call"_s)) { - result = options.fallback_option->send(env, "call"_s, { Value::integer(cpt) }); + result = options.fallback_option->send(env, "call"_s, { ch }); } - temp_string.append(result->to_s(env)); + + if (result->is_nil()) { + auto message = StringObject::format( + "U+{} from {} to {}", + String::hex(cpt, String::HexFormat::Uppercase), + orig_encoding->name(), + name()); + env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); + } + + auto result_str = result->to_str(env); + if (result_str->encoding()->num() != num()) { + try { + result_str = result_str->encode(env, EncodingObject::get(num()))->to_str(env); + } catch (ExceptionObject *e) { + // FIXME: I'm not sure how Ruby knows the character is "too big" for the target encoding. + // We don't have that kind of information yet. + env->raise("ArgumentError", "too big fallback string"); + } + } + temp_string.append(result_str); }; auto source_codepoint = valid ? c : -1; @@ -68,10 +89,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje if (source_codepoint == -1) { switch (options.invalid_option) { case EncodeInvalidOption::Raise: - if (options.fallback_option) { - handle_fallback(c); - continue; - } env->raise_invalid_byte_sequence_error(this); case EncodeInvalidOption::Replace: if (options.replace_option) { @@ -87,7 +104,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint); } - // handle error if (unicode_codepoint < 0) { StringObject *message; if (num() != Encoding::UTF_8) {