Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve String#encode :fallback option #2139

Merged
merged 1 commit into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions spec/core/string/shared/encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,190 @@
end
end

describe "given the fallback option" do
context "given a hash" do
it "looks up the replacement value from the hash" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "bar" })
encoded.should == "Bbar"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the key is not present in the hash" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "foo" => "bar" })
}.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII")
end

it "raises an error if the value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: { "\ufffd" => "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end

it "uses the hash's default value if set" do
hash = {}
hash.default = "bar"
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash)
encoded.should == "Bbar"
end

it "uses the result of calling default_proc if set" do
hash = {}
hash.default_proc = -> _, _ { "bar" }
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: hash)
encoded.should == "Bbar"
end
end

context "given an object inheriting from Hash" do
before do
klass = Class.new(Hash)
@hash_like = klass.new
@hash_like["\ufffd"] = "bar"
end

it "looks up the replacement value from the object" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like)
encoded.should == "Bbar"
end
end

context "given an object responding to []" do
before do
klass = Class.new do
def [](c) = c.bytes.inspect
end
@hash_like = klass.new
end

it "calls [] on the object, passing the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: @hash_like)
encoded.should == "B[239, 191, 189]"
end
end

context "given an object not responding to []" do
before do
@non_hash_like = Object.new
end

it "raises an error" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: @non_hash_like)
}.should raise_error(Encoding::UndefinedConversionError, "U+FFFD from UTF-8 to US-ASCII")
end
end

context "given a proc" do
it "calls the proc to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| c.bytes.inspect })
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: proc { |c| obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end
end

context "given a lambda" do
it "calls the lambda to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { c.bytes.inspect })
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj })
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
obj = Object.new
obj.should_not_receive(:to_s)
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { obj })
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: -> c { "\uffee" })
}.should raise_error(ArgumentError, "too big fallback string")
end
end

context "given a method" do
def replace(c) = c.bytes.inspect
def replace_bad(c) = "\uffee"

def replace_to_str(c)
obj = Object.new
obj.should_receive(:to_str).and_return("bar")
obj
end

def replace_to_s(c)
obj = Object.new
obj.should_not_receive(:to_s)
obj
end

it "calls the method to get the replacement value, passing in the invalid character" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace))
encoded.should == "B[239, 191, 189]"
end

it "calls to_str on the returned value" do
encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_str))
encoded.should == "Bbar"
end

it "does not call to_s on the returned value" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_to_s))
}.should raise_error(TypeError, "no implicit conversion of Object into String")
end

it "raises an error if the returned value is itself invalid" do
-> {
"B\ufffd".encode(Encoding::US_ASCII, fallback: method(:replace_bad))
}.should raise_error(ArgumentError, "too big fallback string")
end
end
end

describe "given the xml: :text option" do
it "replaces all instances of '&' with '&'" do
NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do
Expand Down
34 changes: 25 additions & 9 deletions src/encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,34 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
}

auto handle_fallback = [&](nat_int_t cpt) {
Value result;
auto ch = new StringObject { orig_encoding->encode_codepoint(cpt) };
Value result = NilObject::the();
if (options.fallback_option->respond_to(env, "[]"_s)) {
result = options.fallback_option->send(env, "[]"_s, { Value::integer(cpt) });
result = options.fallback_option->send(env, "[]"_s, { ch });
} else if (options.fallback_option->respond_to(env, "call"_s)) {
result = options.fallback_option->send(env, "call"_s, { Value::integer(cpt) });
result = options.fallback_option->send(env, "call"_s, { ch });
}
temp_string.append(result->to_s(env));

if (result->is_nil()) {
auto message = StringObject::format(
"U+{} from {} to {}",
String::hex(cpt, String::HexFormat::Uppercase),
orig_encoding->name(),
name());
env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message);
}

auto result_str = result->to_str(env);
if (result_str->encoding()->num() != num()) {
try {
result_str = result_str->encode(env, EncodingObject::get(num()))->to_str(env);
} catch (ExceptionObject *e) {
// FIXME: I'm not sure how Ruby knows the character is "too big" for the target encoding.
// We don't have that kind of information yet.
env->raise("ArgumentError", "too big fallback string");
}
}
temp_string.append(result_str);
};

auto source_codepoint = valid ? c : -1;
Expand All @@ -68,10 +89,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
if (source_codepoint == -1) {
switch (options.invalid_option) {
case EncodeInvalidOption::Raise:
if (options.fallback_option) {
handle_fallback(c);
continue;
}
env->raise_invalid_byte_sequence_error(this);
case EncodeInvalidOption::Replace:
if (options.replace_option) {
Expand All @@ -87,7 +104,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje
unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint);
}

// handle error
if (unicode_codepoint < 0) {
StringObject *message;
if (num() != Encoding::UTF_8) {
Expand Down
Loading