Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make memorynew intrinsic #55913

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions base/boot.jl
Original file line number Diff line number Diff line change
Expand Up @@ -551,12 +551,7 @@ struct UndefInitializer end
const undef = UndefInitializer()

# type and dimensionality specified
# Construct a `GenericMemory{kind,T,addrspace}` of `m` elements for the `undef` initializer.
# If the concrete type carries a singleton `instance` and `m === 0`, that instance is
# returned; otherwise the runtime function `jl_alloc_genericmemory` is called with
# the type and the requested length.
(self::Type{GenericMemory{kind,T,addrspace}})(::UndefInitializer, m::Int) where {T,addrspace,kind} =
if isdefined(self, :instance) && m === 0
self.instance
else
ccall(:jl_alloc_genericmemory, Ref{GenericMemory{kind,T,addrspace}}, (Any, Int), self, m)
end
# Construct a `GenericMemory{kind,T,addrspace}` of `m` elements for the `undef` initializer
# by forwarding the type and length to the `Core.memorynew` builtin.
(self::Type{GenericMemory{kind,T,addrspace}})(::UndefInitializer, m::Int) where {T,addrspace,kind} = Core.memorynew(self, m)
# Same constructor with the length supplied as a 1-tuple: unwrap it and call the `Int` method.
(self::Type{GenericMemory{kind,T,addrspace}})(::UndefInitializer, d::NTuple{1,Int}) where {T,kind,addrspace} = self(undef, getfield(d,1))
# empty vector constructor (equivalent to `self(undef, 0)`)
(self::Type{GenericMemory{kind,T,addrspace}})() where {T,kind,addrspace} = self(undef, 0)
Expand Down
6 changes: 6 additions & 0 deletions base/compiler/tfuncs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1997,6 +1997,12 @@ function tuple_tfunc(𝕃::AbstractLattice, argtypes::Vector{Any})
return anyinfo ? PartialStruct(typ, argtypes) : typ
end

# Return-type inference for the two-argument `memorynew` builtin.
#
# Yields `Bottom` when the (widened) type of the length argument `m` has no intersection
# with `Int`. Otherwise returns the instance type denoted by `memtype`, met with
# `GenericMemory` in the lattice `𝕃`.
@nospecs function memorynew_tfunc(𝕃::AbstractLattice, memtype, m)
    if !hasintersect(widenconst(m), Int)
        return Bottom
    end
    insttype = instanceof_tfunc(memtype, true)[1]
    return tmeet(𝕃, insttype, GenericMemory)
end
add_tfunc(Core.memorynew, 2, 2, memorynew_tfunc, 10)

@nospecs function memoryrefget_tfunc(𝕃::AbstractLattice, mem, order, boundscheck)
memoryref_builtin_common_errorcheck(mem, order, boundscheck) || return Bottom
return memoryref_elemtype(mem)
Expand Down
9 changes: 8 additions & 1 deletion base/essentials.jl
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,8 @@ default_access_order(a::GenericMemory{:atomic}) = :monotonic
default_access_order(a::GenericMemoryRef{:not_atomic}) = :not_atomic
default_access_order(a::GenericMemoryRef{:atomic}) = :monotonic

getindex(A::GenericMemory, i::Int) = (@_noub_if_noinbounds_meta;
# bootstrap version for Memory{Any}
# Read element `i` of `A`: index the memory's base reference (with the bounds check
# controlled by `@_boundscheck`), then load using the memory's default access order.
# The load itself is passed `false` for its boundscheck argument.
getindex(A::Memory{Any}, i::Int) = (@_noub_if_noinbounds_meta;
memoryrefget(memoryrefnew(memoryrefnew(A), i, @_boundscheck), default_access_order(A), false))
# Load the value a memory reference points at, using the reference's default access order.
getindex(A::GenericMemoryRef) = memoryrefget(A, default_access_order(A), @_boundscheck)

Expand Down Expand Up @@ -905,6 +906,12 @@ macro goto(name::Symbol)
end

# linear indexing
function getindex(A::GenericMemory, i::Int)
@_noub_if_noinbounds_meta
@boundscheck ult_int(bitcast(UInt, sub_int(i, 1)), bitcast(UInt, A.length)) || throw_boundserror(A, (i,))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just make this the bootstrap method too? The `@boundscheck` macro simply expands to `@_boundscheck && ...`, so this seems compatible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds reasonable

memoryrefget(memoryrefnew(memoryrefnew(A), i, false), default_access_order(A), false)
end

function getindex(A::Array, i::Int)
@_noub_if_noinbounds_meta
@boundscheck ult_int(bitcast(UInt, sub_int(i, 1)), bitcast(UInt, length(A))) || throw_boundserror(A, (i,))
Expand Down
7 changes: 4 additions & 3 deletions base/genericmemory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,10 @@ getindex(A::Memory, c::Colon) = copy(A)

## Indexing: setindex! ##

function _setindex!(A::Memory{T}, x::T, i1::Int) where {T}
ref = memoryrefnew(memoryref(A), i1, @_boundscheck)
memoryrefset!(ref, x, :not_atomic, @_boundscheck)
# Store `x` at the 1-based index `i` of `A` and return `A`.
#
# Throws a `BoundsError` when `i` is outside `1:A.length`, unless bounds checking has
# been elided by the caller. The accesses below pass `false` for their own bounds
# checks, so the explicit check here is the only thing guarding the write.
function _setindex!(A::Memory{T}, x::T, i::Int) where {T}
    # One unsigned comparison of `i - 1` against the length rejects both `i < 1`
    # (which wraps to a huge unsigned value) and `i > A.length`.
    @boundscheck Core.Intrinsics.ult_int(
        Core.Intrinsics.bitcast(UInt, Core.Intrinsics.sub_int(i, 1)),
        Core.Intrinsics.bitcast(UInt, A.length)) || throw_boundserror(A, (i,))
    ref = memoryrefnew(memoryref(A), i, false)
    memoryrefset!(ref, x, :not_atomic, false)
    return A
end

Expand Down
2 changes: 1 addition & 1 deletion doc/src/manual/performance-tips.md
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ Sometimes you can circumvent the need to allocate memory on each function call b
the output. As a trivial example, compare

```jldoctest prealloc
julia> function xinc(x)
julia> @noinline function xinc(x)
return [x, x+1, x+2]
end;

Expand Down
11 changes: 2 additions & 9 deletions src/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
extern "C" {
#endif

#if defined(_P64) && defined(UINT128MAX)
typedef __uint128_t wideint_t;
#else
typedef uint64_t wideint_t;
#endif

#define MAXINTVAL (((size_t)-1)>>1)

JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dims)
Expand All @@ -30,10 +24,9 @@ JL_DLLEXPORT int jl_array_validate_dims(size_t *nel, uint32_t ndims, size_t *dim
size_t _nel = 1;
for (i = 0; i < ndims; i++) {
size_t di = dims[i];
wideint_t prod = (wideint_t)_nel * (wideint_t)di;
if (prod >= (wideint_t) MAXINTVAL || di >= MAXINTVAL)
int overflow = __builtin_mul_overflow(_nel, di, &_nel);
if (overflow || di >= MAXINTVAL)
return 1;
_nel = prod;
}
*nel = _nel;
return 0;
Expand Down
1 change: 1 addition & 0 deletions src/builtin_proto.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ DECLARE_BUILTIN(is);
DECLARE_BUILTIN(isa);
DECLARE_BUILTIN(isdefined);
DECLARE_BUILTIN(issubtype);
DECLARE_BUILTIN(memorynew);
DECLARE_BUILTIN(memoryref);
DECLARE_BUILTIN(memoryref_isassigned);
DECLARE_BUILTIN(memoryrefget);
Expand Down
9 changes: 9 additions & 0 deletions src/builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1643,6 +1643,14 @@ JL_CALLABLE(jl_f__typevar)
}

// genericmemory ---------------------------------------------------------------------
JL_CALLABLE(jl_f_memorynew)
{
JL_NARGS(memorynew, 2, 2);
JL_TYPECHK(memorynew, datatype, args[0]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably also check that it is a `GenericMemory` subtype.

Suggested change
JL_TYPECHK(memorynew, datatype, args[0]);
jl_datatype_t *jl_genericmemory_type_type = jl_datatype_type;
JL_TYPECHK(memorynew, genericmemory_type, args[0]);

JL_TYPECHK(memorynew, long, args[1]);
size_t nel = jl_unbox_long(args[1]);
return (jl_value_t*)jl_alloc_genericmemory(args[0], nel);
}

JL_CALLABLE(jl_f_memoryref)
{
Expand Down Expand Up @@ -2409,6 +2417,7 @@ void jl_init_primitives(void) JL_GC_DISABLED
jl_builtin_setglobalonce = add_builtin_func("setglobalonce!", jl_f_setglobalonce);

// memory primitives
jl_builtin_memorynew = add_builtin_func("memorynew", jl_f_memorynew);
jl_builtin_memoryref = add_builtin_func("memoryrefnew", jl_f_memoryref);
jl_builtin_memoryrefoffset = add_builtin_func("memoryrefoffset", jl_f_memoryrefoffset);
jl_builtin_memoryrefget = add_builtin_func("memoryrefget", jl_f_memoryrefget);
Expand Down
27 changes: 0 additions & 27 deletions src/ccall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1880,33 +1880,6 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
JL_GC_POP();
return mark_julia_type(ctx, obj, true, jl_any_type);
}
else if (is_libjulia_func(jl_alloc_genericmemory)) {
++CCALL_STAT(jl_alloc_genericmemory);
assert(lrt == ctx.types().T_prjlvalue);
assert(!isVa && !llvmcall && nccallargs == 2);
const jl_cgval_t &typ = argv[0];
const jl_cgval_t &nel = argv[1];
auto arg_typename = [&] JL_NOTSAFEPOINT {
auto istyp = argv[0].constant;
std::string type_str;
if (istyp && jl_is_datatype(istyp) && jl_is_genericmemory_type(istyp)){
auto eltype = jl_tparam1(istyp);
if (jl_is_datatype(eltype))
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
else if (jl_is_uniontype(eltype))
type_str = "Union";
else
type_str = "<unknown type>";
}
else
type_str = "<unknown type>";
return "Memory{" + type_str + "}[]";
};
auto alloc = ctx.builder.CreateCall(prepare_call(jl_allocgenericmemory), { boxed(ctx,typ), emit_unbox(ctx, ctx.types().T_size, nel, (jl_value_t*)jl_ulong_type)});
setName(ctx.emission_context, alloc, arg_typename);
JL_GC_POP();
return mark_julia_type(ctx, alloc, true, jl_any_type);
}
else if (is_libjulia_func(memcpy) && (rt == (jl_value_t*)jl_nothing_type || jl_is_cpointer_type(rt))) {
++CCALL_STAT(memcpy);
const jl_cgval_t &dst = argv[0];
Expand Down
192 changes: 190 additions & 2 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1602,19 +1602,24 @@ static void emit_error(jl_codectx_t &ctx, const Twine &txt)
}

// DO NOT PASS IN A CONST CONDITION!
static void error_unless(jl_codectx_t &ctx, Value *cond, const Twine &msg)
// Emit a runtime check on `cond`: when it is false, control goes to a "fail" block
// that reports `msg` and ends in `unreachable`; when it is true, control goes to a
// new "pass" block. On return the builder's insertion point is that "pass" block,
// so callers that need the current block afterwards must re-query it.
// NOTE(review): this listing contains two consecutive just_emit_error calls (one via
// jlerror_func, one via `F`). That looks like a removed and an added diff line shown
// together; confirm that only the `F` call exists in the real file.
static void error_unless(jl_codectx_t &ctx, Function *F, Value *cond, const Twine &msg)
{
++EmittedConditionalErrors;
// The fail block is attached to the function right away; the pass block is created
// detached and only inserted after the fail block has been filled in.
BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(), "fail", ctx.f);
BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "pass");
ctx.builder.CreateCondBr(cond, passBB, failBB);
ctx.builder.SetInsertPoint(failBB);
just_emit_error(ctx, prepare_call(jlerror_func), msg);
just_emit_error(ctx, F, msg);
ctx.builder.CreateUnreachable();
// Continue emitting the caller's code on the success path.
passBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(passBB);
}

// Convenience overload of error_unless that reports the failure through the
// `jlerror_func` callee instead of a caller-supplied error function.
static void error_unless(jl_codectx_t &ctx, Value *cond, const Twine &msg)
{
    auto *errfn = prepare_call(jlerror_func);
    error_unless(ctx, errfn, cond, msg);
}

static void raise_exception(jl_codectx_t &ctx, Value *exc,
BasicBlock *contBB=nullptr)
{
Expand Down Expand Up @@ -4490,6 +4495,189 @@ static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b)
}
#endif


static jl_cgval_t emit_const_len_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, size_t nel, jl_genericmemory_t *inst)
{
if (nel == 0){
Value *empty_alloc = track_pjlvalue(ctx, literal_pointer_val(ctx, (jl_value_t*)inst));
return mark_julia_type(ctx, empty_alloc, true, typ);
}
const jl_datatype_layout_t *layout = ((jl_datatype_t*)typ)->layout;
assert(((jl_datatype_t*)typ)->has_concrete_subtype && layout != NULL);
size_t elsz = layout->size;
int isboxed = layout->flags.arrayelem_isboxed;
int isunion = layout->flags.arrayelem_isunion;
int zi = ((jl_datatype_t*)typ)->zeroinit;
if (isboxed)
elsz = sizeof(void*);

size_t nbytes;
bool overflow = __builtin_mul_overflow(nel, elsz, &nbytes);
if (isunion) {
// an extra byte for each isbits union memory element, stored at m->ptr + m->length
overflow |= __builtin_add_overflow(nbytes, nel, &nbytes);
}
Comment on lines +4514 to +4519
Copy link
Member

@vtjnash vtjnash Oct 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This arithmetic appears to be unsigned, while the overflow check itself is supposed to be signed (or multiply `elsz` by 2 for the same effect). It may also be missing the final `overflow |= __builtin_add_overflow_p(nbytes, 1, 1)` call that converts the `< MAXINT` condition into an overflow test.

if (overflow)
emit_error(ctx, prepare_call(jlargumenterror_func), "invalid GenericMemory size: the number of elements is either negative or too large for system address width");

auto ct = get_current_task(ctx);
auto T_size = ctx.types().T_size;
auto int8t = getInt8Ty(ctx.builder.getContext());
auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
auto cg_nbytes = ConstantInt::get(T_size, nbytes);
auto cg_nel = ConstantInt::get(T_size, nel);

// else actually allocate mem
auto arg_typename = [&] JL_NOTSAFEPOINT {
std::string type_str;
auto eltype = jl_tparam1(typ);
if (jl_is_datatype(eltype))
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
else if (jl_is_uniontype(eltype))
type_str = "Union";
else
type_str = "<unknown type>";
return "Memory{" + type_str + "}[]";
};
size_t tot = nbytes + LLT_ALIGN(sizeof(jl_genericmemory_t),JL_SMALL_BYTE_ALIGNMENT);

int pooled = tot <= GC_MAX_SZCLASS;
Value *alloc, *decay_alloc, *memory_ptr;
jl_aliasinfo_t aliasinfo;
if (pooled) {
auto cg_tot = ConstantInt::get(T_size, tot);
auto call = prepare_call(jl_alloc_obj_func);
alloc = ctx.builder.CreateCall(call, { ct, cg_tot, track_pjlvalue(ctx, cg_typ)});
decay_alloc = decay_derived(ctx, alloc);
memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
setName(ctx.emission_context, memory_ptr, "memory_ptr");
auto objref = emit_pointer_from_objref(ctx, alloc);
Value *memory_data = emit_ptrgep(ctx, objref, JL_SMALL_BYTE_ALIGNMENT);
auto *store = ctx.builder.CreateAlignedStore(memory_data, memory_ptr, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(store);
setName(ctx.emission_context, memory_data, "memory_data");
} else { // just use the dynamic length version since the malloc will be slow anyway
auto ptls = get_current_ptls(ctx);
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
alloc = ctx.builder.CreateCall(call, { ptls, cg_nbytes, cg_typ});
decay_alloc = maybe_decay_tracked(ctx, alloc);
}
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
auto *len_store = ctx.builder.CreateAlignedStore(cg_nel, len_field, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
aliasinfo.decorateInst(len_store);

// zeroinit pointers and unions
if (zi) {
memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(load);
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), cg_nbytes, Align(sizeof(void*)));
}

setName(ctx.emission_context, alloc, arg_typename);
return mark_julia_type(ctx, alloc, true, typ);
}

// Emit IR that allocates a GenericMemory of type `typ` whose element count `nel` is
// only known at run time.
//
//   ctx  - code generation context (builder, target function, type cache)
//   typ  - the memory datatype; must have a layout (asserted below)
//   nel  - the requested element count as a codegen value; type-checked against Int
//   inst - object returned on the zero-length path
//          (presumably the type's shared empty instance; verify against the caller)
//
// Returns a boxed value of type `typ`: a phi of `inst` (when nel == 0) and the fresh
// allocation (otherwise). An argument error is raised at run time when `nel` is
// negative or the byte-size computation overflows.
static jl_cgval_t emit_memorynew(jl_codectx_t &ctx, jl_datatype_t *typ, jl_cgval_t nel, jl_genericmemory_t *inst)
{
// Check `nel` against jl_long_type, then narrow its static type accordingly.
emit_typecheck(ctx, nel, (jl_value_t*)jl_long_type, "memorynew");
nel = update_julia_type(ctx, nel, (jl_value_t*)jl_long_type);
// Narrowing produced the bottom type: there is nothing further to emit.
if (nel.typ == jl_bottom_type)
return jl_cgval_t();

const jl_datatype_layout_t *layout = ((jl_datatype_t*)typ)->layout;
assert(((jl_datatype_t*)typ)->has_concrete_subtype && layout != NULL);
size_t elsz = layout->size;
int isboxed = layout->flags.arrayelem_isboxed;
int isunion = layout->flags.arrayelem_isunion;
int zi = ((jl_datatype_t*)typ)->zeroinit;
// Boxed elements are stored as pointers, so each slot is pointer-sized.
if (isboxed)
elsz = sizeof(void*);

auto ptls = get_current_ptls(ctx);
auto T_size = ctx.types().T_size;
auto int8t = getInt8Ty(ctx.builder.getContext());
// Three-way control flow: empty case, allocating case, and the join block.
BasicBlock *emptymemBB, *nonemptymemBB, *retvalBB;
emptymemBB = BasicBlock::Create(ctx.builder.getContext(), "emptymem");
nonemptymemBB = BasicBlock::Create(ctx.builder.getContext(), "nonemptymem");
retvalBB = BasicBlock::Create(ctx.builder.getContext(), "retval");
auto nel_unboxed = emit_unbox(ctx, ctx.types().T_size, nel, (jl_value_t*)jl_long_type);
Value *memorynew_empty = ctx.builder.CreateICmpEQ(nel_unboxed, ConstantInt::get(T_size, 0));
setName(ctx.emission_context, memorynew_empty, "memorynew_empty");
ctx.builder.CreateCondBr(memorynew_empty, emptymemBB, nonemptymemBB);
// if nel == 0
emptymemBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(emptymemBB);
auto emptyalloc = track_pjlvalue(ctx, literal_pointer_val(ctx, (jl_value_t*)inst));
ctx.builder.CreateBr(retvalBB);
nonemptymemBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(nonemptymemBB);
// else actually allocate mem
// Lazily builds the IR value name for the allocation, e.g. "Memory{Int64}[]".
auto arg_typename = [&] JL_NOTSAFEPOINT {
std::string type_str;
auto eltype = jl_tparam1(typ);
if (jl_is_datatype(eltype))
type_str = jl_symbol_name(((jl_datatype_t*)eltype)->name->name);
else if (jl_is_uniontype(eltype))
type_str = "Union";
else
type_str = "<unknown type>";
return "Memory{" + type_str + "}[]";
};
auto cg_typ = literal_pointer_val(ctx, (jl_value_t*) typ);
auto cg_elsz = ConstantInt::get(T_size, elsz);

FunctionCallee intr = Intrinsic::getDeclaration(jl_Module, Intrinsic::smul_with_overflow, ArrayRef<Type*>(T_size));
// compute nbytes with possible overflow
Value *prod_with_overflow = ctx.builder.CreateCall(intr, {nel_unboxed, cg_elsz});
Value *nbytes = ctx.builder.CreateExtractValue(prod_with_overflow, 0);
Value *overflow = ctx.builder.CreateExtractValue(prod_with_overflow, 1);
if (isunion) {
// if isunion, we need to allocate the union selector bytes as well
intr = Intrinsic::getDeclaration(jl_Module, Intrinsic::sadd_with_overflow, ArrayRef<Type*>(T_size));
Value *add_with_overflow = ctx.builder.CreateCall(intr, {nel_unboxed, nbytes});
nbytes = ctx.builder.CreateExtractValue(add_with_overflow, 0);
Value *overflow1 = ctx.builder.CreateExtractValue(add_with_overflow, 1);
overflow = ctx.builder.CreateOr(overflow, overflow1);
}
// A negative element count is folded into the same failure condition as overflow.
Value *negnel = ctx.builder.CreateICmpSLT(nel_unboxed, ConstantInt::get(T_size, 0));
overflow = ctx.builder.CreateOr(overflow, negnel);
Value *notoverflow = ctx.builder.CreateNot(overflow);
error_unless(ctx, prepare_call(jlargumenterror_func), notoverflow, "invalid GenericMemory size: the number of elements is either negative or too large for system address width");
// actually allocate
auto call = prepare_call(jl_alloc_genericmemory_unchecked_func);
Value *alloc = ctx.builder.CreateCall(call, { ptls, nbytes, cg_typ});
// set length (jl_alloc_genericmemory_unchecked_func doesn't have it)
Value *decay_alloc = decay_derived(ctx, alloc);
Value *len_field = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 0);
auto len_store = ctx.builder.CreateAlignedStore(nel_unboxed, len_field, Align(sizeof(void*)));
auto aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memorylen);
aliasinfo.decorateInst(len_store);
// zeroinit pointers and unions
if (zi) {
// Load the data pointer from struct field 1 and clear all `nbytes` bytes behind it.
Value *memory_ptr = ctx.builder.CreateStructGEP(ctx.types().T_jlgenericmemory, decay_alloc, 1);
auto *load = ctx.builder.CreateAlignedLoad(ctx.types().T_ptr, memory_ptr, Align(sizeof(void*)));
aliasinfo = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_memoryptr);
aliasinfo.decorateInst(load);
ctx.builder.CreateMemSet(load, ConstantInt::get(int8t, 0), nbytes, Align(sizeof(void*)));
}

setName(ctx.emission_context, alloc, arg_typename);
ctx.builder.CreateBr(retvalBB);
// error_unless moved the insertion point into its "pass" block, so re-query the
// block that actually branches to retvalBB before using it as a phi predecessor.
nonemptymemBB = ctx.builder.GetInsertBlock();
// phi node to choose which side of branch
retvalBB->insertInto(ctx.f);
ctx.builder.SetInsertPoint(retvalBB);
auto phi = ctx.builder.CreatePHI(ctx.types().T_prjlvalue, 2);
phi->addIncoming(emptyalloc, emptymemBB);
phi->addIncoming(alloc, nonemptymemBB);
return mark_julia_type(ctx, phi, true, typ);
}

static jl_cgval_t _emit_memoryref(jl_codectx_t &ctx, Value *mem, Value *data, const jl_datatype_layout_t *layout, jl_value_t *typ)
{
//jl_cgval_t argv[] = {
Expand Down
Loading