8252848: Optimize small primitive arrayCopy operations through partial inlining using AVX-512 masked instructions

Reviewed-by: neliasso, kvn
This commit is contained in:
Jatin Bhateja 2020-11-25 06:08:19 +00:00
parent 66943fefa7
commit 0d91f0a1df
25 changed files with 470 additions and 45 deletions

View File

@ -2706,34 +2706,18 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool mer
emit_operand(dst, src);
}
void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
assert(VM_Version::supports_avx512vlbw(), "");
assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
InstructionMark im(this);
bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
emit_int8(0x6F);
emit_operand(dst, src);
}
void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512vlbw(), "");
assert(src != xnoreg, "sanity");
assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
InstructionMark im(this);
bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
attributes.reset_is_clear_context();
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
if (merge) {
attributes.reset_is_clear_context();
}
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F);
emit_operand(src, dst);
}

View File

@ -1549,6 +1549,7 @@ private:
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
@ -1566,10 +1567,6 @@ private:
void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
// Generic move instructions.
void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);
// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);

View File

@ -1891,6 +1891,20 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
// Emits code that builds an element mask in 'dst' from the count held in 'len'.
//
//   dst  - receives the mask value
//   len  - number of low-order mask bits requested
//   temp - scratch register; only written on the wide (> 32) path
//
// Two instruction sequences are used, selected by ArrayCopyPartialInlineSize:
//   <= 32 : dst = (1 << len) - 1
//   >  32 : dst = ~0 >> (64 - len)
// NOTE(review): on the wide path a 'len' of 0 yields a shift count of 64;
// confirm that callers never reach this code with a zero length, or that the
// resulting mask is what is intended in that case.
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
  if (ArrayCopyPartialInlineSize > 32) {
    // Wide path: start from all ones and shift right by (64 - len).
    mov64(dst, -1);
    movq(temp, len);
    negptr(temp);
    addptr(temp, 64);
    shrxq(dst, dst, temp);
    return;
  }
  // Narrow path: shift a single bit left by len and subtract one.
  mov64(dst, 1);
  shlxq(dst, dst, len);
  decq(dst);
}
#endif // _LP64
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
@ -1937,6 +1951,15 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
// Masked vector load (memory -> register) for C2 generated code.
// Forwards all arguments unchanged to MacroAssembler::evmovdqu, which selects
// the concrete instruction from 'type'.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}
// Masked vector store (register -> memory) for C2 generated code.
// Forwards all arguments unchanged to MacroAssembler::evmovdqu, which selects
// the concrete instruction from 'type'.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,

View File

@ -120,6 +120,9 @@ public:
void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
// extract
void extract(BasicType typ, Register dst, XMMRegister src, int idx);
XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
@ -139,6 +142,7 @@ public:
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void genmask(Register dst, Register len, Register temp);
#endif // _LP64
// dst = reduce(op, src2) using vtmp as temps

View File

@ -8000,6 +8000,56 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(done);
}
// Masked vector load from 'src' into 'dst', dispatched on the element type:
//   T_BYTE / T_BOOLEAN -> evmovdqub
//   T_CHAR / T_SHORT   -> evmovdquw
//   T_INT  / T_FLOAT   -> evmovdqul
//   T_LONG / T_DOUBLE  -> evmovdquq
// The 'merge' argument of the selected instruction is always false for loads.
// Any other type is reported through fatal().
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
  const bool merge = false;
  switch (type) {
    case T_BYTE:
    case T_BOOLEAN:
      evmovdqub(dst, kmask, src, merge, vector_len);
      return;
    case T_CHAR:
    case T_SHORT:
      evmovdquw(dst, kmask, src, merge, vector_len);
      return;
    case T_INT:
    case T_FLOAT:
      evmovdqul(dst, kmask, src, merge, vector_len);
      return;
    case T_LONG:
    case T_DOUBLE:
      evmovdquq(dst, kmask, src, merge, vector_len);
      return;
    default:
      break;
  }
  fatal("Unexpected type argument %s", type2name(type));
}
// Masked vector store of 'src' to 'dst', dispatched on the element type:
//   T_BYTE / T_BOOLEAN -> evmovdqub
//   T_CHAR / T_SHORT   -> evmovdquw
//   T_INT  / T_FLOAT   -> evmovdqul
//   T_LONG / T_DOUBLE  -> evmovdquq
// The 'merge' argument of the selected instruction is always true for stores.
// Any other type is reported through fatal().
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
  const bool merge = true;
  switch (type) {
    case T_BYTE:
    case T_BOOLEAN:
      evmovdqub(dst, kmask, src, merge, vector_len);
      return;
    case T_CHAR:
    case T_SHORT:
      evmovdquw(dst, kmask, src, merge, vector_len);
      return;
    case T_INT:
    case T_FLOAT:
      evmovdqul(dst, kmask, src, merge, vector_len);
      return;
    case T_LONG:
    case T_DOUBLE:
      evmovdquq(dst, kmask, src, merge, vector_len);
      return;
    default:
      break;
  }
  fatal("Unexpected type argument %s", type2name(type));
}
#ifdef _LP64
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
Label done;

View File

@ -1094,10 +1094,14 @@ public:
void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
// AVX512 Unaligned
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);
void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }

View File

@ -200,8 +200,8 @@ void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister x
mov64(temp, -1);
shrxq(temp, temp, length);
kmovql(mask, temp);
evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]);
evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]);
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
}
}
@ -216,8 +216,8 @@ void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister x
shlxq(temp, temp, length);
decq(temp);
kmovql(mask, temp);
evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]);
evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]);
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_256bit);
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_256bit);
}

View File

@ -1362,6 +1362,7 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11;
}
#endif // COMPILER2
if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
@ -1399,6 +1400,38 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
FLAG_SET_DEFAULT(AllocatePrefetchInstr, 3);
}
#ifdef COMPILER2
// Select the partial inlining threshold (in bytes) for small array copies.
// Only 0, 16, 32 and 64 are accepted. The default value, or any other
// explicitly requested value, is replaced by the largest of 64/32/16 that
// fits MaxVectorSize (64 is only chosen when AVX3Threshold is 0), or by 0
// when MaxVectorSize is below 16.
if (UseAVX > 2) {
  const bool is_default = FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize);
  const bool is_accepted_value = ArrayCopyPartialInlineSize == 0  ||
                                 ArrayCopyPartialInlineSize == 16 ||
                                 ArrayCopyPartialInlineSize == 32 ||
                                 ArrayCopyPartialInlineSize == 64;
  if (is_default || !is_accepted_value) {
    int inline_size = 0;
    if (MaxVectorSize >= 64 && AVX3Threshold == 0) {
      inline_size = 64;
    } else if (MaxVectorSize >= 32) {
      inline_size = 32;
    } else if (MaxVectorSize >= 16) {
      inline_size = 16;
    }
    // Only tell the user when an explicitly requested value is overridden.
    if (!is_default) {
      warning("Setting ArrayCopyPartialInlineSize as %d", inline_size);
    }
    ArrayCopyPartialInlineSize = inline_size;
  }

  // An accepted value may still exceed what the vector unit is allowed to use.
  if (ArrayCopyPartialInlineSize > MaxVectorSize) {
    ArrayCopyPartialInlineSize = MaxVectorSize >= 16 ? MaxVectorSize : 0;
    if (ArrayCopyPartialInlineSize) {
      warning("Setting ArrayCopyPartialInlineSize as MaxVectorSize (" INTX_FORMAT ")", MaxVectorSize);
    } else {
      warning("Setting ArrayCopyPartialInlineSize as " INTX_FORMAT, ArrayCopyPartialInlineSize);
    }
  }
}
#endif
}
#ifdef _LP64

View File

@ -1521,6 +1521,13 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (UseAVX < 3) {
return false;
}
break;
#ifndef _LP64
case Op_AddReductionVF:
case Op_AddReductionVD:
@ -1594,6 +1601,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!VM_Version::supports_avx512bw()) {
return false;
}
if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_CMoveVD:
if (vlen != 4) {
return false; // implementation limitation (only vcmov4D_reg is present)
@ -7894,3 +7911,50 @@ instruct vprorate(vec dst, vec src, vec shift) %{
ins_pipe( pipe_slow );
%}
#ifdef _LP64
// ---------------------------------- Masked Block Copy ------------------------------------
// Masked vector load. Matches LoadVectorMasked with the mask held in a 64-bit
// general register. The encoding copies the mask into k2 and uses it as the
// mask operand of evmovdqu; element type and vector length are taken from
// this node's vector type.
// NOTE(review): k2 is written here but does not appear among the operands or
// effects of this instruct - confirm that k2 is reserved for this purpose.
instruct vmasked_load64(vec dst, memory mem, rRegL mask) %{
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
ins_encode %{
BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(this);
// Move the mask from the general register into the k2 mask register.
__ kmovql(k2, $mask$$Register);
__ evmovdqu(elmType, k2, $dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
// Mask generation from a length held in a register. Matches VectorMaskGen and
// delegates to genmask(). dst is declared TEMP_DEF and tempLen TEMP, so the
// register allocator keeps both distinct from the len input.
instruct vmask_gen(rRegL dst, rRegL len, rRegL tempLen) %{
match(Set dst (VectorMaskGen len));
effect(TEMP_DEF dst, TEMP tempLen);
format %{ "vector_mask_gen $len \t! vector mask generator" %}
ins_encode %{
__ genmask($dst$$Register, $len$$Register, $tempLen$$Register);
%}
ins_pipe( pipe_slow );
%}
// Mask generation from a compile-time constant length. The mask is computed
// while emitting code as (all ones >> (64 - len)) and loaded with one mov64.
// NOTE(review): a constant len of 0 makes the shift count 64, which is
// undefined for a 64-bit operand in C++; values outside [1, 64] are not
// rejected here - confirm that such constants cannot reach this rule.
instruct vmask_gen_imm(rRegL dst, immL len) %{
match(Set dst (VectorMaskGen len));
format %{ "vector_mask_gen $len \t! vector mask generator" %}
ins_encode %{
__ mov64($dst$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 -$len$$constant)));
%}
ins_pipe( pipe_slow );
%}
// Masked vector store. Matches StoreVectorMasked whose value and mask inputs
// have been paired into a Binary node. The encoding copies the mask into k2
// and uses it as the mask operand of evmovdqu. Element type and vector length
// are taken from the node feeding the $src operand rather than from this
// store node.
// NOTE(review): k2 is written here but does not appear among the operands or
// effects of this instruct - confirm that k2 is reserved for this purpose.
instruct vmasked_store64(memory mem, vec src, rRegL mask) %{
match(Set mem (StoreVectorMasked mem (Binary src mask)));
format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
ins_encode %{
// Look up the node that produces the stored vector to learn its type.
const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(src_node);
// Move the mask from the general register into the k2 mask register.
__ kmovql(k2, $mask$$Register);
__ evmovdqu(elmType, k2, $mem$$Address, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64

View File

@ -269,6 +269,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
if( strcmp(opType,"LoadS")==0 ) return Form::idealS;
if( strcmp(opType,"LoadVector")==0 ) return Form::idealV;
if( strcmp(opType,"LoadVectorGather")==0 ) return Form::idealV;
if( strcmp(opType,"LoadVectorMasked")==0 ) return Form::idealV;
assert( strcmp(opType,"Load") != 0, "Must type Loads" );
return Form::none;
}
@ -286,6 +287,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const {
if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass;
if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
if( strcmp(opType,"StoreVectorScatter")==0 ) return Form::idealV;
if( strcmp(opType,"StoreVectorMasked")==0 ) return Form::idealV;
assert( strcmp(opType,"Store") != 0, "Must type Stores" );
return Form::none;
}

View File

@ -781,6 +781,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") ||
#endif
!strcmp(_matrule->_rChild->_opType,"VectorMaskGen")||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
else if ( is_ideal_load() == Form::idealP ) return true;
@ -3489,7 +3490,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
"StoreB","StoreC","Store" ,"StoreFP",
"LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" ,
"LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
"StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter",
"StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorMasked", "StoreVectorMasked",
"LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
"LoadPLocked",
"StorePConditional", "StoreIConditional", "StoreLConditional",
@ -4181,7 +4182,7 @@ bool MatchRule::is_vector() const {
"VectorRearrange","VectorLoadShuffle", "VectorLoadConst",
"VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X",
"VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret",
"VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD", "FmaVF","PopCountVI",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",

View File

@ -684,6 +684,8 @@ bool ArrayCopyNode::may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTra
assert(c == mb->in(0) || (ac != NULL && ac->is_clonebasic() && !use_ReduceInitialCardMarks), "only for clone");
#endif
return true;
} else if (mb->trailing_partial_array_copy()) {
return true;
}
return false;
@ -730,3 +732,16 @@ bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransf
}
return false;
}
// As an optimization, choose optimum vector size for copy length known at compile time.
// Returns the number of lanes of the vector used for a partially inlined copy
// of elements of the given type.
//
//   type      - element type of the copy
//   const_len - copy length in elements when known at compile time; any value
//               <= 0 means "unknown"
//
// With a known length the smallest of a 16 or 32 byte vector that covers the
// copy is chosen; otherwise (or for longer copies) the vector spans
// ArrayCopyPartialInlineSize bytes.
int ArrayCopyNode::get_partial_inline_vector_lane_count(BasicType type, int const_len) {
  const int elem_bytes = type2aelembytes(type);
  if (const_len > 0) {
    const int size_in_bytes = const_len * elem_bytes;
    if (size_in_bytes <= 16) {
      return 16 / elem_bytes;
    }
    if (size_in_bytes <= 32) {
      return 32 / elem_bytes;
    }
  }
  return ArrayCopyPartialInlineSize / elem_bytes;
}

View File

@ -180,6 +180,9 @@ public:
bool has_negative_length_guard() const { return _has_negative_length_guard; }
static bool may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase, ArrayCopyNode*& ac);
static int get_partial_inline_vector_lane_count(BasicType type, int const_len);
bool modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify) const;
#ifndef PRODUCT

View File

@ -80,6 +80,10 @@
"actual size could be less depending on elements type") \
range(0, max_jint) \
\
product(intx, ArrayCopyPartialInlineSize, -1, DIAGNOSTIC, \
"Partial inline size used for array copy acceleration.") \
range(-1, 64) \
\
product(bool, AlignVector, true, \
"Perform vector store/load alignment in loop") \
\

View File

@ -407,6 +407,9 @@ macro(LoadVector)
macro(LoadVectorGather)
macro(StoreVector)
macro(StoreVectorScatter)
macro(LoadVectorMasked)
macro(StoreVectorMasked)
macro(VectorMaskGen)
macro(Pack)
macro(PackB)
macro(PackS)

View File

@ -3355,6 +3355,9 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f
case Op_StoreVector:
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
break;
case Op_AddReductionVI:

View File

@ -687,6 +687,7 @@ void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_
case Op_StoreN:
case Op_StoreVector:
case Op_StoreVectorScatter:
case Op_StoreVectorMasked:
case Op_StoreNKlass:
for (uint k = 1; k < m->req(); k++) {
Node *in = m->in(k);

View File

@ -243,8 +243,10 @@ static Node *scan_mem_chain(Node *mem, int alias_idx, int offset, Node *start_me
} else if (in->is_MemBar()) {
ArrayCopyNode* ac = NULL;
if (ArrayCopyNode::may_modify(tinst, in->as_MemBar(), phase, ac)) {
assert(ac != NULL && ac->is_clonebasic(), "Only basic clone is a non escaping clone");
return ac;
if (ac != NULL) {
assert(ac->is_clonebasic(), "Only basic clone is a non escaping clone");
return ac;
}
}
mem = in->in(TypeFunc::Memory);
} else {

View File

@ -126,6 +126,11 @@ private:
// helper methods modeled after LibraryCallKit for array copy
Node* generate_guard(Node** ctrl, Node* test, RegionNode* region, float true_prob);
Node* generate_slow_guard(Node** ctrl, Node* test, RegionNode* region);
void generate_partial_inlining_block(Node** ctrl, MergeMemNode** mem, const TypePtr* adr_type,
RegionNode** exit_block, Node** result_memory, Node* length,
Node* src_start, Node* dst_start, BasicType type);
void generate_negative_guard(Node** ctrl, Node* index, RegionNode* region);
void generate_limit_guard(Node** ctrl, Node* offset, Node* subseq_length, Node* array_length, RegionNode* region);
@ -174,7 +179,7 @@ private:
Node* src, Node* src_offset,
Node* dest, Node* dest_offset,
Node* copy_length, bool dest_uninitialized);
void generate_unchecked_arraycopy(Node** ctrl, MergeMemNode** mem,
bool generate_unchecked_arraycopy(Node** ctrl, MergeMemNode** mem,
const TypePtr* adr_type,
BasicType basic_elem_type,
bool disjoint_bases,

View File

@ -27,6 +27,7 @@
#include "opto/arraycopynode.hpp"
#include "oops/objArrayKlass.hpp"
#include "opto/convertnode.hpp"
#include "opto/vectornode.hpp"
#include "opto/graphKit.hpp"
#include "opto/macro.hpp"
#include "opto/runtime.hpp"
@ -169,6 +170,98 @@ void PhaseMacroExpand::generate_limit_guard(Node** ctrl, Node* offset, Node* sub
generate_guard(ctrl, bol_lt, region, PROB_MIN);
}
//
// Partial in-lining handling for smaller conjoint/disjoint array copies having
// length(in bytes) less than ArrayCopyPartialInlineSize.
// if (length <= ArrayCopyPartialInlineSize) {
// partial_inlining_block:
// mask = Mask_Gen
// vload = LoadVectorMasked src , mask
// StoreVectorMasked dst, mask, vload
// } else {
// stub_block:
// callstub array_copy
// }
// exit_block:
// Phi = label partial_inlining_block:mem , label stub_block:mem (filled by caller)
// mem = MergeMem (Phi)
// control = stub_block
//
// The exit_block region and its memory phi are only partially initialized here, with the
// edges coming from partial_inlining_block. The remaining edges, coming from stub_block,
// are connected by the caller after the stub call nodes have been created.
//
// Builds the inlined masked-copy path for a small array copy and the guard
// that selects between it and the stub call.
//
//   ctrl          - in: current control; out: control of the stub path
//   mem           - memory state the masked load/store are attached to
//   adr_type      - address type of the destination memory slice
//   exit_block    - out: new 3-input region; only input 1 (inline path) is set
//   result_memory - out: memory phi on exit_block; only input 1 is set
//   length        - copy length in elements
//   src_start     - address of the first source element
//   dst_start     - address of the first destination element
//   type          - element type of the copy
//
// Nothing is generated (and no out parameter is written) when the copy length
// is a constant larger than ArrayCopyPartialInlineSize bytes, or when the
// target lacks the masked load/store/mask-gen rules for the chosen vector.
void PhaseMacroExpand::generate_partial_inlining_block(Node** ctrl, MergeMemNode** mem, const TypePtr* adr_type,
                                                       RegionNode** exit_block, Node** result_memory, Node* length,
                                                       Node* src_start, Node* dst_start, BasicType type) {
  const TypePtr *src_adr_type = _igvn.type(src_start)->isa_ptr();
  Node* inline_block = NULL;
  Node* stub_block = NULL;

  // Compile-time copy length, when known. It is tracked both in elements
  // (what the lane count helper expects) and in bytes (what the inline size
  // limit is expressed in). Passing the byte count to the helper would scale
  // the length by the element size twice and pick a wider vector than needed.
  int const_len = -1;
  int const_len_in_bytes = -1;
  const TypeInt* lty = NULL;
  uint shift = exact_log2(type2aelembytes(type));
  if (length->Opcode() == Op_ConvI2L) {
    lty = _igvn.type(length->in(1))->isa_int();
  } else {
    lty = _igvn.type(length)->isa_int();
  }
  if (lty && lty->is_con()) {
    const_len = lty->get_con();
    const_len_in_bytes = const_len << shift;
  }

  // Return if copy length is greater than partial inline size limit or
  // target does not support masked load/stores.
  int lane_count = ArrayCopyNode::get_partial_inline_vector_lane_count(type, const_len);
  if ( const_len_in_bytes > ArrayCopyPartialInlineSize ||
      !Matcher::match_rule_supported_vector(Op_LoadVectorMasked, lane_count, type)  ||
      !Matcher::match_rule_supported_vector(Op_StoreVectorMasked, lane_count, type) ||
      !Matcher::match_rule_supported_vector(Op_VectorMaskGen, lane_count, type)) {
    return;
  }

  // if (length_in_bytes <= ArrayCopyPartialInlineSize) goto inline_block
  Node* copy_bytes = new LShiftXNode(length, intcon(shift));
  transform_later(copy_bytes);
  Node* cmp_le = new CmpULNode(copy_bytes, longcon(ArrayCopyPartialInlineSize));
  transform_later(cmp_le);
  Node* bol_le = new BoolNode(cmp_le, BoolTest::le);
  transform_later(bol_le);
  inline_block = generate_guard(ctrl, bol_le, NULL, PROB_FAIR);
  stub_block = *ctrl;

  // The mask is generated from the element count.
  Node* mask_gen = new VectorMaskGenNode(length, TypeLong::LONG, Type::get_const_basic_type(type));
  transform_later(mask_gen);

  // Record the vector size used so that the compilation accounts for it.
  unsigned vec_size = lane_count * type2aelembytes(type);
  if (C->max_vector_size() < vec_size) {
    C->set_max_vector_size(vec_size);
  }

  const TypeVect * vt = TypeVect::make(type, lane_count);
  Node* mm = (*mem)->memory_at(C->get_alias_index(src_adr_type));
  Node* masked_load = new LoadVectorMaskedNode(inline_block, mm, src_start,
                                               src_adr_type, vt, mask_gen);
  transform_later(masked_load);

  mm = (*mem)->memory_at(C->get_alias_index(adr_type));
  Node* masked_store = new StoreVectorMaskedNode(inline_block, mm, dst_start,
                                                 masked_load, adr_type, mask_gen);
  transform_later(masked_store);

  // Convergence region for inline_block and stub_block.
  *exit_block = new RegionNode(3);
  transform_later(*exit_block);
  (*exit_block)->init_req(1, inline_block);
  *result_memory = new PhiNode(*exit_block, Type::MEMORY, adr_type);
  transform_later(*result_memory);
  (*result_memory)->init_req(1, masked_store);

  // The caller continues on the stub path and later wires input 2.
  *ctrl = stub_block;
}
Node* PhaseMacroExpand::generate_nonpositive_guard(Node** ctrl, Node* index, bool never_negative) {
if ((*ctrl)->is_top()) return NULL;
@ -559,16 +652,17 @@ Node* PhaseMacroExpand::generate_arraycopy(ArrayCopyNode *ac, AllocateArrayNode*
}
}
bool is_partial_array_copy = false;
if (!(*ctrl)->is_top()) {
// Generate the fast path, if possible.
Node* local_ctrl = *ctrl;
MergeMemNode* local_mem = MergeMemNode::make(mem);
transform_later(local_mem);
generate_unchecked_arraycopy(&local_ctrl, &local_mem,
adr_type, copy_type, disjoint_bases,
src, src_offset, dest, dest_offset,
ConvI2X(copy_length), dest_uninitialized);
is_partial_array_copy = generate_unchecked_arraycopy(&local_ctrl, &local_mem,
adr_type, copy_type, disjoint_bases,
src, src_offset, dest, dest_offset,
ConvI2X(copy_length), dest_uninitialized);
// Present the results of the fast call.
result_region->init_req(fast_path, local_ctrl);
@ -715,13 +809,19 @@ Node* PhaseMacroExpand::generate_arraycopy(ArrayCopyNode *ac, AllocateArrayNode*
insert_mem_bar(ctrl, &out_mem, Op_MemBarCPUOrder);
}
if (is_partial_array_copy) {
assert((*ctrl)->is_Proj(), "MemBar control projection");
assert((*ctrl)->in(0)->isa_MemBar(), "MemBar node");
(*ctrl)->in(0)->isa_MemBar()->set_trailing_partial_array_copy();
}
_igvn.replace_node(_memproj_fallthrough, out_mem);
_igvn.replace_node(_ioproj_fallthrough, *io);
_igvn.replace_node(_fallthroughcatchproj, *ctrl);
#ifdef ASSERT
const TypeOopPtr* dest_t = _igvn.type(dest)->is_oopptr();
if (dest_t->is_known_instance()) {
if (dest_t->is_known_instance() && !is_partial_array_copy) {
ArrayCopyNode* ac = NULL;
assert(ArrayCopyNode::may_modify(dest_t, (*ctrl)->in(0)->as_MemBar(), &_igvn, ac), "dependency on arraycopy lost");
assert(ac == NULL, "no arraycopy anymore");
@ -1053,14 +1153,14 @@ Node* PhaseMacroExpand::generate_generic_arraycopy(Node** ctrl, MergeMemNode** m
}
// Helper function; generates the fast out-of-line call to an arraycopy stub.
void PhaseMacroExpand::generate_unchecked_arraycopy(Node** ctrl, MergeMemNode** mem,
bool PhaseMacroExpand::generate_unchecked_arraycopy(Node** ctrl, MergeMemNode** mem,
const TypePtr* adr_type,
BasicType basic_elem_type,
bool disjoint_bases,
Node* src, Node* src_offset,
Node* dest, Node* dest_offset,
Node* copy_length, bool dest_uninitialized) {
if ((*ctrl)->is_top()) return;
if ((*ctrl)->is_top()) return false;
Node* src_start = src;
Node* dest_start = dest;
@ -1075,11 +1175,39 @@ void PhaseMacroExpand::generate_unchecked_arraycopy(Node** ctrl, MergeMemNode**
basictype2arraycopy(basic_elem_type, src_offset, dest_offset,
disjoint_bases, copyfunc_name, dest_uninitialized);
Node* result_memory = NULL;
RegionNode* exit_block = NULL;
if (ArrayCopyPartialInlineSize > 0 && is_subword_type(basic_elem_type) &&
Matcher::vector_width_in_bytes(basic_elem_type) >= 16) {
generate_partial_inlining_block(ctrl, mem, adr_type, &exit_block, &result_memory,
copy_length, src_start, dest_start, basic_elem_type);
}
const TypeFunc* call_type = OptoRuntime::fast_arraycopy_Type();
Node* call = make_leaf_call(*ctrl, *mem, call_type, copyfunc_addr, copyfunc_name, adr_type,
src_start, dest_start, copy_length XTOP);
finish_arraycopy_call(call, ctrl, mem, adr_type);
// Connecting remaining edges for exit_block coming from stub_block.
if (exit_block) {
exit_block->init_req(2, *ctrl);
// Memory edge corresponding to stub_region.
result_memory->init_req(2, *mem);
uint alias_idx = C->get_alias_index(adr_type);
if (alias_idx != Compile::AliasIdxBot) {
*mem = MergeMemNode::make(*mem);
(*mem)->set_memory_at(alias_idx, result_memory);
} else {
*mem = MergeMemNode::make(result_memory);
}
transform_later(*mem);
*ctrl = exit_block;
return true;
}
return false;
}
void PhaseMacroExpand::expand_arraycopy_node(ArrayCopyNode *ac) {

View File

@ -2213,6 +2213,7 @@ bool Matcher::find_shared_visit(MStack& mstack, Node* n, uint opcode, bool& mem_
case Op_FmaVD:
case Op_FmaVF:
case Op_MacroLogicV:
case Op_LoadVectorMasked:
set_shared(n); // Force result into register (it will be anyways)
break;
case Op_ConP: { // Convert pointers above the centerline to NUL
@ -2315,6 +2316,12 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
n->del_req(3);
break;
}
case Op_StoreVectorMasked: {
Node* pair = new BinaryNode(n->in(3), n->in(4));
n->set_req(3, pair);
n->del_req(4);
break;
}
case Op_LoopLimit: {
Node* pair1 = new BinaryNode(n->in(1), n->in(2));
n->set_req(1, pair1);

View File

@ -1190,7 +1190,8 @@ class MemBarNode: public MultiNode {
TrailingStore,
LeadingStore,
TrailingLoadStore,
LeadingLoadStore
LeadingLoadStore,
TrailingPartialArrayCopy
} _kind;
#ifdef ASSERT
@ -1227,6 +1228,8 @@ public:
bool trailing() const { return _kind == TrailingLoad || _kind == TrailingStore || _kind == TrailingLoadStore; }
bool leading() const { return _kind == LeadingStore || _kind == LeadingLoadStore; }
bool standalone() const { return _kind == Standalone; }
void set_trailing_partial_array_copy() { _kind = TrailingPartialArrayCopy; }
bool trailing_partial_array_copy() const { return _kind == TrailingPartialArrayCopy; }
static void set_store_pair(MemBarNode* leading, MemBarNode* trailing);
static void set_load_store_pair(MemBarNode* leading, MemBarNode* trailing);

View File

@ -157,6 +157,8 @@ class TypeNode;
class UnlockNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
class StoreVectorMaskedNode;
class LoadVectorGatherNode;
class StoreVectorNode;
class StoreVectorScatterNode;
@ -692,13 +694,15 @@ public:
DEFINE_CLASS_ID(Parm, Proj, 4)
DEFINE_CLASS_ID(MachProj, Proj, 5)
DEFINE_CLASS_ID(Mem, Node, 4)
DEFINE_CLASS_ID(Load, Mem, 0)
DEFINE_CLASS_ID(Mem, Node, 4)
DEFINE_CLASS_ID(Load, Mem, 0)
DEFINE_CLASS_ID(LoadVector, Load, 0)
DEFINE_CLASS_ID(LoadVectorGather, LoadVector, 0)
DEFINE_CLASS_ID(LoadVectorMasked, LoadVector, 1)
DEFINE_CLASS_ID(Store, Mem, 1)
DEFINE_CLASS_ID(StoreVector, Store, 0)
DEFINE_CLASS_ID(StoreVectorScatter, StoreVector, 0)
DEFINE_CLASS_ID(StoreVectorMasked, StoreVector, 1)
DEFINE_CLASS_ID(LoadStore, Mem, 2)
DEFINE_CLASS_ID(LoadStoreConditional, LoadStore, 0)
DEFINE_CLASS_ID(CompareAndSwap, LoadStoreConditional, 0)

View File

@ -759,6 +759,41 @@ StoreVectorNode* StoreVectorNode::make(int opc, Node* ctl, Node* mem,
return new StoreVectorNode(ctl, mem, adr, atyp, val);
}
// Replaces this masked load by a plain LoadVector when the mask is known to
// cover the whole vector.
//
// The mask (input 3) must be a VectorMaskGen whose length input is a constant.
// The replacement is made only when the constant copy size is 32 or 64 bytes
// AND equals the size of this node's vector: a full-width load of a wider
// vector would read more bytes than the mask allows.
// Returns the replacement node, or NULL when no transformation applies.
Node* LoadVectorMaskedNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  Node* mask = in(3);
  // Defensive: only look through the mask when it really is a mask generator.
  if (mask->is_top() || mask->Opcode() != Op_VectorMaskGen) {
    return NULL;
  }
  Node* mask_len = mask->in(1);
  const TypeLong* ty = phase->type(mask_len)->isa_long();
  if (ty && ty->is_con()) {
    BasicType mask_bt = ((VectorMaskGenNode*)mask)->get_elem_type()->array_element_basic_type();
    uint load_sz = type2aelembytes(mask_bt) * ty->get_con();
    if ((load_sz == 32 || load_sz == 64) && load_sz == static_cast<uint>(memory_size())) {
      assert(load_sz == 32 || MaxVectorSize > 32, "Unexpected load size");
      Node* ctr = in(MemNode::Control);
      Node* mem = in(MemNode::Memory);
      Node* adr = in(MemNode::Address);
      return phase->transform(new LoadVectorNode(ctr, mem, adr, adr_type(), vect_type()));
    }
  }
  return NULL;
}
// Replaces this masked store by a plain StoreVector when the mask is known to
// cover the whole vector.
//
// The mask (input 4) must be a VectorMaskGen whose length input is a constant.
// The replacement is made only when the constant copy size is 32 or 64 bytes
// AND equals the size of the stored vector: a full-width store of a wider
// vector would write more bytes than the mask allows.
// Returns the replacement node, or NULL when no transformation applies.
Node* StoreVectorMaskedNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  Node* mask = in(4);
  // Defensive: only look through the mask when it really is a mask generator.
  if (mask->is_top() || mask->Opcode() != Op_VectorMaskGen) {
    return NULL;
  }
  Node* mask_len = mask->in(1);
  const TypeLong* ty = phase->type(mask_len)->isa_long();
  if (ty && ty->is_con()) {
    BasicType mask_bt = ((VectorMaskGenNode*)mask)->get_elem_type()->array_element_basic_type();
    uint store_sz = type2aelembytes(mask_bt) * ty->get_con();
    if ((store_sz == 32 || store_sz == 64) && store_sz == static_cast<uint>(memory_size())) {
      assert(store_sz == 32 || MaxVectorSize > 32, "Unexpected store size");
      Node* ctr = in(MemNode::Control);
      Node* mem = in(MemNode::Memory);
      Node* adr = in(MemNode::Address);
      Node* val = in(MemNode::ValueIn);
      return phase->transform(new StoreVectorNode(ctr, mem, adr, adr_type(), val));
    }
  }
  return NULL;
}
int ExtractNode::opcode(BasicType bt) {
switch (bt) {
case T_BOOLEAN: return Op_ExtractUB;

View File

@ -778,6 +778,56 @@ class StoreVectorNode : public StoreNode {
idx == MemNode::ValueIn + 1; }
};
// Vector store with an additional mask input.
// The mask is appended as the last required input (after the stored value)
// and must be of long type. The access is flagged as mismatched.
class StoreVectorMaskedNode : public StoreVectorNode {
 public:
  StoreVectorMaskedNode(Node* c, Node* mem, Node* dst, Node* src, const TypePtr* at, Node* mask)
   : StoreVectorNode(c, mem, dst, at, src) {
    assert(mask->bottom_type()->is_long(), "sanity");
    // Register under the dedicated class id so that masked stores can be told
    // apart from plain vector stores; StoreVector queries still match.
    init_class_id(Class_StoreVectorMasked);
    set_mismatched_access();
    add_req(mask);
  }

  virtual int Opcode() const;

  // Every input past the memory edge (address, value, mask) is matched.
  virtual uint match_edge(uint idx) const {
    return idx > 1;
  }
  virtual Node* Ideal(PhaseGVN* phase, bool can_reshape);
};
// Vector load with an additional mask input.
// The mask is appended as the last required input (after the address) and
// must be of long type. The access is flagged as mismatched.
class LoadVectorMaskedNode : public LoadVectorNode {
 public:
  LoadVectorMaskedNode(Node* c, Node* mem, Node* src, const TypePtr* at, const TypeVect* vt, Node* mask)
   : LoadVectorNode(c, mem, src, at, vt) {
    assert(mask->bottom_type()->is_long(), "sanity");
    // Register under the dedicated class id so that masked loads can be told
    // apart from plain vector loads; LoadVector queries still match.
    init_class_id(Class_LoadVectorMasked);
    set_mismatched_access();
    add_req(mask);
  }

  virtual int Opcode() const;

  // Every input past the memory edge (address, mask) is matched.
  virtual uint match_edge(uint idx) const {
    return idx > 1;
  }
  virtual Node* Ideal(PhaseGVN* phase, bool can_reshape);
};
// Produces a mask value from a length.
// Input 1 is the length; the node's own type is 'ty'. The element type the
// mask applies to is kept alongside and exposed through get_elem_type().
class VectorMaskGenNode : public TypeNode {
 private:
  const Type* _elemType;  // element type the mask is generated for

 public:
  VectorMaskGenNode(Node* length, const Type* ty, const Type* ety)
    : TypeNode(ty, 2), _elemType(ety) {
    init_req(1, length);
  }

  virtual int Opcode() const;
  const Type* get_elem_type() { return _elemType; }
  // This class adds a field, so the node size must be reported explicitly.
  virtual uint size_of() const { return sizeof(VectorMaskGenNode); }
};
//=========================Promote_Scalar_to_Vector============================
//------------------------------ReplicateBNode---------------------------------