8325155: C2 SuperWord: remove alignment boundaries

Reviewed-by: chagedorn, kvn
This commit is contained in:
Emanuel Peter 2024-06-07 05:01:23 +00:00
parent d8af58941b
commit 944aeb81b1
7 changed files with 722 additions and 470 deletions

View File

@ -43,7 +43,6 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
_vloop_analyzer(vloop_analyzer),
_vloop(vloop_analyzer.vloop()),
_arena(mtCompiler),
_node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase()->C->clone_map()), // map of nodes created in cloning
_pairset(&_arena, _vloop_analyzer),
_packset(&_arena, _vloop_analyzer
@ -453,11 +452,8 @@ bool SuperWord::transform_loop() {
bool SuperWord::SLP_extract() {
assert(cl()->is_main_loop(), "SLP should only work on main loops");
// Ensure extra info is allocated.
initialize_node_info();
// Attempt vectorization
find_adjacent_refs();
// Find "seed" pairs.
create_adjacent_memop_pairs();
if (_pairset.is_empty()) {
#ifndef PRODUCT
@ -491,245 +487,133 @@ bool SuperWord::SLP_extract() {
return output();
}
//------------------------------find_adjacent_refs---------------------------
// Find the adjacent memory references and create pack pairs for them.
// We can find adjacent memory references by comparing their relative
// alignment. Whether the final vectors can be aligned is determined later
// once all vectors are extended and combined.
void SuperWord::find_adjacent_refs() {
// Get list of memory operations
Node_List memops;
for (int i = 0; i < body().length(); i++) {
Node* n = body().at(i);
if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
is_java_primitive(n->as_Mem()->memory_type())) {
int align = memory_alignment(n->as_Mem(), 0);
if (align != bottom_align) {
memops.push(n);
}
}
}
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
void SuperWord::create_adjacent_memop_pairs() {
ResourceMark rm;
GrowableArray<const VPointer*> vpointers;
collect_valid_vpointers(vpointers);
// Sort the VPointers. This does 2 things:
// - Separate the VPointer into groups: all memops that have the same opcode and the same
// VPointer, except for the offset. Adjacent memops must have the same opcode and the
// same VPointer, except for a shift in the offset. Thus, two memops can only be adjacent
// if they are in the same group. This decreases the work.
// - Sort by offset inside the groups. This decreases the work needed to determine adjacent
// memops inside a group.
vpointers.sort(VPointer::cmp_for_sort);
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
tty->print_cr("\nSuperWord::create_adjacent_memop_pairs:");
}
#endif
int max_idx;
while (memops.size() != 0) {
// Find a memory reference to align to.
MemNode* mem_ref = find_align_to_ref(memops, max_idx);
if (mem_ref == nullptr) break;
int iv_adjustment = get_iv_adjustment(mem_ref);
const VPointer& align_to_ref_p = vpointer(mem_ref);
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref) &&
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
const VPointer& p2 = vpointer(s);
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
}
}
}
// Create initial pack pairs of memory operations for which alignment was set.
for (uint i = 0; i < memops.size(); i++) {
Node* s1 = memops.at(i);
int align = alignment(s1);
if (align == top_align) continue;
for (uint j = 0; j < memops.size(); j++) {
Node* s2 = memops.at(j);
if (alignment(s2) == top_align) continue;
if (s1 != s2 && are_adjacent_refs(s1, s2)) {
if (stmts_can_pack(s1, s2, align)) {
if (!_do_vector_loop || same_origin_idx(s1, s2)) {
_pairset.add_pair(s1, s2);
}
}
}
}
}
// Remove used mem nodes.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* m = memops.at(i)->as_Mem();
if (alignment(m) != top_align) {
memops.remove(i);
}
}
} // while (memops.size() != 0)
create_adjacent_memop_pairs_in_all_groups(vpointers);
#ifndef PRODUCT
if (is_trace_superword_packset()) {
tty->print_cr("\nAfter Superword::find_adjacent_refs");
tty->print_cr("\nAfter Superword::create_adjacent_memop_pairs");
_pairset.print();
}
#endif
}
//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
// Count number of comparable memory ops
for (uint i = 0; i < memops.size(); i++) {
MemNode* s1 = memops.at(i)->as_Mem();
const VPointer& p1 = vpointer(s1);
for (uint j = i+1; j < memops.size(); j++) {
MemNode* s2 = memops.at(j)->as_Mem();
if (isomorphic(s1, s2)) {
const VPointer& p2 = vpointer(s2);
if (p1.comparable(p2)) {
(*cmp_ct.adr_at(i))++;
(*cmp_ct.adr_at(j))++;
}
}
// Collect all memops vpointers that could potentially be vectorized.
void SuperWord::collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers) {
for_each_mem([&] (const MemNode* mem, int bb_idx) {
const VPointer& p = vpointer(mem);
if (p.valid() &&
!mem->is_LoadStore() &&
is_java_primitive(mem->memory_type())) {
vpointers.append(&p);
}
}
// Find Store (or Load) with the greatest number of "comparable" references,
// biggest vector size, smallest data size and smallest iv offset.
int max_ct = 0;
int max_vw = 0;
int max_idx = -1;
int min_size = max_jint;
int min_iv_offset = max_jint;
for (uint j = 0; j < memops.size(); j++) {
MemNode* s = memops.at(j)->as_Mem();
if (s->is_Store()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
(vw == max_vw &&
( data_size(s) < min_size ||
(data_size(s) == min_size &&
p.offset_in_bytes() < min_iv_offset)))))) {
max_ct = cmp_ct.at(j);
max_vw = vw;
max_idx = j;
min_size = data_size(s);
min_iv_offset = p.offset_in_bytes();
}
}
}
// If no stores, look at loads
if (max_ct == 0) {
for (uint j = 0; j < memops.size(); j++) {
MemNode* s = memops.at(j)->as_Mem();
if (s->is_Load()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
const VPointer& p = vpointer(s);
if ( cmp_ct.at(j) > max_ct ||
(cmp_ct.at(j) == max_ct &&
( vw > max_vw ||
(vw == max_vw &&
( data_size(s) < min_size ||
(data_size(s) == min_size &&
p.offset_in_bytes() < min_iv_offset)))))) {
max_ct = cmp_ct.at(j);
max_vw = vw;
max_idx = j;
min_size = data_size(s);
min_iv_offset = p.offset_in_bytes();
}
}
}
}
#ifndef PRODUCT
if (is_trace_superword_verbose()) {
tty->print_cr("\nVector memops after find_align_to_ref");
for (uint i = 0; i < memops.size(); i++) {
MemNode* s = memops.at(i)->as_Mem();
s->dump();
}
}
#endif
idx = max_idx;
if (max_ct > 0) {
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
tty->print("SuperWord::find_align_to_ref: ");
memops.at(max_idx)->as_Mem()->dump();
}
#endif
return memops.at(max_idx)->as_Mem();
}
return nullptr;
});
}
//---------------------------get_vw_bytes_special------------------------
int SuperWord::get_vw_bytes_special(MemNode* s) {
// Get the vector width in bytes.
int vw = vector_width_in_bytes(s);
// Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
BasicType btype = velt_basic_type(s);
if (type2aelembytes(btype) == 2) {
bool should_combine_adjacent = true;
for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
Node* user = s->fast_out(i);
if (!VectorNode::is_muladds2i(user)) {
should_combine_adjacent = false;
}
}
if (should_combine_adjacent) {
vw = MIN2(Matcher::max_vector_size_auto_vectorization(btype)*type2aelembytes(btype), vw * 2);
}
// For each group, find the adjacent memops.
void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*> &vpointers) {
int group_start = 0;
while (group_start < vpointers.length()) {
int group_end = find_group_end(vpointers, group_start);
create_adjacent_memop_pairs_in_one_group(vpointers, group_start, group_end);
group_start = group_end;
}
// Check for special case where there is a type conversion between different data size.
int vectsize = max_vector_size_in_def_use_chain(s);
if (vectsize < Matcher::max_vector_size_auto_vectorization(btype)) {
vw = MIN2(vectsize * type2aelembytes(btype), vw);
}
return vw;
}
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
const VPointer& align_to_ref_p = vpointer(mem_ref);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
int vw = get_vw_bytes_special(mem_ref);
assert(vw > 1, "sanity");
int iv_adjustment;
if (scale != 0) {
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
// At least one iteration is executed in pre-loop by default. As result
// several iterations are needed to align memory operations in main-loop even
// if offset is 0.
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
iv_adjustment = iv_adjustment_in_bytes/elt_size;
} else {
// This memory op is not dependent on iv (scale == 0)
iv_adjustment = 0;
// Step forward until we find a VPointer of another group, or we reach the end of the array.
int SuperWord::find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start) {
  const int length = vpointers.length();
  int group_end = group_start + 1;
  // Advance while the next VPointer still compares equal to the group's first
  // element under the group ordering (same base, opcode, scale and invar).
  for (; group_end < length; group_end++) {
    if (VPointer::cmp_for_sort_by_group(vpointers.adr_at(group_start),
                                        vpointers.adr_at(group_end)) != 0) {
      break;
    }
  }
  return group_end;
}
// Find adjacent memops for a single group, e.g. for all LoadI of the same base, invar, etc.
// Create pairs and add them to the pairset.
void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, const int group_end) {
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
mem_ref->dump();
if (is_trace_superword_adjacent_memops()) {
tty->print_cr(" group:");
for (int i = group_start; i < group_end; i++) {
const VPointer* p = vpointers.at(i);
tty->print(" ");
p->print();
}
}
#endif
return iv_adjustment;
MemNode* first = vpointers.at(group_start)->mem();
int element_size = data_size(first);
// For each ref in group: find others that can be paired:
for (int i = group_start; i < group_end; i++) {
const VPointer* p1 = vpointers.at(i);
MemNode* mem1 = p1->mem();
bool found = false;
// For each ref in group with larger or equal offset:
for (int j = i + 1; j < group_end; j++) {
const VPointer* p2 = vpointers.at(j);
MemNode* mem2 = p2->mem();
assert(mem1 != mem2, "look only at pair of different memops");
// Check for correct distance.
assert(data_size(mem1) == element_size, "all nodes in group must have the same element size");
assert(data_size(mem2) == element_size, "all nodes in group must have the same element size");
assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset");
if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; }
if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; }
// Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize)
if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; }
if (!can_pack_into_pair(mem1, mem2)) { continue; }
#ifndef PRODUCT
if (is_trace_superword_adjacent_memops()) {
if (found) {
tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:");
} else {
tty->print_cr(" pair:");
}
tty->print(" ");
p1->print();
tty->print(" ");
p2->print();
}
#endif
if (!found) {
_pairset.add_pair(mem1, mem2);
}
}
}
}
void VLoopMemorySlices::find_memory_slices() {
@ -809,10 +693,8 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail,
#endif
}
//------------------------------stmts_can_pack---------------------------
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and
// s1 aligned at "align"
bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
// Check if two nodes can be packed into a pair.
bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) {
// Do not use superword for non-primitives
BasicType bt1 = velt_basic_type(s1);
@ -831,13 +713,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) {
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
int s1_align = alignment(s1);
int s2_align = alignment(s2);
if (s1_align == top_align || s1_align == align) {
if (s2_align == top_align || s2_align == align + data_size(s1)) {
return true;
}
}
return true;
}
}
}
@ -1013,16 +889,6 @@ bool VLoopReductions::is_marked_reduction_pair(const Node* s1, const Node* s2) c
return false;
}
//------------------------------set_alignment---------------------------
void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
set_alignment(s1, align);
if (align == top_align || align == bottom_align) {
set_alignment(s2, align);
} else {
set_alignment(s2, align + data_size(s1));
}
}
// Extend pairset by following use->def and def->use links from pair members.
void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
bool changed;
@ -1058,57 +924,25 @@ void SuperWord::extend_pairset_with_more_pairs_by_following_use_and_def() {
#endif
}
//------------------------------adjust_alignment_for_type_conversion---------------------------------
// Adjust the target alignment if conversion between different data size exists in def-use nodes.
int SuperWord::adjust_alignment_for_type_conversion(Node* s, Node* t, int align) {
// Do not use superword for non-primitives
BasicType bt1 = velt_basic_type(s);
BasicType bt2 = velt_basic_type(t);
if (!is_java_primitive(bt1) || !is_java_primitive(bt2)) {
return align;
}
if (longer_type_for_conversion(s) != T_ILLEGAL ||
longer_type_for_conversion(t) != T_ILLEGAL) {
align = align / data_size(s) * data_size(t);
}
return align;
}
bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2) {
assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
assert(s1->req() == s2->req(), "just checking");
assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
if (s1->is_Load()) return false;
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: s1 %d, align %d",
s1->_idx, alignment(s1));
}
#endif
bool changed = false;
int start = s1->is_Store() ? MemNode::ValueIn : 1;
int end = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
for (int j = start; j < end; j++) {
int align = alignment(s1);
Node* t1 = s1->in(j);
Node* t2 = s2->in(j);
if (!in_bb(t1) || !in_bb(t2) || t1->is_Mem() || t2->is_Mem()) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
align = adjust_alignment_for_type_conversion(s1, t1, align);
if (stmts_can_pack(t1, t2, align)) {
if (can_pack_into_pair(t1, t2)) {
if (estimate_cost_savings_when_packing_as_pair(t1, t2) >= 0) {
_pairset.add_pair(t1, t2);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_def: set_alignment(%d, %d, %d)",
t1->_idx, t2->_idx, align);
}
#endif
set_alignment(t1, t2, align);
changed = true;
}
}
@ -1122,17 +956,9 @@ bool SuperWord::extend_pairset_with_more_pairs_by_following_def(Node* s1, Node*
bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node* s2) {
assert(_pairset.is_pair(s1, s2), "(s1, s2) must be a pair");
assert(s1->req() == s2->req(), "just checking");
assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
if (s1->is_Store()) return false;
int align = alignment(s1);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: s1 %d, align %d",
s1->_idx, align);
}
#endif
int savings = -1;
Node* u1 = nullptr;
Node* u2 = nullptr;
@ -1150,28 +976,18 @@ bool SuperWord::extend_pairset_with_more_pairs_by_following_use(Node* s1, Node*
}
if (t2->Opcode() == Op_AddI && t2 == cl()->incr()) continue; // don't mess with the iv
if (order_inputs_of_uses_to_match_def_pair(s1, s2, t1, t2) != PairOrderStatus::Ordered) { continue; }
int adjusted_align = alignment(s1);
adjusted_align = adjust_alignment_for_type_conversion(s1, t1, adjusted_align);
if (stmts_can_pack(t1, t2, adjusted_align)) {
if (can_pack_into_pair(t1, t2)) {
int my_savings = estimate_cost_savings_when_packing_as_pair(t1, t2);
if (my_savings > savings) {
savings = my_savings;
u1 = t1;
u2 = t2;
align = adjusted_align;
}
}
}
}
if (savings >= 0) {
_pairset.add_pair(u1, u2);
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::extend_pairset_with_more_pairs_by_following_use: set_alignment(%d, %d, %d)",
u1->_idx, u2->_idx, align);
}
#endif
set_alignment(u1, u2, align);
return true; // changed
}
return false; // no change
@ -1814,6 +1630,11 @@ uint SuperWord::max_implemented_size(const Node_List* pack) {
}
}
// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
// returns int type, but Vector API for them returns long type. To unify
// the implementation in backend, superword splits the vector implementation
// for Java API into an execution node with long type plus another node
// converting long to int.
bool SuperWord::requires_long_to_int_conversion(int opc) {
switch(opc) {
case Op_PopCountL:
@ -2948,7 +2769,17 @@ uint SuperWord::find_use_def_boundary(const Node_List* pack) const {
bool SuperWord::is_vector_use(Node* use, int u_idx) const {
Node_List* u_pk = get_pack(use);
if (u_pk == nullptr) return false;
if (is_marked_reduction(use)) return true;
// Reduction: first input is internal connection.
if (is_marked_reduction(use) && u_idx == 1) {
#ifdef ASSERT
for (uint i = 1; i < u_pk->size(); i++) {
assert(u_pk->at(i - 1) == u_pk->at(i)->in(1), "internal connection");
}
#endif
return true;
}
Node* def = use->in(u_idx);
Node_List* d_pk = get_pack(def);
if (d_pk == nullptr) {
@ -2975,51 +2806,64 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) const {
return true;
}
if (!is_velt_basic_type_compatible_use_def(use, def)) {
return false;
}
if (VectorNode::is_muladds2i(use)) {
// MulAddS2I takes shorts and produces ints - hence the special checks
// on alignment and size.
// MulAddS2I takes shorts and produces ints.
if (u_pk->size() * 2 != d_pk->size()) {
return false;
}
for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (alignment(ui) != alignment(di) * 2) {
return false;
}
}
return true;
}
if (u_pk->size() != d_pk->size())
if (u_pk->size() != d_pk->size()) {
return false;
if (longer_type_for_conversion(use) != T_ILLEGAL) {
// These opcodes take a type of a kind of size and produce a type of
// another size - hence the special checks on alignment and size.
for (uint i = 0; i < u_pk->size(); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (ui->in(u_idx) != di) {
return false;
}
if (alignment(ui) / type2aelembytes(velt_basic_type(ui)) !=
alignment(di) / type2aelembytes(velt_basic_type(di))) {
return false;
}
}
return true;
}
for (uint i = 0; i < u_pk->size(); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
if (ui->in(u_idx) != di) {
return false;
}
}
return true;
}
// Check if the output type of def is compatible with the input type of use, i.e. if the
// types have the same size.
bool SuperWord::is_velt_basic_type_compatible_use_def(Node* use, Node* def) const {
  assert(in_bb(def) && in_bb(use), "both use and def are in loop");
  // A conversion node accepts an input of a different size by definition.
  if (VectorNode::is_convert_opcode(use->Opcode())) {
    return true;
  }
  const BasicType use_bt = velt_basic_type(use);
  const BasicType def_bt = velt_basic_type(def);
  assert(is_java_primitive(use_bt), "sanity %s", type2name(use_bt));
  assert(is_java_primitive(def_bt), "sanity %s", type2name(def_bt));
  const int use_size = type2aelembytes(use_bt);
  const int def_size = type2aelembytes(def_bt);
  // Nodes like Long.bitCount: expect long (8 byte) input, and int (4 byte) output.
  if (requires_long_to_int_conversion(use->Opcode())) {
    return def_size == 8 && use_size == 4;
  }
  // MulAddS2I: expect short (2 byte) input, and int (4 byte) output.
  if (VectorNode::is_muladds2i(use)) {
    return def_size == 2 && use_size == 4;
  }
  // Default case: input size of use must equal output size of def.
  return use_size == def_size;
}
// Return nullptr if success, else failure message
VStatus VLoopBody::construct() {
assert(_body.is_empty(), "body is empty");
@ -3150,12 +2994,6 @@ VStatus VLoopBody::construct() {
return VStatus::make_success();
}
// Initialize per node info
void SuperWord::initialize_node_info() {
Node* last = body().at(body().length() - 1);
grow_node_info(bb_idx(last));
}
BasicType SuperWord::longer_type_for_conversion(Node* n) const {
if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
requires_long_to_int_conversion(n->Opcode())) ||
@ -3177,34 +3015,6 @@ BasicType SuperWord::longer_type_for_conversion(Node* n) const {
: (src_size > dst_size ? src_t : dst_t);
}
int SuperWord::max_vector_size_in_def_use_chain(Node* n) {
BasicType bt = velt_basic_type(n);
BasicType vt = bt;
// find the longest type among def nodes.
uint start, end;
VectorNode::vector_operands(n, &start, &end);
for (uint i = start; i < end; ++i) {
Node* input = n->in(i);
if (!in_bb(input)) continue;
BasicType newt = longer_type_for_conversion(input);
vt = (newt == T_ILLEGAL) ? vt : newt;
}
// find the longest type among use nodes.
for (uint i = 0; i < n->outcnt(); ++i) {
Node* output = n->raw_out(i);
if (!in_bb(output)) continue;
BasicType newt = longer_type_for_conversion(output);
vt = (newt == T_ILLEGAL) ? vt : newt;
}
int max = Matcher::max_vector_size_auto_vectorization(vt);
// If now there is no vectors for the longest type, the nodes with the longest
// type in the def-use chain are not packed in SuperWord::stmts_can_pack.
return max < 2 ? Matcher::max_vector_size_auto_vectorization(bt) : max;
}
void VLoopTypes::compute_vector_element_type() {
#ifndef PRODUCT
if (_vloop.is_trace_vector_element_type()) {
@ -3308,36 +3118,6 @@ void VLoopTypes::compute_vector_element_type() {
#endif
}
//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
}
#endif
const VPointer& p = vpointer(s);
if (!p.valid()) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
return bottom_align;
}
int vw = get_vw_bytes_special(s);
if (vw < 2) {
NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
return bottom_align; // No vectors for this type
}
int offset = p.offset_in_bytes();
offset += iv_adjust*p.memory_size();
int off_rem = offset % vw;
int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
if (is_trace_superword_alignment()) {
tty->print_cr("SuperWord::memory_alignment: off_rem = %d, off_mod = %d (offset = %d)", off_rem, off_mod, offset);
}
#endif
return off_mod;
}
// Smallest type containing range of values
const Type* VLoopTypes::container_type(Node* n) const {
if (n->is_Mem()) {
@ -3794,10 +3574,6 @@ void VLoopBody::print() const {
}
#endif
// ========================= SWNodeInfo =====================
const SWNodeInfo SWNodeInfo::initial;
//
// --------------------------------- vectorization/simd -----------------------------------
//

View File

@ -384,18 +384,6 @@ public:
NOT_PRODUCT(static void print_pack(Node_List* pack);)
};
// ========================= SuperWord =====================
// -----------------------------SWNodeInfo---------------------------------
// Per node info needed by SuperWord
class SWNodeInfo {
public:
int _alignment; // memory alignment for a node
SWNodeInfo() : _alignment(-1) {}
static const SWNodeInfo initial;
};
// -----------------------------SuperWord---------------------------------
// Transforms scalar operations into packed (superword) operations.
class SuperWord : public ResourceObj {
@ -407,9 +395,6 @@ class SuperWord : public ResourceObj {
// VSharedData, and reused over many AutoVectorizations.
Arena _arena;
enum consts { top_align = -1, bottom_align = -666 };
GrowableArray<SWNodeInfo> _node_info; // Info needed per node
CloneMap& _clone_map; // map of nodes created in cloning
PairSet _pairset;
@ -461,6 +446,11 @@ class SuperWord : public ResourceObj {
return _vloop_analyzer.body().bb_idx(n);
}
// Convenience forwarder: iterate over all memops of the loop body,
// invoking "callback" for each one (delegates to VLoopBody::for_each_mem).
template<typename Callback>
void for_each_mem(Callback callback) const {
return _vloop_analyzer.body().for_each_mem(callback);
}
// VLoopTypes accessors
const Type* velt_type(Node* n) const {
return _vloop_analyzer.types().velt_type(n);
@ -506,11 +496,6 @@ class SuperWord : public ResourceObj {
#ifndef PRODUCT
// TraceAutoVectorization and TraceSuperWord
bool is_trace_superword_alignment() const {
// Too verbose for TraceSuperWord
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
}
bool is_trace_superword_adjacent_memops() const {
return TraceSuperWord ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
@ -531,15 +516,9 @@ class SuperWord : public ResourceObj {
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
}
bool is_trace_superword_verbose() const {
// Too verbose for TraceSuperWord
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
bool is_trace_superword_any() const {
return TraceSuperWord ||
is_trace_align_vector() ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
@ -549,7 +528,7 @@ class SuperWord : public ResourceObj {
bool is_trace_align_vector() const {
return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
is_trace_superword_verbose();
_vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
}
#endif
@ -566,37 +545,28 @@ class SuperWord : public ResourceObj {
// Accessors
Arena* arena() { return &_arena; }
int get_vw_bytes_special(MemNode* s);
// Ensure node_info contains element "i"
void grow_node_info(int i) { if (i >= _node_info.length()) _node_info.at_put_grow(i, SWNodeInfo::initial); }
// should we align vector memory references on this platform?
bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }
// memory alignment for a node
int alignment(Node* n) const { return _node_info.adr_at(bb_idx(n))->_alignment; }
void set_alignment(Node* n, int a) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_alignment = a; }
// is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
static bool requires_long_to_int_conversion(int opc);
// For pack p, are all idx operands the same?
bool same_inputs(const Node_List* p, int idx) const;
// CloneMap utilities
bool same_origin_idx(Node* a, Node* b) const;
bool same_generation(Node* a, Node* b) const;
private:
bool SLP_extract();
// Find the adjacent memory references and create pack pairs for them.
void find_adjacent_refs();
// Find a memory reference to align the loop induction variable to.
MemNode* find_align_to_ref(Node_List &memops, int &idx);
// Calculate loop's iv adjustment for this memory ops.
int get_iv_adjustment(MemNode* mem);
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align"
bool stmts_can_pack(Node* s1, Node* s2, int align);
// Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
void create_adjacent_memop_pairs();
void collect_valid_vpointers(GrowableArray<const VPointer*>& vpointers);
void create_adjacent_memop_pairs_in_all_groups(const GrowableArray<const VPointer*>& vpointers);
static int find_group_end(const GrowableArray<const VPointer*>& vpointers, int group_start);
void create_adjacent_memop_pairs_in_one_group(const GrowableArray<const VPointer*>& vpointers, const int group_start, int group_end);
// Various methods to check if we can pack two nodes.
bool can_pack_into_pair(Node* s1, Node* s2);
// Is s1 immediately before s2 in memory?
bool are_adjacent_refs(Node* s1, Node* s2) const;
// Are s1 and s2 similar?
@ -606,8 +576,6 @@ private:
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
bool have_similar_inputs(Node* s1, Node* s2);
void set_alignment(Node* s1, Node* s2, int align);
int adjust_alignment_for_type_conversion(Node* s, Node* t, int align);
void extend_pairset_with_more_pairs_by_following_use_and_def();
bool extend_pairset_with_more_pairs_by_following_def(Node* s1, Node* s2);
@ -661,16 +629,15 @@ private:
// Is use->in(u_idx) a vector use?
bool is_vector_use(Node* use, int u_idx) const;
// Initialize per node info
void initialize_node_info();
// Return the longer type for vectorizable type-conversion node or illegal type for other nodes.
BasicType longer_type_for_conversion(Node* n) const;
// Find the longest type in def-use chain for packed nodes, and then compute the max vector size.
int max_vector_size_in_def_use_chain(Node* n);
static bool requires_long_to_int_conversion(int opc);
bool is_velt_basic_type_compatible_use_def(Node* use, Node* def) const;
static LoadNode::ControlDependency control_dependency(Node_List* p);
// Alignment within a vector memory reference
int memory_alignment(MemNode* s, int iv_adjust);
// Ensure that the main loop vectors are aligned by adjusting the pre loop limit.
void determine_mem_ref_and_aw_for_main_loop_alignment();
void adjust_pre_loop_limit_to_align_main_loop_vectors();

View File

@ -37,8 +37,7 @@
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
@ -115,7 +114,6 @@ class TraceAutoVectorizationTagValidator {
} else if (ALL == tag) {
_tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
} else if (SW_VERBOSE == tag) {
_tags.at_put(SW_ALIGNMENT, set_bit);
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
_tags.at_put(SW_REJECTIONS, set_bit);
_tags.at_put(SW_PACKSET, set_bit);

View File

@ -202,7 +202,7 @@ void VLoopVPointers::allocate_vpointers_array() {
void VLoopVPointers::compute_and_cache_vpointers() {
int pointers_idx = 0;
_body.for_each_mem([&] (const MemNode* mem, int bb_idx) {
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
// Placement new: construct directly into the array.
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
@ -410,7 +410,7 @@ void VLoopDependencyGraph::PredsIterator::next() {
int VPointer::Tracer::_depth = 0;
#endif
VPointer::VPointer(const MemNode* mem, const VLoop& vloop,
VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only) :
_mem(mem), _vloop(vloop),
_base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr),
@ -807,10 +807,50 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
// To be in the same group, two VPointers must be the same,
// except for the offset.
int VPointer::cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2) {
const VPointer* a = *p1;
const VPointer* b = *p2;
int cmp_base = a->base()->_idx - b->base()->_idx;
if (cmp_base != 0) { return cmp_base; }
int cmp_opcode = a->mem()->Opcode() - b->mem()->Opcode();
if (cmp_opcode != 0) { return cmp_opcode; }
int cmp_scale = a->scale_in_bytes() - b->scale_in_bytes();
if (cmp_scale != 0) { return cmp_scale; }
int cmp_invar = (a->invar() == nullptr ? 0 : a->invar()->_idx) -
(b->invar() == nullptr ? 0 : b->invar()->_idx);
return cmp_invar;
}
// We compare by group, then by offset, and finally by node idx.
int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) {
int cmp_group = cmp_for_sort_by_group(p1, p2);
if (cmp_group != 0) { return cmp_group; }
const VPointer* a = *p1;
const VPointer* b = *p2;
int cmp_offset = a->offset_in_bytes() - b->offset_in_bytes();
if (cmp_offset != 0) { return cmp_offset; }
return a->mem()->_idx - b->mem()->_idx;
}
#ifndef PRODUCT
// Function for printing the fields of a VPointer
void VPointer::print() const {
tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name());
if (!valid()) {
tty->print_cr("invalid]");
return;
}
tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0);
tty->print("adr: %4d, ", _adr != nullptr ? _adr->_idx : 0);

View File

@ -669,7 +669,7 @@ private:
// operation in a counted loop for vectorizable analysis.
class VPointer : public ArenaObj {
protected:
const MemNode* _mem; // My memory reference node
MemNode* const _mem; // My memory reference node
const VLoop& _vloop;
Node* _base; // null if unsafe nonheap reference
@ -711,12 +711,12 @@ class VPointer : public ArenaObj {
NotComparable = (Less | Greater | Equal)
};
VPointer(const MemNode* mem, const VLoop& vloop) :
VPointer(MemNode* const mem, const VLoop& vloop) :
VPointer(mem, vloop, nullptr, false) {}
VPointer(const MemNode* mem, const VLoop& vloop, Node_Stack* nstack) :
VPointer(MemNode* const mem, const VLoop& vloop, Node_Stack* nstack) :
VPointer(mem, vloop, nstack, true) {}
private:
VPointer(const MemNode* mem, const VLoop& vloop,
VPointer(MemNode* const mem, const VLoop& vloop,
Node_Stack* nstack, bool analyze_only);
// Following is used to create a temporary object during
// the pattern match of an address expression.
@ -729,7 +729,7 @@ class VPointer : public ArenaObj {
Node* base() const { return _base; }
Node* adr() const { return _adr; }
const MemNode* mem() const { return _mem; }
MemNode* mem() const { return _mem; }
int scale_in_bytes() const { return _scale; }
Node* invar() const { return _invar; }
int offset_in_bytes() const { return _offset; }
@ -781,6 +781,11 @@ class VPointer : public ArenaObj {
static bool equal(int cmp) { return cmp == Equal; }
static bool comparable(int cmp) { return cmp < NotComparable; }
// We need to be able to sort the VPointer to efficiently group the
// memops into groups, and to find adjacent memops.
static int cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2);
static int cmp_for_sort(const VPointer** p1, const VPointer** p2);
NOT_PRODUCT( void print() const; )
#ifndef PRODUCT

View File

@ -0,0 +1,474 @@
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import jdk.test.lib.Utils;
import jdk.test.whitebox.WhiteBox;
import java.lang.reflect.Array;
import java.util.Map;
import java.util.HashMap;
import java.util.Random;
import java.nio.ByteOrder;
/*
* @test
* @bug 8325155
* @summary Test some cases that vectorize after the removal of the alignment boundaries code.
* Now, we instead check if use-def connections have compatible type size.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestCompatibleUseDefTypeSize
*/
public class TestCompatibleUseDefTypeSize {
// Number of elements in every generated input array.
static int RANGE = 1024*8;
private static final Random RANDOM = Utils.getRandomInstance();
// Inputs
byte[] aB;
byte[] bB;
short[] aS;
short[] bS;
char[] aC;
char[] bC;
int[] aI;
int[] bI;
long[] aL;
long[] bL;
float[] aF;
float[] bF;
double[] aD;
double[] bD;
// List of tests
Map<String,TestFunction> tests = new HashMap<String,TestFunction>();
// List of gold, the results from the first run before compilation
Map<String,Object[]> golds = new HashMap<String,Object[]>();
// A test returns every array it touched, so the gold (interpreter) run and
// the compiled run can be compared output-by-output.
interface TestFunction {
Object[] run();
}
public static void main(String[] args) {
TestFramework.run();
}
// Generate inputs, register all tests, and record "gold" results from the
// first run, before any test method is JIT compiled.
public TestCompatibleUseDefTypeSize() {
// Generate input once
aB = generateB();
bB = generateB();
aS = generateS();
bS = generateS();
aC = generateC();
bC = generateC();
aI = generateI();
bI = generateI();
aL = generateL();
bL = generateL();
aF = generateF();
bF = generateF();
aD = generateD();
bD = generateD();
// Add all tests to list
tests.put("test0", () -> { return test0(aB.clone(), bC.clone()); });
tests.put("test1", () -> { return test1(aB.clone(), bC.clone()); });
tests.put("test2", () -> { return test2(aB.clone(), bC.clone()); });
tests.put("test3", () -> { return test3(aI.clone(), bI.clone()); });
tests.put("test4", () -> { return test4(aI.clone(), bI.clone()); });
tests.put("test5", () -> { return test5(aI.clone(), bF.clone()); });
tests.put("test6", () -> { return test6(aI.clone(), bF.clone()); });
tests.put("test7", () -> { return test7(aI.clone(), bF.clone()); });
tests.put("test8", () -> { return test8(aL.clone(), bD.clone()); });
tests.put("test9", () -> { return test9(aL.clone(), bD.clone()); });
tests.put("test10", () -> { return test10(aL.clone(), bD.clone()); });
tests.put("test11", () -> { return test11(aC.clone()); });
// Compute gold value for all test methods before compilation
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
Object[] gold = test.run();
golds.put(name, gold);
}
}
// Run every registered test and verify its result against the gold value
// recorded before compilation.
@Warmup(100)
@Run(test = {"test0",
"test1",
"test2",
"test3",
"test4",
"test5",
"test6",
"test7",
"test8",
"test9",
"test10",
"test11"})
public void runTests() {
for (Map.Entry<String,TestFunction> entry : tests.entrySet()) {
String name = entry.getKey();
TestFunction test = entry.getValue();
// Recall gold value from before compilation
Object[] gold = golds.get(name);
// Compute new result
Object[] result = test.run();
// Compare gold and new result
verify(name, gold, result);
}
}
// Random input generators, one per primitive element type.
static byte[] generateB() {
byte[] a = new byte[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (byte)RANDOM.nextInt();
}
return a;
}
static short[] generateS() {
short[] a = new short[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (short)RANDOM.nextInt();
}
return a;
}
static char[] generateC() {
char[] a = new char[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = (char)RANDOM.nextInt();
}
return a;
}
static int[] generateI() {
int[] a = new int[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = RANDOM.nextInt();
}
return a;
}
static long[] generateL() {
long[] a = new long[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = RANDOM.nextLong();
}
return a;
}
static float[] generateF() {
float[] a = new float[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = Float.intBitsToFloat(RANDOM.nextInt());
}
return a;
}
static double[] generateD() {
double[] a = new double[RANGE];
for (int i = 0; i < a.length; i++) {
a[i] = Double.longBitsToDouble(RANDOM.nextLong());
}
return a;
}
// Verify that gold and result have the same number of outputs, and that each
// pair of outputs is a pair of distinct arrays of the same component type,
// same length, and same content. Throws RuntimeException on any mismatch.
static void verify(String name, Object[] gold, Object[] result) {
if (gold.length != result.length) {
throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
gold.length + ", result.length = " + result.length);
}
for (int i = 0; i < gold.length; i++) {
Object g = gold[i];
Object r = result[i];
if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
throw new RuntimeException("verify " + name + ": must both be array of same type:" +
" gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
" result[" + i + "].getClass() = " + r.getClass().getSimpleName());
}
if (g == r) {
throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
" gold[" + i + "] == result[" + i + "]");
}
if (Array.getLength(g) != Array.getLength(r)) {
throw new RuntimeException("verify " + name + ": arrays must have same length:" +
" gold[" + i + "].length = " + Array.getLength(g) +
" result[" + i + "].length = " + Array.getLength(r));
}
// Dispatch to the element-type specific comparison.
Class c = g.getClass().getComponentType();
if (c == byte.class) {
verifyB(name, i, (byte[])g, (byte[])r);
} else if (c == short.class) {
verifyS(name, i, (short[])g, (short[])r);
} else if (c == char.class) {
verifyC(name, i, (char[])g, (char[])r);
} else if (c == int.class) {
verifyI(name, i, (int[])g, (int[])r);
} else if (c == long.class) {
verifyL(name, i, (long[])g, (long[])r);
} else if (c == float.class) {
verifyF(name, i, (float[])g, (float[])r);
} else if (c == double.class) {
verifyD(name, i, (double[])g, (double[])r);
} else {
throw new RuntimeException("verify " + name + ": array type not supported for verify:" +
" gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
" result[" + i + "].getClass() = " + r.getClass().getSimpleName());
}
}
}
// Element-wise comparisons; the float/double variants compare raw bits so
// that NaN patterns and signed zeros are distinguished.
static void verifyB(String name, int i, byte[] g, byte[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyS(String name, int i, short[] g, short[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyC(String name, int i, char[] g, char[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyI(String name, int i, int[] g, int[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyL(String name, int i, long[] g, long[] r) {
for (int j = 0; j < g.length; j++) {
if (g[j] != r[j]) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyF(String name, int i, float[] g, float[] r) {
for (int j = 0; j < g.length; j++) {
if (Float.floatToIntBits(g[j]) != Float.floatToIntBits(r[j])) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
static void verifyD(String name, int i, double[] g, double[] r) {
for (int j = 0; j < g.length; j++) {
if (Double.doubleToLongBits(g[j]) != Double.doubleToLongBits(r[j])) {
throw new RuntimeException("verify " + name + ": arrays must have same content:" +
" gold[" + i + "][" + j + "] = " + g[j] +
" result[" + i + "][" + j + "] = " + r[j]);
}
}
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "inflate" method: 1 byte -> 2 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test0(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
dst[i] = (char)(src[i] & 0xff);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "inflate" method: 1 byte -> 2 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test1(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
dst[i] = (char)(src[i]);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// "deflate" method: 2 byte -> 1 byte.
// Java scalar code has no explicit conversion.
// Vector code would need a conversion. We may add this in the future.
static Object[] test2(byte[] src, char[] dst) {
for (int i = 0; i < src.length; i++) {
src[i] = (byte)(dst[i]);
}
return new Object[]{ src, dst };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Used to not vectorize because of "alignment boundaries".
// Assume 64 byte vector width:
// a[i+0:i+15] and a[i+1:i+16], each are 4 * 16 = 64 byte.
// The alignment boundary is every 64 byte, so one of the two vectors gets cut up.
static Object[] test3(int[] a, int[] b) {
for (int i = 0; i < a.length-1; i++) {
a[i] = (int)(b[i] + a[i+1]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_VI, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// same as test3, but hand-unrolled
static Object[] test4(int[] a, int[] b) {
for (int i = 0; i < a.length-2; i+=2) {
a[i+0] = (int)(b[i+0] + a[i+1]);
a[i+1] = (int)(b[i+1] + a[i+2]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// In theory, one would expect this to be a simple 4byte -> 4byte conversion.
// But there is a CmpF and CMove here because we check for isNaN. Plus a MoveF2I.
//
// Would be nice to vectorize: Missing support for CmpF, CMove and MoveF2I.
static Object[] test5(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Float.floatToIntBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveF2I
static Object[] test6(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Float.floatToRawIntBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveI2F
static Object[] test7(int[] a, float[] b) {
for (int i = 0; i < a.length; i++) {
b[i] = Float.intBitsToFloat(a[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support: needs CmpD, CMove and MoveD2L
static Object[] test8(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Double.doubleToLongBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveD2L
static Object[] test9(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
a[i] = Double.doubleToRawLongBits(b[i]);
}
return new Object[]{ a, b };
}
@Test
@IR(counts = {IRNode.STORE_VECTOR, "= 0"},
applyIfPlatform = {"64-bit", "true"},
applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"})
// Missing support for MoveL2D
static Object[] test10(long[] a, double[] b) {
for (int i = 0; i < a.length; i++) {
b[i] = Double.longBitsToDouble(a[i]);
}
return new Object[]{ a, b };
}
@Test
// MaxI reduction is with char type, but the MaxI char vector is not implemented.
static Object[] test11(char[] a) {
char m = 0;
for (int i = 0; i < a.length; i++) {
m = (char)Math.max(m, a[i]);
a[i] = 0;
}
return new Object[]{ a, new char[] { m } };
}
}

View File

@ -390,9 +390,9 @@ public class TestSplitPacks {
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
@ -405,8 +405,6 @@ public class TestSplitPacks {
// | | \ \ \ \
// 0 1 - - 4 5 6 7
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2a(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -428,9 +426,9 @@ public class TestSplitPacks {
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
@ -444,8 +442,6 @@ public class TestSplitPacks {
// | | | | \ \
// 0 1 2 3 -- 6 7
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2b(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -468,9 +464,9 @@ public class TestSplitPacks {
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
applyIfPlatform = {"64-bit", "true"},
@ -483,8 +479,6 @@ public class TestSplitPacks {
// | | / / / /
// 0 1 2 3 4 5 - -
//
// The 4-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2c(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;
@ -506,9 +500,9 @@ public class TestSplitPacks {
}
@Test
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "= 0",
@IR(counts = {IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.LOAD_VECTOR_I, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "= 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_2, "> 0",
IRNode.AND_VI, IRNode.VECTOR_SIZE_4, "> 0",
IRNode.STORE_VECTOR, "> 0"},
applyIf = {"MaxVectorSize", ">=32"},
@ -522,8 +516,6 @@ public class TestSplitPacks {
// | | | | / /
// 0 1 2 3 4 5 - -
//
// The 2-pack does not vectorize. This is a technical limitation that
// we can hopefully soon remove. Load and store offsets are different.
static Object[] test2d(int[] a, int[] b, int mask) {
for (int i = 0; i < RANGE; i+=8) {
int b0 = a[i+0] & mask;