From cfa25b71a65bfff1b31efe0d37ded37c50a98247 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 21 May 2024 10:31:14 +0000 Subject: [PATCH] 8328544: Improve handling of vectorization Co-authored-by: Christian Hagedorn Reviewed-by: mschoene, kvn, chagedorn, rhalade --- src/hotspot/share/opto/vectorization.cpp | 472 +++++++++++++++++- src/hotspot/share/opto/vectorization.hpp | 89 +++- .../TestVectorizationMismatchedAccess.java | 337 +++++++++++-- .../loopopts/superword/TestAlignVector.java | 8 +- .../superword/TestAlignVectorFuzzer.java | 20 +- ...tIndependentPacksWithCyclicDependency.java | 140 +++--- ...IndependentPacksWithCyclicDependency2.java | 28 +- .../TestScheduleReordersScalarMemops.java | 8 +- 8 files changed, 936 insertions(+), 166 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 8d2d3868fe6..f013abb38fb 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -416,6 +416,10 @@ VPointer::VPointer(MemNode* const mem, const VLoop& vloop, #ifdef ASSERT _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr), #endif + _has_int_index_after_convI2L(false), + _int_index_after_convI2L_offset(0), + _int_index_after_convI2L_invar(nullptr), + _int_index_after_convI2L_scale(0), _nstack(nstack), _analyze_only(analyze_only), _stack_idx(0) #ifndef PRODUCT , _tracer(vloop.is_trace_pointer_analysis()) @@ -495,6 +499,11 @@ VPointer::VPointer(MemNode* const mem, const VLoop& vloop, return; } + if (!is_safe_to_use_as_simple_form(base, adr)) { + assert(!valid(), "does not have simple form"); + return; + } + _base = base; _adr = adr; assert(valid(), "Usable"); @@ -508,6 +517,10 @@ VPointer::VPointer(VPointer* p) : #ifdef ASSERT _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr), #endif + _has_int_index_after_convI2L(false), + _int_index_after_convI2L_offset(0), + _int_index_after_convI2L_invar(nullptr), + _int_index_after_convI2L_scale(0), _nstack(p->_nstack), _analyze_only(p->_analyze_only), _stack_idx(p->_stack_idx) #ifndef PRODUCT , _tracer(p->_tracer._is_trace_alignment) @@ -530,6 +543,354 @@ int VPointer::invar_factor() const { return 1; } +// We would like to make decisions about aliasing (i.e. removing memory edges) and adjacency +// (i.e. which loads/stores can be packed) based on the simple form: +// +// s_pointer = adr + offset + invar + scale * ConvI2L(iv) +// +// However, we parse the compound-long-int form: +// +// c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_index) +// int_index = int_offset + int_invar + int_scale * iv +// +// In general, the simple and the compound-long-int form do not always compute the same pointer +// at runtime. For example, the simple form would give a different result due to an overflow +// in the int_index. 
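As a standalone illustration of this divergence (a minimal Java sketch; the class name and the base address value are invented for the example, and the worked example in the comment below runs the same numbers by hand):

    // The compound form computes its index in 32-bit int arithmetic, which wraps
    // around mod 2^32, and only then widens it to a long (ConvI2L). The simple form
    // folds everything into 64-bit long arithmetic, where no wrap-around occurs.
    public class ConvI2LOverflowSketch {
        public static void main(String[] args) {
            long adr = 1L << 40;                                   // hypothetical base address
            int intIndex = Integer.MIN_VALUE + Integer.MIN_VALUE;  // int overflow: wraps to 0
            long cPointer = adr + (long) intIndex;                 // compound form: adr + ConvI2L(0) = adr
            long sPointer = adr + (long) Integer.MIN_VALUE
                                + (long) Integer.MIN_VALUE;        // simple form: adr - 2^32
            System.out.println(cPointer - sPointer);               // prints 4294967296 = 2^32
        }
    }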
+//
+// Example:
+// For both forms, we have:
+//   iv = 0
+//   scale = 1
+//
+// We now assign the offset and invar once to the long part and once to the int part:
+//   Pointer 1 (long offset and long invar):
+//     long_offset = min_int
+//     long_invar  = min_int
+//     int_offset  = 0
+//     int_invar   = 0
+//
+//   Pointer 2 (int offset and int invar):
+//     long_offset = 0
+//     long_invar  = 0
+//     int_offset  = min_int
+//     int_invar   = min_int
+//
+// This gives us the following pointers:
+//   Compound-long-int form pointers:
+//     Form:
+//       c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + int_invar + int_scale * iv)
+//
+//     Pointers:
+//       c_pointer1 = adr + min_int + min_int + 1 * ConvI2L(0 + 0 + 1 * 0)
+//                  = adr + min_int + min_int
+//                  = adr - 2^32
+//
+//       c_pointer2 = adr + 0 + 0 + 1 * ConvI2L(min_int + min_int + 1 * 0)
+//                  = adr + ConvI2L(min_int + min_int)
+//                  = adr + 0
+//                  = adr
+//
+//   Simple form pointers:
+//     Form:
+//       s_pointer = adr + offset + invar + scale * ConvI2L(iv)
+//       s_pointer = adr + (long_offset + int_offset) + (long_invar + int_invar) + (long_scale * int_scale) * ConvI2L(iv)
+//
+//     Pointers:
+//       s_pointer1 = adr + (min_int + 0      ) + (min_int + 0      ) + 1 * 0
+//                  = adr + min_int + min_int
+//                  = adr - 2^32
+//       s_pointer2 = adr + (0       + min_int) + (0       + min_int) + 1 * 0
+//                  = adr + min_int + min_int
+//                  = adr - 2^32
+//
+// We see that the two addresses are actually 2^32 bytes apart (derived from the c_pointers),
+// but their simple forms look identical.
+//
+// Hence, we need to determine in which cases it is safe to make decisions based on the simple
+// form, rather than the compound-long-int form. If we cannot prove that using the simple form
+// is safe (i.e. equivalent to the compound-long-int form), then we do not get a valid VPointer,
+// and the associated memop cannot be vectorized.
+bool VPointer::is_safe_to_use_as_simple_form(Node* base, Node* adr) const {
+#ifndef _LP64
+  // On 32-bit platforms, there is never an explicit int_index with ConvI2L for the iv. Thus, the
+  // parsed pointer form is always the simple form, with int operations:
+  //
+  //   pointer = adr + offset + invar + scale * iv
+  //
+  assert(!_has_int_index_after_convI2L, "32-bit never has an int_index with ConvI2L for the iv");
+  return true;
+#else
+
+  // Array accesses that are not Unsafe always have a RangeCheck which ensures that there is no
+  // int_index overflow. This implies that the conversion to long can be done separately:
+  //
+  //   ConvI2L(int_index) = ConvI2L(int_offset) + ConvI2L(int_invar) + ConvI2L(int_scale) * ConvI2L(iv)
+  //
+  // And hence, the simple form is guaranteed to be identical to the compound-long-int form at
+  // runtime and the VPointer is safe/valid to be used.
+  const TypeAryPtr* ary_ptr_t = _mem->adr_type()->isa_aryptr();
+  if (ary_ptr_t != nullptr) {
+    if (!_mem->is_unsafe_access()) {
+      return true;
+    }
+  }
+
+  // We did not find the int_index. Just to be safe, reject this VPointer.
+  if (!_has_int_index_after_convI2L) {
+    return false;
+  }
+
+  int int_offset  = _int_index_after_convI2L_offset;
+  Node* int_invar = _int_index_after_convI2L_invar;
+  int int_scale   = _int_index_after_convI2L_scale;
+  int long_scale  = _scale / int_scale;
+
+  // If "int_index = iv", then the simple form is identical to the compound-long-int form.
+  //
+  //   int_index = int_offset + int_invar + int_scale * iv
+  //             = 0            0           1         * iv
+  //             = iv
+  if (int_offset == 0 && int_invar == nullptr && int_scale == 1) {
+    return true;
+  }
+
+  // Intuition: What happens if the int_index overflows? Let us look at two pointers on the
+  //            "overflow edge":
+  //
+  //              pointer1 = adr + ConvI2L(int_index1)
+  //              pointer2 = adr + ConvI2L(int_index2)
+  //
+  //              int_index1 = max_int + 0 = max_int -> very close to but before the overflow
+  //              int_index2 = max_int + 1 = min_int -> just enough to get the overflow
+  //
+  // When looking at the difference of pointer1 and pointer2, we notice that it is very large
+  // (almost 2^32). Since arrays have at most 2^31 elements, chances are high that pointer2 is
+  // an actual out-of-bounds access at runtime. Such accesses would normally be prevented by
+  // range checks at runtime. However, if the access was done by using Unsafe, where range checks
+  // are omitted, then an out-of-bounds access constitutes undefined behavior. This means that we
+  // are allowed to do anything, including changing the behavior.
+  //
+  // If we can set the right conditions, we have a guarantee that an overflow is either impossible
+  // (no overflow occurs, or range checks prevent it) or undefined behavior. In both cases, it is
+  // safe to vectorize.
+  //
+  // Approach: We want to prove a lower bound for the distance between these two pointers, and an
+  //           upper bound for the size of a memory object. We can derive such an upper bound for
+  //           arrays. We know they have at most 2^31 elements. If we know the size of the elements
+  //           in bytes, we have:
+  //
+  //             array_element_size_in_bytes * 2^31 >= max_possible_array_size_in_bytes
+  //                                                >= array_size_in_bytes                     (ARR)
+  //
+  //           If some small difference "delta" leads to an int_index overflow, we know that the
+  //           int_index1 before overflow must have been close to max_int, and the int_index2 after
+  //           the overflow must be close to min_int:
+  //
+  //             pointer1 =        adr + long_offset + long_invar + long_scale * ConvI2L(int_index1)
+  //                      =approx  adr + long_offset + long_invar + long_scale * max_int
+  //
+  //             pointer2 =        adr + long_offset + long_invar + long_scale * ConvI2L(int_index2)
+  //                      =approx  adr + long_offset + long_invar + long_scale * min_int
+  //
+  //           We realize that the pointer difference is very large:
+  //
+  //             difference =approx long_scale * 2^32
+  //
+  //           Hence, if we set the right condition for long_scale and array_element_size_in_bytes,
+  //           we can prove that an overflow is impossible (or would imply undefined behavior).
+  //
+  // We must now take this intuition and develop a rigorous proof. We start by stating the problem
+  // more precisely, with the help of some definitions and the Statement we are going to prove.
+  //
+  // Definition:
+  //   Two VPointers are "comparable" (i.e. VPointer::comparable is true, set with VPointer::cmp()),
+  //   iff all of these conditions apply for the simple form:
+  //     1) Both VPointers are valid.
+  //     2) The adr are identical, or both are array bases of different arrays.
+  //     3) They have identical scale.
+  //     4) They have identical invar.
+  //     5) The difference in offsets is limited: abs(offset1 - offset2) < 2^31.              (DIFF)
+  //
+  // For the Vectorization Optimization, we pair-wise compare VPointers and determine if they are:
+  //   1) "not comparable":
+  //        We do not optimize them (we assume they alias, and do not assume adjacency).
+  //
+  //        Whenever we choose this option based on the simple form, it is also correct based on the
+  //        compound-long-int form, since we make no optimizations based on it.
+  //
+  //   2) "comparable" with different array bases at runtime:
+  //        We assume they do not alias (remove memory edges), but do not assume adjacency.
+  //
+  //        Whenever we have two different array bases for the simple form, we also have different
+  //        array bases for the compound-long-int form. Since the VPointers provably point to
+  //        different memory objects, they can never alias.
+  //
+  //   3) "comparable" with the same base address:
+  //        We compute the relative pointer difference, and based on the load/store size we can
+  //        compute aliasing and adjacency.
+  //
+  //        We must find a condition under which the pointer difference of the simple form is
+  //        identical to the pointer difference of the compound-long-int form. We do this with the
+  //        Statement below, which we then proceed to prove.
+  //
+  // Statement:
+  //   If two VPointers satisfy these 3 conditions:
+  //     1) They are "comparable".
+  //     2) They have the same base address.
+  //     3) Their long_scale is a multiple of the array element size in bytes:
+  //
+  //          abs(long_scale) % array_element_size_in_bytes = 0                                  (A)
+  //
+  //   Then their pointer difference of the simple form is identical to the pointer difference
+  //   of the compound-long-int form.
+  //
+  //   More precisely:
+  //     By definition, two such VPointers have identical adr, invar, and scale.
+  //     Their simple form is:
+  //
+  //       s_pointer1 = adr + offset1 + invar + scale * ConvI2L(iv)                             (B1)
+  //       s_pointer2 = adr + offset2 + invar + scale * ConvI2L(iv)                             (B2)
+  //
+  //     Thus, the pointer difference of the simple forms collapses to the difference in offsets:
+  //
+  //       s_difference = s_pointer1 - s_pointer2 = offset1 - offset2                            (C)
+  //
+  //     Their compound-long-int form for these VPointers is:
+  //
+  //       c_pointer1 = adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1)    (D1)
+  //       int_index1 = int_offset1 + int_invar1 + int_scale1 * iv                              (D2)
+  //
+  //       c_pointer2 = adr + long_offset2 + long_invar2 + long_scale2 * ConvI2L(int_index2)    (D3)
+  //       int_index2 = int_offset2 + int_invar2 + int_scale2 * iv                              (D4)
+  //
+  //     And these are the offset1, offset2, invar and scale from the simple form (B1) and (B2):
+  //
+  //       offset1 = long_offset1 + long_scale1 * ConvI2L(int_offset1)                          (D5)
+  //       offset2 = long_offset2 + long_scale2 * ConvI2L(int_offset2)                          (D6)
+  //
+  //       invar = long_invar1 + long_scale1 * ConvI2L(int_invar1)
+  //             = long_invar2 + long_scale2 * ConvI2L(int_invar2)                              (D7)
+  //
+  //       scale = long_scale1 * ConvI2L(int_scale1)
+  //             = long_scale2 * ConvI2L(int_scale2)                                            (D8)
+  //
+  //     The pointer difference of the compound-long-int form is defined as:
+  //
+  //       c_difference = c_pointer1 - c_pointer2
+  //
+  //     Thus, the Statement claims that for the two VPointers we have:
+  //
+  //       s_difference = c_difference                                                   (Statement)
+  //
+  // We prove the Statement with the help of a Lemma:
+  //
+  // Lemma:
+  //   There is some integer x, such that:
+  //
+  //     c_difference = s_difference + array_element_size_in_bytes * x * 2^32                (Lemma)
+  //
+  // From condition (DIFF), we can derive:
+  //
+  //   abs(s_difference) < 2^31                                                                  (E)
+  //
+  // Assuming the Lemma, we prove the Statement:
+  //   If "x = 0" (intuitively: the int_index does not overflow), then:
+  //     c_difference = s_difference
+  //   and hence the simple form computes the same pointer difference as the compound-long-int form.
+  //   If "x != 0" (intuitively: the int_index overflows), then:
+  //     abs(c_difference) >= abs(s_difference + array_element_size_in_bytes * x * 2^32)
+  //                       >= array_element_size_in_bytes * 2^32 - abs(s_difference)
+  //                          -- apply (E) --
+  //                       >  array_element_size_in_bytes * 2^32 - 2^31
+  //                       >= array_element_size_in_bytes * 2^31
+  //                          -- apply (ARR) --
+  //                       >= max_possible_array_size_in_bytes
+  //                       >= array_size_in_bytes
+  //
+  //   This shows that c_pointer1 and c_pointer2 have a distance that exceeds the maximum array
+  //   size. Thus, at least one of the two pointers must be outside of the array bounds. But we can
+  //   assume that out-of-bounds accesses do not happen. If they still do, it is undefined behavior.
+  //   Hence, we are allowed to do anything. We can also "safely" use the simple form in this case
+  //   even though it might not match the compound-long-int form at runtime.
+  // QED Statement.
+  //
+  // We must now prove the Lemma.
+  //
+  // ConvI2L always truncates by some multiple of 2^32, i.e. there is some integer y such that:
+  //
+  //   ConvI2L(y1 + y2) = ConvI2L(y1) + ConvI2L(y2) + 2^32 * y                                   (F)
+  //
+  // It follows that there is an integer y1 such that:
+  //
+  //   ConvI2L(int_index1) =  ConvI2L(int_offset1 + int_invar1 + int_scale1 * iv)
+  //                          -- apply (F) --
+  //                       =  ConvI2L(int_offset1)
+  //                        + ConvI2L(int_invar1)
+  //                        + ConvI2L(int_scale1) * ConvI2L(iv)
+  //                        + y1 * 2^32                                                          (G)
+  //
+  // Thus, we can write the compound-long-int form (D1) as:
+  //
+  //   c_pointer1 =  adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1)
+  //                 -- apply (G) --
+  //              =  adr
+  //               + long_offset1
+  //               + long_invar1
+  //               + long_scale1 * ConvI2L(int_offset1)
+  //               + long_scale1 * ConvI2L(int_invar1)
+  //               + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv)
+  //               + long_scale1 * y1 * 2^32                                                     (H)
+  //
+  // And we can write the simple form as:
+  //
+  //   s_pointer1 =  adr + offset1 + invar + scale * ConvI2L(iv)
+  //                 -- apply (D5, D7, D8) --
+  //              =  adr
+  //               + long_offset1
+  //               + long_scale1 * ConvI2L(int_offset1)
+  //               + long_invar1
+  //               + long_scale1 * ConvI2L(int_invar1)
+  //               + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv)                             (K)
+  //
+  // We now compute the pointer difference between the simple (K) and compound-long-int form (H).
+  // Most terms cancel out immediately:
+  //
+  //   sc_difference1 = c_pointer1 - s_pointer1 = long_scale1 * y1 * 2^32                        (L)
+  //
+  // Rearranging the equation (L), we get:
+  //
+  //   c_pointer1 = s_pointer1 + long_scale1 * y1 * 2^32                                         (M)
+  //
+  // And since long_scale1 is a multiple of array_element_size_in_bytes, there is some integer
+  // x1, such that (M) implies:
+  //
+  //   c_pointer1 = s_pointer1 + array_element_size_in_bytes * x1 * 2^32                         (N)
+  //
+  // With an analogue equation for c_pointer2, we can now compute the pointer difference for
+  // the compound-long-int form:
+  //
+  //   c_difference =  c_pointer1 - c_pointer2
+  //                   -- apply (N) --
+  //                =  s_pointer1 + array_element_size_in_bytes * x1 * 2^32
+  //                 -(s_pointer2 + array_element_size_in_bytes * x2 * 2^32)
+  //                   -- where "x = x1 - x2" --
+  //                =  s_pointer1 - s_pointer2 + array_element_size_in_bytes * x * 2^32
+  //                   -- apply (C) --
+  //                =  s_difference + array_element_size_in_bytes * x * 2^32
+  // QED Lemma.
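+  //
+  // Worked instance of the bound above (numbers added for illustration): for a long[] array we
+  // have array_element_size_in_bytes = 8, so a long_scale of 8 satisfies condition (A). Any
+  // overflow (x != 0) then displaces c_pointer by more than 8 * 2^32 - 2^31 bytes, well beyond
+  // the maximal array size of 8 * 2^31 bytes. The check below implements condition (A).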
+ if (ary_ptr_t != nullptr) { + BasicType array_element_bt = ary_ptr_t->elem()->array_element_basic_type(); + if (is_java_primitive(array_element_bt)) { + int array_element_size_in_bytes = type2aelembytes(array_element_bt); + if (abs(long_scale) % array_element_size_in_bytes == 0) { + return true; + } + } + } + + // General case: we do not know if it is safe to use the simple form. + return false; +#endif +} + bool VPointer::is_loop_member(Node* n) const { Node* n_c = phase()->get_ctrl(n); return lpt()->is_member(phase()->get_loop(n_c)); @@ -632,6 +993,37 @@ bool VPointer::scaled_iv(Node* n) { NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);) return true; } + } else if (opc == Op_ConvI2L && !has_iv()) { + // So far we have not found the iv yet, and are about to enter a ConvI2L subgraph, + // which may be the int index (that might overflow) for the memory access, of the form: + // + // int_index = int_offset + int_invar + int_scale * iv + // + // If we simply continue parsing with the current VPointer, then the int_offset and + // int_invar simply get added to the long offset and invar. But for the checks in + // VPointer::is_safe_to_use_as_simple_form() we need to have explicit access to the + // int_index. Thus, we must parse it explicitly here. For this, we use a temporary + // VPointer, to pattern match the int_index sub-expression of the address. + + NOT_PRODUCT(Tracer::Depth dddd;) + VPointer tmp(this); + NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);) + + if (tmp.scaled_iv_plus_offset(n->in(1)) && tmp.has_iv()) { + // We successfully matched an integer index, of the form: + // int_index = int_offset + int_invar + int_scale * iv + _has_int_index_after_convI2L = true; + _int_index_after_convI2L_offset = tmp._offset; + _int_index_after_convI2L_invar = tmp._invar; + _int_index_after_convI2L_scale = tmp._scale; + } + + // Now parse it again for the real VPointer. This makes sure that the int_offset, int_invar, + // and int_scale are properly added to the final VPointer's offset, invar, and scale. + if (scaled_iv_plus_offset(n->in(1))) { + NOT_PRODUCT(_tracer.scaled_iv_7(n);) + return true; + } } else if (opc == Op_ConvI2L || opc == Op_CastII) { if (scaled_iv_plus_offset(n->in(1))) { NOT_PRODUCT(_tracer.scaled_iv_7(n);) @@ -648,8 +1040,17 @@ bool VPointer::scaled_iv(Node* n) { if (tmp.scaled_iv_plus_offset(n->in(1))) { int scale = n->in(2)->get_int(); + // Accumulate scale. _scale = tmp._scale << scale; - _offset += tmp._offset << scale; + // Accumulate offset. + int shifted_offset = 0; + if (!try_LShiftI_no_overflow(tmp._offset, scale, shifted_offset)) { + return false; // shift overflow. + } + if (!try_AddI_no_overflow(_offset, shifted_offset, _offset)) { + return false; // add overflow. + } + // Accumulate invar. if (tmp._invar != nullptr) { BasicType bt = tmp._invar->bottom_type()->basic_type(); assert(bt == T_INT || bt == T_LONG, ""); @@ -658,6 +1059,13 @@ bool VPointer::scaled_iv(Node* n) { _debug_invar_scale = n->in(2); #endif } + + // Forward info about the int_index: + _has_int_index_after_convI2L = tmp._has_int_index_after_convI2L; + _int_index_after_convI2L_offset = tmp._int_index_after_convI2L_offset; + _int_index_after_convI2L_invar = tmp._int_index_after_convI2L_invar; + _int_index_after_convI2L_scale = tmp._int_index_after_convI2L_scale; + NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar);) return true; } @@ -675,7 +1083,9 @@ bool VPointer::offset_plus_k(Node* n, bool negate) { int opc = n->Opcode(); if (opc == Op_ConI) { - _offset += negate ? 
-(n->get_int()) : n->get_int(); + if (!try_AddSubI_no_overflow(_offset, n->get_int(), negate, _offset)) { + return false; // add/sub overflow. + } NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);) return true; } else if (opc == Op_ConL) { @@ -684,7 +1094,9 @@ bool VPointer::offset_plus_k(Node* n, bool negate) { if (t->higher_equal(TypeLong::INT)) { jlong loff = n->get_long(); jint off = (jint)loff; - _offset += negate ? -off : loff; + if (!try_AddSubI_no_overflow(_offset, off, negate, _offset)) { + return false; // add/sub overflow. + } NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);) return true; } @@ -699,11 +1111,15 @@ bool VPointer::offset_plus_k(Node* n, bool negate) { if (opc == Op_AddI) { if (n->in(2)->is_Con() && invariant(n->in(1))) { maybe_add_to_invar(n->in(1), negate); - _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int(); + if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), negate, _offset)) { + return false; // add/sub overflow. + } NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, negate, _offset);) return true; } else if (n->in(1)->is_Con() && invariant(n->in(2))) { - _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int(); + if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) { + return false; // add/sub overflow. + } maybe_add_to_invar(n->in(2), negate); NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, negate, _offset);) return true; @@ -712,11 +1128,15 @@ bool VPointer::offset_plus_k(Node* n, bool negate) { if (opc == Op_SubI) { if (n->in(2)->is_Con() && invariant(n->in(1))) { maybe_add_to_invar(n->in(1), negate); - _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int(); + if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), !negate, _offset)) { + return false; // add/sub overflow. + } NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, negate, _offset);) return true; } else if (n->in(1)->is_Con() && invariant(n->in(2))) { - _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int(); + if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) { + return false; // add/sub overflow. 
+      }
       maybe_add_to_invar(n->in(2), !negate);
       NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, !negate, _offset);)
       return true;
@@ -806,6 +1226,44 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
   _invar = register_if_new(add);
 }
 
+// Helpers that compute the result in both long and int arithmetic: the two results
+// agree exactly when the int operation did not overflow.
+bool VPointer::try_AddI_no_overflow(int offset1, int offset2, int& result) {
+  jlong long_offset = java_add((jlong)(offset1), (jlong)(offset2));
+  jint  int_offset  = java_add(        offset1,          offset2);
+  if (long_offset != int_offset) {
+    return false;
+  }
+  result = int_offset;
+  return true;
+}
+
+bool VPointer::try_SubI_no_overflow(int offset1, int offset2, int& result) {
+  jlong long_offset = java_subtract((jlong)(offset1), (jlong)(offset2));
+  jint  int_offset  = java_subtract(        offset1,          offset2);
+  if (long_offset != int_offset) {
+    return false;
+  }
+  result = int_offset;
+  return true;
+}
+
+bool VPointer::try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result) {
+  if (is_sub) {
+    return try_SubI_no_overflow(offset1, offset2, result);
+  } else {
+    return try_AddI_no_overflow(offset1, offset2, result);
+  }
+}
+
+bool VPointer::try_LShiftI_no_overflow(int offset, int shift, int& result) {
+  jlong long_offset = java_shift_left((jlong)(offset), shift);
+  jint  int_offset  = java_shift_left(        offset,  shift);
+  if (long_offset != int_offset) {
+    return false;
+  }
+  result = int_offset;
+  return true;
+}
+
 // We use two comparisons, because a subtraction could underflow.
 #define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
   if (a < b) { return -1; }                 \
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 3984407c565..7de29444e5f 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -670,13 +670,51 @@ private:
 // A vectorization pointer (VPointer) has information about an address for
 // dependence checking and vector alignment. It's usually bound to a memory
 // operation in a counted loop for vectorizable analysis.
+//
+// We parse and represent pointers of the simple form:
+//
+//   pointer = adr + offset + invar + scale * ConvI2L(iv)
+//
+// Where:
+//
+//   adr: the base address of an array (base = adr)
+//        OR
+//        some address into off-heap memory (base = TOP)
+//
+//   offset: a constant offset
+//   invar:  a runtime variable, which is invariant during the loop
+//   scale:  scaling factor
+//   iv:     loop induction variable
+//
+// But more precisely, we parse the compound-long-int form:
+//
+//   pointer   = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + int_invar + int_scale * iv)
+//
+// or equivalently, with the int_index spelled out separately:
+//
+//   pointer   = adr + long_offset + long_invar + long_scale * ConvI2L(int_index)
+//   int_index =       int_offset  + int_invar  + int_scale  * iv
+//
+// However, for aliasing and adjacency checks (e.g. VPointer::cmp()) we always use the simple form to make
+// decisions. Hence, we must make sure to only create a "valid" VPointer if the optimizations based on the
+// simple form produce the same result as the compound-long-int form would. Intuitively, this depends on
+// whether the int_index overflows, but the precise conditions are given in VPointer::is_safe_to_use_as_simple_form().
+//
+//   ConvI2L(int_index) = ConvI2L(int_offset + int_invar + int_scale * iv)
+//                      = ConvI2L(int_offset) + ConvI2L(int_invar) + ConvI2L(int_scale) * ConvI2L(iv)
+//
+//   scale  = long_scale * ConvI2L(int_scale)
+//   offset = long_offset + long_scale * ConvI2L(int_offset)
+//   invar  = long_invar  + long_scale * ConvI2L(int_invar)
+//
+//   pointer = adr + offset + invar + scale * ConvI2L(iv)
+//
 class VPointer : public ArenaObj {
  protected:
   MemNode* const _mem;      // My memory reference node
   const VLoop&   _vloop;
 
-  Node* _base;              // null if unsafe nonheap reference
-  Node* _adr;               // address pointer
+  // Components of the simple form:
+  Node* _base;              // Base address of an array OR null if some off-heap memory.
+  Node* _adr;               // Same as _base if an array pointer OR some off-heap memory pointer.
   int   _scale;             // multiplier for iv (in bytes), 0 if no loop iv
   int   _offset;            // constant offset (in bytes)
@@ -687,6 +725,13 @@ class VPointer : public ArenaObj {
   Node* _debug_invar_scale; // multiplier for invariant
 #endif
 
+  // The int_index components of the compound-long-int form. Used to decide if it is safe to use the
+  // simple form rather than the compound-long-int form that was parsed.
+  bool  _has_int_index_after_convI2L;
+  int   _int_index_after_convI2L_offset;
+  Node* _int_index_after_convI2L_invar;
+  int   _int_index_after_convI2L_scale;
+
   Node_Stack* _nstack;      // stack used to record a vpointer trace of variants
   bool  _analyze_only;      // Used in loop unrolling only for vpointer trace
   uint  _stack_idx;         // Used in loop unrolling only for vpointer trace
@@ -726,6 +771,8 @@ class VPointer : public ArenaObj {
   VPointer(VPointer* p);
   NONCOPYABLE(VPointer);
 
+  bool is_safe_to_use_as_simple_form(Node* base, Node* adr) const;
+
  public:
   bool valid()  const { return _adr != nullptr; }
   bool has_iv() const { return _scale != 0; }
@@ -751,10 +798,43 @@ class VPointer : public ArenaObj {
     return _invar == q._invar;
   }
 
+  // We compute if and how two VPointers can alias at runtime, i.e. whether the two addressed
+  // regions of memory can ever overlap. There are essentially 3 relevant return states:
+  // - NotComparable: Synonymous with "unknown aliasing".
+  //   We have no information about how the two VPointers can alias. They could overlap, refer
+  //   to another location in the same memory object, or point to a completely different object.
+  //   -> Memory edge required. Aliasing unlikely but possible.
+  //
+  // - Less / Greater: Synonymous with "never aliasing".
+  //   The two VPointers may point into the same memory object, but be non-aliasing (i.e. we
+  //   know both address regions inside the same memory object, but these regions are non-
+  //   overlapping), or the VPointers point to entirely different objects.
+  //   -> No memory edge required. Aliasing impossible.
+  //
+  // - Equal: Synonymous with "overlap, or point to different memory objects".
+  //   The two VPointers either overlap on the same memory object, or point to two different
+  //   memory objects.
+  //   -> Memory edge required. Aliasing likely.
+  //
+  // In a future refactoring, we can simplify to two states:
+  // - NeverAlias: instead of Less / Greater
+  // - MayAlias:   instead of Equal / NotComparable
+  //
+  // Two VPointers are "comparable" (Less / Greater / Equal), iff all of these conditions apply:
+  //   1) Both are valid, i.e. expressible in the compound-long-int or simple form.
+  //   2) The adr are identical, or both are array bases of different arrays.
+  //   3) They have identical scale.
+  //   4) They have identical invar.
+ // 5) The difference in offsets is limited: abs(offset0 - offset1) < 2^31. int cmp(const VPointer& q) const { if (valid() && q.valid() && (_adr == q._adr || (_base == _adr && q._base == q._adr)) && _scale == q._scale && invar_equals(q)) { + jlong difference = abs(java_subtract((jlong)_offset, (jlong)q._offset)); + jlong max_diff = (jlong)1 << 31; + if (difference >= max_diff) { + return NotComparable; + } bool overlap = q._offset < _offset + memory_size() && _offset < q._offset + q.memory_size(); return overlap ? Equal : (_offset < q._offset ? Less : Greater); @@ -859,6 +939,11 @@ class VPointer : public ArenaObj { void maybe_add_to_invar(Node* new_invar, bool negate); + static bool try_AddI_no_overflow(int offset1, int offset2, int& result); + static bool try_SubI_no_overflow(int offset1, int offset2, int& result); + static bool try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result); + static bool try_LShiftI_no_overflow(int offset1, int offset2, int& result); + Node* register_if_new(Node* n) const; }; diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizationMismatchedAccess.java b/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizationMismatchedAccess.java index b68ddfe2799..2fdbb0816ad 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizationMismatchedAccess.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizationMismatchedAccess.java @@ -1,5 +1,6 @@ /* * Copyright (c) 2023, Red Hat, Inc. All rights reserved. + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -35,7 +36,6 @@ import java.nio.ByteOrder; * @test * @bug 8300258 * @key randomness - * @requires (os.simpleArch == "x64") | (os.simpleArch == "aarch64") * @summary C2: vectorization fails on simple ByteBuffer loop * @modules java.base/jdk.internal.misc * @library /test/lib / @@ -147,193 +147,420 @@ public class TestVectorizationMismatchedAccess { } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteLong1(byte[] dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. + public static void testByteLong1a(byte[] dest, long[] src) { for (int i = 0; i < src.length; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i, src[i]); } } - @Run(test = "testByteLong1") - public static void testByteLong1_runner() { - runAndVerify(() -> testByteLong1(byteArray, longArray), 0); + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. 
+ public static void testByteLong1b(byte[] dest, long[] src) { + for (int i = 0; i < src.length; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i, src[i]); + } } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteLong2(byte[] dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}) + public static void testByteLong1c(byte[] dest, long[] src) { + long base = 64; // make sure it is big enough and 8 byte aligned (required for 32-bit) + for (int i = 0; i < src.length - 8; i++) { + UNSAFE.putLongUnaligned(dest, base + 8 * i, src[i]); + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteLong1d(byte[] dest, long[] src) { + long base = 64; // make sure it is big enough and 8 byte aligned (required for 32-bit) + for (int i = 0; i < src.length - 8; i++) { + UNSAFE.putLongUnaligned(dest, base + 8L * i, src[i]); + } + } + + @Run(test = {"testByteLong1a", "testByteLong1b", "testByteLong1c", "testByteLong1d"}) + public static void testByteLong1_runner() { + runAndVerify(() -> testByteLong1a(byteArray, longArray), 0); + runAndVerify(() -> testByteLong1b(byteArray, longArray), 0); + testByteLong1c(byteArray, longArray); + testByteLong1d(byteArray, longArray); + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. + public static void testByteLong2a(byte[] dest, long[] src) { for (int i = 1; i < src.length; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i - 1), src[i]); } } - @Run(test = "testByteLong2") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteLong2b(byte[] dest, long[] src) { + for (int i = 1; i < src.length; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i - 1), src[i]); + } + } + + @Run(test = {"testByteLong2a", "testByteLong2b"}) public static void testByteLong2_runner() { - runAndVerify(() -> testByteLong2(byteArray, longArray), -8); + runAndVerify(() -> testByteLong2a(byteArray, longArray), -8); + runAndVerify(() -> testByteLong2b(byteArray, longArray), -8); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteLong3(byte[] dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. 
+ public static void testByteLong3a(byte[] dest, long[] src) { for (int i = 0; i < src.length - 1; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + 1), src[i]); } } - @Run(test = "testByteLong3") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteLong3b(byte[] dest, long[] src) { + for (int i = 0; i < src.length - 1; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + 1), src[i]); + } + } + + @Run(test = {"testByteLong3a", "testByteLong3b"}) public static void testByteLong3_runner() { - runAndVerify(() -> testByteLong3(byteArray, longArray), 8); + runAndVerify(() -> testByteLong3a(byteArray, longArray), 8); + runAndVerify(() -> testByteLong3b(byteArray, longArray), 8); } @Test @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, applyIf = {"AlignVector", "false"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. // AlignVector cannot guarantee that invar is aligned. - public static void testByteLong4(byte[] dest, long[] src, int start, int stop) { + public static void testByteLong4a(byte[] dest, long[] src, int start, int stop) { for (int i = start; i < stop; i++) { UNSAFE.putLongUnaligned(dest, 8 * i + baseOffset, src[i]); } } - @Run(test = "testByteLong4") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + // AlignVector cannot guarantee that invar is aligned. + public static void testByteLong4b(byte[] dest, long[] src, int start, int stop) { + for (int i = start; i < stop; i++) { + UNSAFE.putLongUnaligned(dest, 8L * i + baseOffset, src[i]); + } + } + + @Run(test = {"testByteLong4a", "testByteLong4b"}) public static void testByteLong4_runner() { baseOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET; - runAndVerify(() -> testByteLong4(byteArray, longArray, 0, size), 0); + runAndVerify(() -> testByteLong4a(byteArray, longArray, 0, size), 0); + runAndVerify(() -> testByteLong4b(byteArray, longArray, 0, size), 0); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteLong5(byte[] dest, long[] src, int start, int stop) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. 
+ public static void testByteLong5a(byte[] dest, long[] src, int start, int stop) { for (int i = start; i < stop; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + baseOffset), src[i]); } } - @Run(test = "testByteLong5") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteLong5b(byte[] dest, long[] src, int start, int stop) { + for (int i = start; i < stop; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + baseOffset), src[i]); + } + } + + @Run(test = {"testByteLong5a", "testByteLong5b"}) public static void testByteLong5_runner() { baseOffset = 1; - runAndVerify(() -> testByteLong5(byteArray, longArray, 0, size-1), 8); + runAndVerify(() -> testByteLong5a(byteArray, longArray, 0, size-1), 8); + runAndVerify(() -> testByteLong5b(byteArray, longArray, 0, size-1), 8); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteByte1(byte[] dest, byte[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. + public static void testByteByte1a(byte[] dest, byte[] src) { for (int i = 0; i < src.length / 8; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i)); } } - @Run(test = "testByteByte1") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteByte1b(byte[] dest, byte[] src) { + for (int i = 0; i < src.length / 8; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i)); + } + } + + @Run(test = {"testByteByte1a", "testByteByte1b"}) public static void testByteByte1_runner() { - runAndVerify2(() -> testByteByte1(byteArray, byteArray), 0); + runAndVerify2(() -> testByteByte1a(byteArray, byteArray), 0); + runAndVerify2(() -> testByteByte1b(byteArray, byteArray), 0); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testByteByte2(byte[] dest, byte[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned). + // might get fixed with JDK-8325155. 
+ public static void testByteByte2a(byte[] dest, byte[] src) { for (int i = 1; i < src.length / 8; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i - 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i)); } } - @Run(test = "testByteByte2") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"}, + applyIfPlatform = {"64-bit", "true"}) + // 32-bit: address has ConvL2I for cast of long to address, not supported. + public static void testByteByte2b(byte[] dest, byte[] src) { + for (int i = 1; i < src.length / 8; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i - 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i)); + } + } + + @Run(test = {"testByteByte2a", "testByteByte2b"}) public static void testByteByte2_runner() { - runAndVerify2(() -> testByteByte2(byteArray, byteArray), -8); + runAndVerify2(() -> testByteByte2a(byteArray, byteArray), -8); + runAndVerify2(() -> testByteByte2b(byteArray, byteArray), -8); } @Test @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) - public static void testByteByte3(byte[] dest, byte[] src) { + public static void testByteByte3a(byte[] dest, byte[] src) { for (int i = 0; i < src.length / 8 - 1; i++) { UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i)); } } - @Run(test = "testByteByte3") + @Test + @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) + public static void testByteByte3b(byte[] dest, byte[] src) { + for (int i = 0; i < src.length / 8 - 1; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i)); + } + } + + @Run(test = {"testByteByte3a", "testByteByte3b"}) public static void testByteByte3_runner() { - runAndVerify2(() -> testByteByte3(byteArray, byteArray), 8); + runAndVerify2(() -> testByteByte3a(byteArray, byteArray), 8); + runAndVerify2(() -> testByteByte3b(byteArray, byteArray), 8); } @Test @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) - public static void testByteByte4(byte[] dest, byte[] src, int start, int stop) { + public static void testByteByte4a(byte[] dest, byte[] src, int start, int stop) { for (int i = start; i < stop; i++) { UNSAFE.putLongUnaligned(dest, 8 * i + baseOffset, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i)); } } - @Run(test = "testByteByte4") + @Test + @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) + public static void testByteByte4b(byte[] dest, byte[] src, int start, int stop) { + for (int i = start; i < stop; i++) { + UNSAFE.putLongUnaligned(dest, 8L * i + baseOffset, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i)); + } + } + + @Run(test = {"testByteByte4a", "testByteByte4b"}) public static void testByteByte4_runner() { baseOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET; - runAndVerify2(() -> testByteByte4(byteArray, byteArray, 0, size), 0); + runAndVerify2(() -> testByteByte4a(byteArray, byteArray, 0, size), 0); + runAndVerify2(() -> testByteByte4b(byteArray, byteArray, 0, size), 0); } @Test @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) - public static void testByteByte5(byte[] dest, byte[] src, int start, int stop) { + public static void testByteByte5a(byte[] dest, byte[] src, int start, int stop) { for (int i = start; i < stop; i++) { 
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + baseOffset), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i)); } } - @Run(test = "testByteByte5") + @Test + @IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR }) + public static void testByteByte5b(byte[] dest, byte[] src, int start, int stop) { + for (int i = start; i < stop; i++) { + UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + baseOffset), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i)); + } + } + + @Run(test = {"testByteByte5a", "testByteByte5b"}) public static void testByteByte5_runner() { baseOffset = 1; - runAndVerify2(() -> testByteByte5(byteArray, byteArray, 0, size-1), 8); + runAndVerify2(() -> testByteByte5a(byteArray, byteArray, 0, size-1), 8); + runAndVerify2(() -> testByteByte5b(byteArray, byteArray, 0, size-1), 8); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testOffHeapLong1(long dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P(dest + 8 * (i + int_con)) + // See: JDK-8331576 + public static void testOffHeapLong1a(long dest, long[] src) { for (int i = 0; i < src.length; i++) { UNSAFE.putLongUnaligned(null, dest + 8 * i, src[i]); } } - @Run(test = "testOffHeapLong1") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P(dest + 8L * (i + int_con)) + // See: JDK-8331576 + public static void testOffHeapLong1b(long dest, long[] src) { + for (int i = 0; i < src.length; i++) { + UNSAFE.putLongUnaligned(null, dest + 8L * i, src[i]); + } + } + + @Run(test = {"testOffHeapLong1a", "testOffHeapLong1b"}) public static void testOffHeapLong1_runner() { - runAndVerify3(() -> testOffHeapLong1(baseOffHeap, longArray), 0); + runAndVerify3(() -> testOffHeapLong1a(baseOffHeap, longArray), 0); + runAndVerify3(() -> testOffHeapLong1b(baseOffHeap, longArray), 0); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) - public static void testOffHeapLong2(long dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P + // See: JDK-8331576 + public static void testOffHeapLong2a(long dest, long[] src) { for (int i = 1; i < src.length; i++) { UNSAFE.putLongUnaligned(null, dest + 8 * (i - 1), src[i]); } } - @Run(test = "testOffHeapLong2") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P + // See: JDK-8331576 + public static void testOffHeapLong2b(long dest, long[] src) { + for (int i = 1; i < src.length; i++) { + UNSAFE.putLongUnaligned(null, dest + 8L * (i - 1), src[i]); + } + } + + @Run(test = {"testOffHeapLong2a", "testOffHeapLong2b"}) public static void testOffHeapLong2_runner() { - runAndVerify3(() -> testOffHeapLong2(baseOffHeap, longArray), -8); + runAndVerify3(() -> testOffHeapLong2a(baseOffHeap, longArray), -8); + runAndVerify3(() -> testOffHeapLong2b(baseOffHeap, longArray), -8); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, 
">=1" }) - public static void testOffHeapLong3(long dest, long[] src) { + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P + // See: JDK-8331576 + public static void testOffHeapLong3a(long dest, long[] src) { for (int i = 0; i < src.length - 1; i++) { UNSAFE.putLongUnaligned(null, dest + 8 * (i + 1), src[i]); } } - @Run(test = "testOffHeapLong3") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }) + // FAILS: adr is CastX2P + // See: JDK-8331576 + public static void testOffHeapLong3b(long dest, long[] src) { + for (int i = 0; i < src.length - 1; i++) { + UNSAFE.putLongUnaligned(null, dest + 8L * (i + 1), src[i]); + } + } + + @Run(test = {"testOffHeapLong3a", "testOffHeapLong3b"}) public static void testOffHeapLong3_runner() { - runAndVerify3(() -> testOffHeapLong3(baseOffHeap, longArray), 8); + runAndVerify3(() -> testOffHeapLong3a(baseOffHeap, longArray), 8); + runAndVerify3(() -> testOffHeapLong3b(baseOffHeap, longArray), 8); } @Test - @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, - applyIf = {"AlignVector", "false"}) + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + // applyIf = {"AlignVector", "false"}) + // FAILS: adr is CastX2P + // See: JDK-8331576 // AlignVector cannot guarantee that invar is aligned. - public static void testOffHeapLong4(long dest, long[] src, int start, int stop) { + public static void testOffHeapLong4a(long dest, long[] src, int start, int stop) { for (int i = start; i < stop; i++) { UNSAFE.putLongUnaligned(null, dest + 8 * i + baseOffset, src[i]); } } - @Run(test = "testOffHeapLong4") + @Test + @IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary + // @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" }, + // applyIf = {"AlignVector", "false"}) + // FAILS: adr is CastX2P + // See: JDK-8331576 + // AlignVector cannot guarantee that invar is aligned. 
+ public static void testOffHeapLong4b(long dest, long[] src, int start, int stop) { + for (int i = start; i < stop; i++) { + UNSAFE.putLongUnaligned(null, dest + 8L * i + baseOffset, src[i]); + } + } + + @Run(test = {"testOffHeapLong4a", "testOffHeapLong4b"}) public static void testOffHeapLong4_runner() { baseOffset = 8; - runAndVerify3(() -> testOffHeapLong4(baseOffHeap, longArray, 0, size-1), 8); + runAndVerify3(() -> testOffHeapLong4a(baseOffHeap, longArray, 0, size-1), 8); + runAndVerify3(() -> testOffHeapLong4b(baseOffHeap, longArray, 0, size-1), 8); } } diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java index fd5c2969074..c77f4f6fa2e 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java @@ -1363,7 +1363,7 @@ public class TestAlignVector { static Object[] test17a(long[] a) { // Unsafe: vectorizes with profiling (not xcomp) for (int i = 0; i < RANGE; i++) { - int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i; + long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i; long v = UNSAFE.getLongUnaligned(a, adr); UNSAFE.putLongUnaligned(a, adr, v + 1); } @@ -1375,7 +1375,7 @@ public class TestAlignVector { static Object[] test17b(long[] a) { // Not alignable for (int i = 0; i < RANGE-1; i++) { - int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1; + long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1; long v = UNSAFE.getLongUnaligned(a, adr); UNSAFE.putLongUnaligned(a, adr, v + 1); } @@ -1392,7 +1392,7 @@ public class TestAlignVector { static Object[] test17c(long[] a) { // Unsafe: aligned vectorizes for (int i = 0; i < RANGE-1; i+=4) { - int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i; + long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i; long v0 = UNSAFE.getLongUnaligned(a, adr + 0); long v1 = UNSAFE.getLongUnaligned(a, adr + 8); UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1); @@ -1422,7 +1422,7 @@ public class TestAlignVector { static Object[] test17d(long[] a) { // Not alignable for (int i = 0; i < RANGE-1; i+=4) { - int adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8 * i + 1; + long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1; long v0 = UNSAFE.getLongUnaligned(a, adr + 0); long v1 = UNSAFE.getLongUnaligned(a, adr + 8); UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1); diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java index 7b95781905e..d75db965ea3 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java @@ -1090,11 +1090,11 @@ public class TestAlignVectorFuzzer { int init = init_con_or_var(); int limit = limit_con_or_var(); int stride = stride_con(); - int scale = scale_con(); - int offset = offset1_con_or_var(); + long scale = scale_con(); + long offset = offset1_con_or_var(); for (int i = init; i < limit; i += stride) { - int adr = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset + i * scale; + long adr = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset + i * scale; int v = UNSAFE.getIntUnaligned(a, adr); UNSAFE.putIntUnaligned(a, adr, v + 1); } @@ -1105,19 +1105,19 @@ public class TestAlignVectorFuzzer { int init = init_con_or_var(); int limit = limit_con_or_var(); int stride = stride_con(); - int scale = scale_con(); - int offset1 = offset1_con_or_var(); - int offset2 = offset2_con_or_var(); - int 
offset3 = offset3_con_or_var(); + long scale = scale_con(); + long offset1 = offset1_con_or_var(); + long offset2 = offset2_con_or_var(); + long offset3 = offset3_con_or_var(); int h1 = hand_unrolling1_con(); int h2 = hand_unrolling2_con(); int h3 = hand_unrolling3_con(); for (int i = init; i < limit; i += stride) { - int adr1 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset1 + i * scale; - int adr2 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset2 + i * scale; - int adr3 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset3 + i * scale; + long adr1 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset1 + i * scale; + long adr2 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset2 + i * scale; + long adr3 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset3 + i * scale; if (h1 >= 1) { UNSAFE.putIntUnaligned(a, adr1 + 0*4, UNSAFE.getIntUnaligned(a, adr1 + 0*4) + 1); } if (h1 >= 2) { UNSAFE.putIntUnaligned(a, adr1 + 1*4, UNSAFE.getIntUnaligned(a, adr1 + 1*4) + 1); } diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java index 65398e8adfd..197ae08b6d8 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java @@ -172,10 +172,10 @@ public class TestIndependentPacksWithCyclicDependency { static void test2(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { for (int i = 0; i < RANGE; i+=2) { // int and float arrays are two slices. But we pretend both are of type int. - unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); - unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); - dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); - dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1); + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); } } @@ -248,10 +248,10 @@ public class TestIndependentPacksWithCyclicDependency { for (int i = 0; i < RANGE; i+=2) { // same as test2, except that reordering leads to different semantics // explanation analogue to test4 - unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A - dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X - dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y - unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); // A + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); // X + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); // Y + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1); // B } } @@ -275,18 +275,18 @@ public class TestIndependentPacksWithCyclicDependency { long[] dataLa, long[] dataLb) { for (int i = 0; i < RANGE; i+=2) { // Chain of parallelizable op and conversion - int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; - int v01 = 
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java
index 65398e8adfd..197ae08b6d8 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java
@@ -172,10 +172,10 @@ public class TestIndependentPacksWithCyclicDependency {
     static void test2(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
         for (int i = 0; i < RANGE; i+=2) {
             // int and float arrays are two slices. But we pretend both are of type int.
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1);
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1);
-            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0);
-            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4);
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1);
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1);
+            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0);
+            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4);
         }
     }
@@ -248,10 +248,10 @@ public class TestIndependentPacksWithCyclicDependency {
         for (int i = 0; i < RANGE; i+=2) {
             // same as test2, except that reordering leads to different semantics
             // explanation analogue to test4
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A
-            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
-            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); // A
+            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); // X
+            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); // Y
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1); // B
         }
     }
@@ -275,18 +275,18 @@ public class TestIndependentPacksWithCyclicDependency {
                       long[] dataLa, long[] dataLb) {
         for (int i = 0; i < RANGE; i+=2) {
             // Chain of parallelizable op and conversion
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
         }
     }
@@ -307,18 +307,18 @@ public class TestIndependentPacksWithCyclicDependency {
                       long[] dataLa, long[] dataLb) {
         for (int i = 0; i < RANGE; i+=2) {
             // Cycle involving 3 memory slices
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // moved down
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // moved down
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
         }
     }
@@ -340,19 +340,19 @@ public class TestIndependentPacksWithCyclicDependency {
                       long[] dataLa, long[] dataLb) {
         for (int i = 0; i < RANGE; i+=2) {
             // 2-cycle, with more ops after
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
             // more stuff after
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
         }
     }
@@ -373,19 +373,19 @@ public class TestIndependentPacksWithCyclicDependency {
                       long[] dataLa, long[] dataLb) {
         for (int i = 0; i < RANGE; i+=2) {
             // 2-cycle, with more stuff before
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
             // 2-cycle
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
         }
     }
@@ -423,18 +423,18 @@ public class TestIndependentPacksWithCyclicDependency {
             //
             // The cycle thus does not only go via packs, but also scalar ops.
             //
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; // A
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; // R: constant mismatch
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43; // S
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; // U
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; // V
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // B: moved down
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3; // A
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45; // R: constant mismatch
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) + 43; // S
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f; // U
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f; // V
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // B: moved down
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
         }
     }
@@ -463,8 +463,8 @@ public class TestIndependentPacksWithCyclicDependency {
 
     static void verify(String name, float[] data, float[] gold) {
         for (int i = 0; i < RANGE; i++) {
-            int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
-            int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
+            int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
+            int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
             if (datav != goldv) {
                 throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
             }
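Note that verify() compares the float arrays through unsafe.getInt on purpose: a bit-wise comparison also distinguishes NaN payloads and -0.0f from 0.0f, which a plain float comparison would not. A hedged, Unsafe-free equivalent of that check (sketch only; the helper name is made up):

    // floatToRawIntBits preserves NaN bit patterns, unlike floatToIntBits.
    static boolean sameBits(float a, float b) {
        return Float.floatToRawIntBits(a) == Float.floatToRawIntBits(b);
    }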
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java
index 32d69689a42..2be7d52c780 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java
@@ -58,18 +58,18 @@ public class TestIndependentPacksWithCyclicDependency2 {
                       long[] dataLa, long[] dataLb) {
         for (int i = 0; i < RANGE; i+=2) {
             // For explanation, see test 10 in TestIndependentPacksWithCyclicDependency.java
-            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
-            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
-            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43;
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
-            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
-            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
-            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
-            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
-            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // moved down
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
+            int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
+            int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
+            int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) + 43;
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
+            unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
+            float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
+            float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
+            unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
+            int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // moved down
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
         }
     }
@@ -83,8 +83,8 @@ public class TestIndependentPacksWithCyclicDependency2 {
 
     static void verify(String name, float[] data, float[] gold) {
         for (int i = 0; i < RANGE; i++) {
-            int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
-            int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
+            int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
+            int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
             if (datav != goldv) {
                 throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
             }
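As in the companion test, the accesses here deliberately mismatch the element type: 4-byte loads and stores are issued against long[] (and float[]), so packs that look independent per array type can still alias in memory. A sketch of the effect, assuming a jdk.internal.misc.Unsafe instance named unsafe as in these tests (illustration only):

    static void halfStore(long[] a) { // assumes a.length > 0 and a[0] == 0
        unsafe.putInt(a, unsafe.ARRAY_LONG_BASE_OFFSET + 0L, 0x11223344);
        // Only four of the eight bytes of a[0] changed; on a little-endian
        // JVM this prints "11223344".
        System.out.println(Long.toHexString(a[0]));
    }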
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java
index f2ed8b6aec2..c54a684c691 100644
--- a/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestScheduleReordersScalarMemops.java
@@ -124,10 +124,10 @@ public class TestScheduleReordersScalarMemops {
         for (int i = 0; i < RANGE; i+=2) {
             // Do the same as test0, but without int-float conversion.
             // This should reproduce on machines where conversion is not implemented.
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1
-            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
-            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
-            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); // A +1
+            dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); // X
+            dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); // Y
+            unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] * 11); // B *11
         }
     }
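Taken together, the test changes all move the Unsafe index arithmetic from int to long. With "4L * i" or "8L * i" the address math cannot wrap, so the parsed pointer directly has the simple form; with "4 * i" the int product can overflow before the implicit widening to long, which is exactly the situation the new is_safe_to_use_as_simple_form() check must either prove safe or reject. A contrasting sketch of the two shapes (hypothetical helper, not from the patch):

    static void store(long base, long[] src, int i) {
        // Long index math: cannot wrap, so the simple and the
        // compound-long-int pointer form agree at runtime.
        UNSAFE.putLongUnaligned(null, base + 8L * i, src[i]);
        // With "base + 8 * i" the multiply would be done in int and could
        // wrap for large i before the conversion to long.
    }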