8328544: Improve handling of vectorization

Co-authored-by: Christian Hagedorn <chagedorn@openjdk.org>
Reviewed-by: mschoene, kvn, chagedorn, rhalade
Emanuel Peter 2024-05-21 10:31:14 +00:00 committed by Jaikiran Pai
parent 03bc6b359f
commit cfa25b71a6
8 changed files with 936 additions and 166 deletions

View File

@ -416,6 +416,10 @@ VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
#ifdef ASSERT
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
#endif
_has_int_index_after_convI2L(false),
_int_index_after_convI2L_offset(0),
_int_index_after_convI2L_invar(nullptr),
_int_index_after_convI2L_scale(0),
_nstack(nstack), _analyze_only(analyze_only), _stack_idx(0)
#ifndef PRODUCT
, _tracer(vloop.is_trace_pointer_analysis())
@ -495,6 +499,11 @@ VPointer::VPointer(MemNode* const mem, const VLoop& vloop,
return;
}
if (!is_safe_to_use_as_simple_form(base, adr)) {
assert(!valid(), "does not have simple form");
return;
}
_base = base;
_adr = adr;
assert(valid(), "Usable");
@ -508,6 +517,10 @@ VPointer::VPointer(VPointer* p) :
#ifdef ASSERT
_debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr),
#endif
_has_int_index_after_convI2L(false),
_int_index_after_convI2L_offset(0),
_int_index_after_convI2L_invar(nullptr),
_int_index_after_convI2L_scale(0),
_nstack(p->_nstack), _analyze_only(p->_analyze_only), _stack_idx(p->_stack_idx)
#ifndef PRODUCT
, _tracer(p->_tracer._is_trace_alignment)
@ -530,6 +543,354 @@ int VPointer::invar_factor() const {
return 1;
}
// We would like to make decisions about aliasing (i.e. removing memory edges) and adjacency
// (i.e. which loads/stores can be packed) based on the simple form:
//
// s_pointer = adr + offset + invar + scale * ConvI2L(iv)
//
// However, we parse the compound-long-int form:
//
// c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_index)
// int_index = int_offset + int_invar + int_scale * iv
//
// In general, the simple form and the compound-long-int form do not compute the same pointer
// at runtime: for example, an overflow in the int_index truncates the compound-long-int
// result, while the simple form is unaffected by it.
//
// Example:
// For both forms, we have:
// iv = 0
// scale = 1
//
// We now attribute the offset and invar once to the long part and once to the int part:
// Pointer 1 (long offset and long invar):
// long_offset = min_int
// long_invar = min_int
// int_offset = 0
// int_invar = 0
//
// Pointer 2 (int offset and int invar):
// long_offset = 0
// long_invar = 0
// int_offset = min_int
// int_invar = min_int
//
// This gives us the following pointers:
// Compound-long-int form pointers:
// Form:
// c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + int_invar + int_scale * iv)
//
// Pointers:
// c_pointer1 = adr + min_int + min_int + 1 * ConvI2L(0 + 0 + 1 * 0)
// = adr + min_int + min_int
// = adr - 2^32
//
// c_pointer2 = adr + 0 + 0 + 1 * ConvI2L(min_int + min_int + 1 * 0)
// = adr + ConvI2L(min_int + min_int)
// = adr + 0
// = adr
//
// Simple form pointers:
// Form:
// s_pointer = adr + offset + invar + scale * ConvI2L(iv)
// s_pointer = adr + (long_offset + int_offset) + (long_invar + int_invar) + (long_scale * int_scale) * ConvI2L(iv)
//
// Pointers:
// s_pointer1 = adr + (min_int + 0 ) + (min_int + 0 ) + 1 * 0
// = adr + min_int + min_int
// = adr - 2^32
// s_pointer2 = adr + (0 + min_int ) + (0 + min_int ) + 1 * 0
// = adr + min_int + min_int
// = adr - 2^32
//
// We see that the two addresses are actually 2^32 bytes apart (derived from the c_pointers), but their simple forms look identical.
//
// Hence, we need to determine in which cases it is safe to make decisions based on the simple
// form, rather than the compound-long-int form. If we cannot prove that using the simple form
// is safe (i.e. equivalent to the compound-long-int form), then we do not get a valid VPointer,
// and the associated memop cannot be vectorized.
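//
// For intuition, here is a minimal sketch (hypothetical, not part of the source) of the
// truncation in HotSpot's jint/jlong arithmetic that produces the 2^32 gap above:
//
// jint a = min_jint;
// jlong c1 = (jlong)a + (jlong)a; // convert each term separately (simple form): -2^32
// jlong c2 = (jlong)java_add(a, a); // wrapping int add, then convert (compound form): 0
//
// The two results differ by exactly 2^32, just like s_pointer2 and c_pointer2 above.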
bool VPointer::is_safe_to_use_as_simple_form(Node* base, Node* adr) const {
#ifndef _LP64
// On 32-bit platforms, there is never an explicit int_index with ConvI2L for the iv. Thus, the
// parsed pointer form is always the simple form, with int operations:
//
// pointer = adr + offset + invar + scale * iv
//
assert(!_has_int_index_after_convI2L, "32-bit never has an int_index with ConvI2L for the iv");
return true;
#else
// Array accesses that are not Unsafe always have a RangeCheck which ensures that there is no
// int_index overflow. This implies that the conversion to long can be done separately:
//
// ConvI2L(int_index) = ConvI2L(int_offset) + ConvI2L(int_invar) + ConvI2L(int_scale) * ConvI2L(iv)
//
// And hence, the simple form is guaranteed to be identical to the compound-long-int form at
// runtime and the VPointer is safe/valid to be used.
const TypeAryPtr* ary_ptr_t = _mem->adr_type()->isa_aryptr();
if (ary_ptr_t != nullptr) {
if (!_mem->is_unsafe_access()) {
return true;
}
}
// We did not find the int_index. Without it we cannot prove safety, so reject this VPointer.
if (!_has_int_index_after_convI2L) {
return false;
}
int int_offset = _int_index_after_convI2L_offset;
Node* int_invar = _int_index_after_convI2L_invar;
int int_scale = _int_index_after_convI2L_scale;
int long_scale = _scale / int_scale;
// If "int_index = iv", then the simple form is identical to the compound-long-int form.
//
// int_index = int_offset + int_invar + int_scale * iv
// = 0 0 1 * iv
// = iv
if (int_offset == 0 && int_invar == nullptr && int_scale == 1) {
return true;
}
// Intuition: What happens if the int_index overflows? Let us look at two pointers on the "overflow edge":
//
// pointer1 = adr + ConvI2L(int_index1)
// pointer2 = adr + ConvI2L(int_index2)
//
// int_index1 = max_int + 0 = max_int -> very close to but before the overflow
// int_index2 = max_int + 1 = min_int -> just enough to get the overflow
//
// When looking at the difference of pointer1 and pointer2, we notice that it is very large
// (almost 2^32). Since arrays have at most 2^31 elements, chances are high that pointer2 is
// an actual out-of-bounds access at runtime. Such accesses would normally be prevented by
// range checks. However, if the access is done using Unsafe, where range checks are omitted,
// then an out-of-bounds access constitutes undefined behavior. This means that we are allowed
// to do anything, including changing the behavior.
//
// If we can establish the right conditions, we have a guarantee that an overflow is either
// impossible (no overflow, or range checks prevent it) or undefined behavior. In both cases,
// it is safe to vectorize.
//
// Approach: We want to prove a lower bound for the distance between these two pointers, and an
// upper bound for the size of a memory object. We can derive such an upper bound for
// arrays. We know they have at most 2^31 elements. If we know the size of the elements
// in bytes, we have:
//
// array_element_size_in_bytes * 2^31 >= max_possible_array_size_in_bytes
// >= array_size_in_bytes (ARR)
//
// If some small difference "delta" leads to an int_index overflow, we know that the
// int_index1 before overflow must have been close to max_int, and the int_index2 after
// the overflow must be close to min_int:
//
// pointer1 = adr + long_offset + long_invar + long_scale * ConvI2L(int_index1)
// =approx adr + long_offset + long_invar + long_scale * max_int
//
// pointer2 = adr + long_offset + long_invar + long_scale * ConvI2L(int_index2)
// =approx adr + long_offset + long_invar + long_scale * min_int
//
// We realize that the pointer difference is very large:
//
// difference =approx long_scale * 2^32
//
// Hence, if we set the right condition for long_scale and array_element_size_in_bytes,
// we can prove that an overflow is impossible (or would imply undefined behavior).
//
// We must now take this intuition and develop a rigorous proof. We start by stating the problem
// more precisely, with the help of some definitions and the Statement we are going to prove.
//
// Definition:
// Two VPointers are "comparable" (i.e. VPointer::comparable is true, set with VPointer::cmp()),
// iff all of these conditions apply for the simple form:
// 1) Both VPointers are valid.
// 2) The adr are identical, or both are array bases of different arrays.
// 3) They have identical scale.
// 4) They have identical invar.
// 5) The difference in offsets is limited: abs(offset1 - offset2) < 2^31. (DIFF)
//
// For the Vectorization Optimization, we pair-wise compare VPointers and determine if they are:
// 1) "not comparable":
// We do not optimize them (we assume they may alias, and do not assume adjacency).
//
// Whenever we choose this option based on the simple form, it is also correct based on the
// compound-long-int form, since we make no optimizations based on it.
//
// 2) "comparable" with different array bases at runtime:
// We assume they do not alias (remove memory edges), but do not assume adjacency.
//
// Whenever we have two different array bases for the simple form, we also have different
// array bases for the compound-long-int form. Since the VPointers provably point to different
// memory objects, they can never alias.
//
// 3) "comparable" with the same base address:
// We compute the relative pointer difference, and based on the load/store size we can
// compute aliasing and adjacency.
//
// We must find a condition under which the pointer difference of the simple form is
// identical to the pointer difference of the compound-long-int form. We do this with the
// Statement below, which we then proceed to prove.
//
// Statement:
// If two VPointers satisfy these 3 conditions:
// 1) They are "comparable".
// 2) They have the same base address.
// 3) Their long_scale is a multiple of the array element size in bytes:
//
// abs(long_scale) % array_element_size_in_bytes = 0 (A)
//
// Then their pointer difference of the simple form is identical to the pointer difference
// of the compound-long-int form.
//
// More precisely:
// Two such VPointers by definition have identical adr, invar, and scale.
// Their simple form is:
//
// s_pointer1 = adr + offset1 + invar + scale * ConvI2L(iv) (B1)
// s_pointer2 = adr + offset2 + invar + scale * ConvI2L(iv) (B2)
//
// Thus, the pointer difference of the simple forms collapses to the difference in offsets:
//
// s_difference = s_pointer1 - s_pointer2 = offset1 - offset2 (C)
//
// The compound-long-int forms of these two VPointers are:
//
// c_pointer1 = adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1) (D1)
// int_index1 = int_offset1 + int_invar1 + int_scale1 * iv (D2)
//
// c_pointer2 = adr + long_offset2 + long_invar2 + long_scale2 * ConvI2L(int_index2) (D3)
// int_index2 = int_offset2 + int_invar2 + int_scale2 * iv (D4)
//
// These terms relate to the offset1, offset2, invar, and scale of the simple forms (B1) and (B2) as follows:
//
// offset1 = long_offset1 + long_scale1 * ConvI2L(int_offset1) (D5)
// offset2 = long_offset2 + long_scale2 * ConvI2L(int_offset2) (D6)
//
// invar = long_invar1 + long_scale1 * ConvI2L(int_invar1)
// = long_invar2 + long_scale2 * ConvI2L(int_invar2) (D7)
//
// scale = long_scale1 * ConvI2L(int_scale1)
// = long_scale2 * ConvI2L(int_scale2) (D8)
//
// The pointer difference of the compound-long-int form is defined as:
//
// c_difference = c_pointer1 - c_pointer2
//
// Thus, the Statement claims that for the two VPointers we have:
//
// s_difference = c_difference (Statement)
//
// We prove the Statement with the help of a Lemma:
//
// Lemma:
// There is some integer x, such that:
//
// c_difference = s_difference + array_element_size_in_bytes * x * 2^32 (Lemma)
//
// From condition (DIFF), we can derive:
//
// abs(s_difference) < 2^31 (E)
//
// Assuming the Lemma, we prove the Statement:
// If "x = 0" (intuitively: the int_index does not overflow), then:
// c_difference = s_difference
// and hence the simple form computes the same pointer difference as the compound-long-int form.
// If "x != 0" (intuitively: the int_index overflows), then:
// abs(c_difference) = abs(s_difference + array_element_size_in_bytes * x * 2^32)
// -- using abs(x) >= 1 --
// >= array_element_size_in_bytes * 2^32 - abs(s_difference)
// -- apply (E) --
// > array_element_size_in_bytes * 2^32 - 2^31
// >= array_element_size_in_bytes * 2^31
// -- apply (ARR) --
// >= max_possible_array_size_in_bytes
// >= array_size_in_bytes
//
// This shows that c_pointer1 and c_pointer2 have a distance that exceeds the maximum array size.
// Thus, at least one of the two pointers must be outside of the array bounds. But we can assume
// that out-of-bounds accesses do not happen. If they still do, it is undefined behavior. Hence,
// we are allowed to do anything. We can also "safely" use the simple form in this case even though
// it might not match the compound-long-int form at runtime.
// QED Statement.
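//
// Numeric instance (hypothetical values): for a long[] array (array_element_size_in_bytes = 8)
// and x != 0, the chain above gives abs(c_difference) > 8 * 2^32 - 2^31 = 15 * 2^31, which
// exceeds max_possible_array_size_in_bytes = 2^31 * 8 = 8 * 2^31.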
//
// We must now prove the Lemma.
//
// ConvI2L always truncates by some multiple of 2^32, i.e. for any y1 and y2 there is some integer y such that:
//
// ConvI2L(y1 + y2) = ConvI2L(y1) + ConvI2L(y2) + 2^32 * y (F)
//
// It follows that there is an integer y1 such that:
//
// ConvI2L(int_index1) = ConvI2L(int_offset1 + int_invar1 + int_scale1 * iv)
// -- apply (F) --
// = ConvI2L(int_offset1)
// + ConvI2L(int_invar1)
// + ConvI2L(int_scale1) * ConvI2L(iv)
// + y1 * 2^32 (G)
//
// Thus, we can write the compound-long-int form (D1) as:
//
// c_pointer1 = adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1)
// -- apply (G) --
// = adr
// + long_offset1
// + long_invar1
// + long_scale1 * ConvI2L(int_offset1)
// + long_scale1 * ConvI2L(int_invar1)
// + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv)
// + long_scale1 * y1 * 2^32 (H)
//
// And we can write the simple form as:
//
// s_pointer1 = adr + offset1 + invar + scale * ConvI2L(iv)
// -- apply (D5, D7, D8) --
// = adr
// + long_offset1
// + long_scale1 * ConvI2L(int_offset1)
// + long_invar1
// + long_scale1 * ConvI2L(int_invar1)
// + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv) (K)
//
// We now compute the pointer difference between the simple (K) and compound-long-int form (H).
// Most terms cancel out immediately:
//
// sc_difference1 = c_pointer1 - s_pointer1 = long_scale1 * y1 * 2^32 (L)
//
// Rearranging the equation (L), we get:
//
// c_pointer1 = s_pointer1 + long_scale1 * y1 * 2^32 (M)
//
// And since long_scale1 is a multiple of array_element_size_in_bytes, there is some integer
// x1, such that (M) implies:
//
// c_pointer1 = s_pointer1 + array_element_size_in_bytes * x1 * 2^32 (N)
//
// With an analogous equation for c_pointer2, we can now compute the pointer difference for
// the compound-long-int form:
//
// c_difference = c_pointer1 - c_pointer2
// -- apply (N) --
// = s_pointer1 + array_element_size_in_bytes * x1 * 2^32
// -(s_pointer2 + array_element_size_in_bytes * x2 * 2^32)
// -- where "x = x1 - x2" --
// = s_pointer1 - s_pointer2 + array_element_size_in_bytes * x * 2^32
// -- apply (C) --
// = s_difference + array_element_size_in_bytes * x * 2^32
// QED Lemma.
if (ary_ptr_t != nullptr) {
BasicType array_element_bt = ary_ptr_t->elem()->array_element_basic_type();
if (is_java_primitive(array_element_bt)) {
int array_element_size_in_bytes = type2aelembytes(array_element_bt);
if (abs(long_scale) % array_element_size_in_bytes == 0) {
return true;
}
}
}
// General case: we do not know if it is safe to use the simple form.
return false;
#endif
}
bool VPointer::is_loop_member(Node* n) const {
Node* n_c = phase()->get_ctrl(n);
return lpt()->is_member(phase()->get_loop(n_c));
@ -632,6 +993,37 @@ bool VPointer::scaled_iv(Node* n) {
NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
return true;
}
} else if (opc == Op_ConvI2L && !has_iv()) {
// So far we have not found the iv, and are about to enter a ConvI2L subgraph,
// which may be the (potentially overflowing) int index of the memory access, of the form:
//
// int_index = int_offset + int_invar + int_scale * iv
//
// If we simply continued parsing with the current VPointer, the int_offset and
// int_invar would just be added to the long offset and invar. But for the checks in
// VPointer::is_safe_to_use_as_simple_form() we need explicit access to the
// int_index. Thus, we must parse it separately here, using a temporary
// VPointer to pattern-match the int_index sub-expression of the address.
NOT_PRODUCT(Tracer::Depth dddd;)
VPointer tmp(this);
NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
if (tmp.scaled_iv_plus_offset(n->in(1)) && tmp.has_iv()) {
// We successfully matched an integer index, of the form:
// int_index = int_offset + int_invar + int_scale * iv
_has_int_index_after_convI2L = true;
_int_index_after_convI2L_offset = tmp._offset;
_int_index_after_convI2L_invar = tmp._invar;
_int_index_after_convI2L_scale = tmp._scale;
}
// Now parse it again for the real VPointer. This makes sure that the int_offset, int_invar,
// and int_scale are properly added to the final VPointer's offset, invar, and scale.
if (scaled_iv_plus_offset(n->in(1))) {
NOT_PRODUCT(_tracer.scaled_iv_7(n);)
return true;
}
} else if (opc == Op_ConvI2L || opc == Op_CastII) {
if (scaled_iv_plus_offset(n->in(1))) {
NOT_PRODUCT(_tracer.scaled_iv_7(n);)
@ -648,8 +1040,17 @@ bool VPointer::scaled_iv(Node* n) {
if (tmp.scaled_iv_plus_offset(n->in(1))) {
int scale = n->in(2)->get_int();
// Accumulate scale.
_scale = tmp._scale << scale;
// Accumulate offset.
int shifted_offset = 0;
if (!try_LShiftI_no_overflow(tmp._offset, scale, shifted_offset)) {
return false; // shift overflow.
}
if (!try_AddI_no_overflow(_offset, shifted_offset, _offset)) {
return false; // add overflow.
}
// Accumulate invar.
if (tmp._invar != nullptr) {
BasicType bt = tmp._invar->bottom_type()->basic_type();
assert(bt == T_INT || bt == T_LONG, "");
@ -658,6 +1059,13 @@ bool VPointer::scaled_iv(Node* n) {
_debug_invar_scale = n->in(2);
#endif
}
// Forward info about the int_index:
_has_int_index_after_convI2L = tmp._has_int_index_after_convI2L;
_int_index_after_convI2L_offset = tmp._int_index_after_convI2L_offset;
_int_index_after_convI2L_invar = tmp._int_index_after_convI2L_invar;
_int_index_after_convI2L_scale = tmp._int_index_after_convI2L_scale;
NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar);)
return true;
}
@ -675,7 +1083,9 @@ bool VPointer::offset_plus_k(Node* n, bool negate) {
int opc = n->Opcode();
if (opc == Op_ConI) {
if (!try_AddSubI_no_overflow(_offset, n->get_int(), negate, _offset)) {
return false; // add/sub overflow.
}
NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
return true;
} else if (opc == Op_ConL) {
@ -684,7 +1094,9 @@ bool VPointer::offset_plus_k(Node* n, bool negate) {
if (t->higher_equal(TypeLong::INT)) {
jlong loff = n->get_long();
jint off = (jint)loff;
if (!try_AddSubI_no_overflow(_offset, off, negate, _offset)) {
return false; // add/sub overflow.
}
NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
return true;
}
@ -699,11 +1111,15 @@ bool VPointer::offset_plus_k(Node* n, bool negate) {
if (opc == Op_AddI) {
if (n->in(2)->is_Con() && invariant(n->in(1))) {
maybe_add_to_invar(n->in(1), negate);
if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), negate, _offset)) {
return false; // add/sub overflow.
}
NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, negate, _offset);)
return true;
} else if (n->in(1)->is_Con() && invariant(n->in(2))) {
if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) {
return false; // add/sub overflow.
}
maybe_add_to_invar(n->in(2), negate);
NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, negate, _offset);)
return true;
@ -712,11 +1128,15 @@ bool VPointer::offset_plus_k(Node* n, bool negate) {
if (opc == Op_SubI) {
if (n->in(2)->is_Con() && invariant(n->in(1))) {
maybe_add_to_invar(n->in(1), negate);
if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), !negate, _offset)) {
return false; // add/sub overflow.
}
NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, negate, _offset);)
return true;
} else if (n->in(1)->is_Con() && invariant(n->in(2))) {
if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) {
return false; // add/sub overflow.
}
maybe_add_to_invar(n->in(2), !negate);
NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, !negate, _offset);)
return true;
@ -806,6 +1226,44 @@ void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) {
_invar = register_if_new(add);
}
bool VPointer::try_AddI_no_overflow(int offset1, int offset2, int& result) {
jlong long_offset = java_add((jlong)(offset1), (jlong)(offset2));
jint int_offset = java_add( offset1, offset2);
if (long_offset != int_offset) {
return false;
}
result = int_offset;
return true;
}
bool VPointer::try_SubI_no_overflow(int offset1, int offset2, int& result) {
jlong long_offset = java_subtract((jlong)(offset1), (jlong)(offset2));
jint int_offset = java_subtract( offset1, offset2);
if (long_offset != int_offset) {
return false;
}
result = int_offset;
return true;
}
bool VPointer::try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result) {
if (is_sub) {
return try_SubI_no_overflow(offset1, offset2, result);
} else {
return try_AddI_no_overflow(offset1, offset2, result);
}
}
bool VPointer::try_LShiftI_no_overflow(int offset, int shift, int& result) {
jlong long_offset = java_shift_left((jlong)(offset), shift);
jint int_offset = java_shift_left( offset, shift);
if (long_offset != int_offset) {
return false;
}
result = int_offset;
return true;
}
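// A usage sketch (hypothetical) of the widen-and-compare idiom implemented above: each helper
// performs the operation in both jlong and jint, and the results agree iff the jint operation
// did not overflow.
//
// int result;
// try_AddI_no_overflow(max_jint, 1, result); // false: jlong sum 2^31 != jint sum min_jint
// try_AddI_no_overflow(1, 2, result); // true, result == 3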
// We use two comparisons, because a subtraction could underflow.
#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \
if (a < b) { return -1; } \

View File

@ -670,13 +670,51 @@ private:
// A vectorization pointer (VPointer) has information about an address for
// dependence checking and vector alignment. It's usually bound to a memory
// operation in a counted loop for vectorization analysis.
//
// We parse and represent pointers of the simple form:
//
// pointer = adr + offset + invar + scale * ConvI2L(iv)
//
// Where:
//
// adr: the base address of an array (base = adr)
// OR
// an address into off-heap memory (base = TOP)
//
// offset: a constant offset
// invar: a runtime variable, which is invariant during the loop
// scale: scaling factor
// iv: loop induction variable
//
// But more precisely, we parse the compound-long-int form:
//
// pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + int_invar + int_scale * iv)
//
// or equivalently, with the int part factored out as int_index:
//
// pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_index)
// int_index = int_offset + int_invar + int_scale * iv
//
// However, for aliasing and adjacency checks (e.g. VPointer::cmp()) we always use the simple form to make
// decisions. Hence, we must make sure to only create a "valid" VPointer if the optimizations based on the
// simple form produce the same result as the compound-long-int form would. Intuitively, this depends on
// whether the int_index overflows; the precise conditions are given in VPointer::is_safe_to_use_as_simple_form().
//
// If the int_index cannot overflow, the conversion to long distributes over it:
//
// ConvI2L(int_index) = ConvI2L(int_offset + int_invar + int_scale * iv)
// = ConvI2L(int_offset) + ConvI2L(int_invar) + ConvI2L(int_scale) * ConvI2L(iv)
//
// and the compound-long-int form collapses to the simple form, with:
//
// scale = long_scale * ConvI2L(int_scale)
// offset = long_offset + long_scale * ConvI2L(int_offset)
// invar = long_invar + long_scale * ConvI2L(int_invar)
//
// pointer = adr + offset + invar + scale * ConvI2L(iv)
//
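// A worked instance (hypothetical values): suppose we parse
//
// pointer = adr + 16 + 8 * ConvI2L(0 + inv + 1 * iv)
//
// i.e. long_offset = 16, long_invar = 0, long_scale = 8, int_offset = 0, int_invar = inv,
// and int_scale = 1. The simple-form components are then:
//
// scale = 8 * 1 = 8, offset = 16 + 8 * 0 = 16, invar = 0 + 8 * ConvI2L(inv)
//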
class VPointer : public ArenaObj {
protected:
MemNode* const _mem; // My memory reference node
const VLoop& _vloop;
// Components of the simple form:
Node* _base; // Base address of an array OR null if some off-heap memory.
Node* _adr; // Same as _base if an array pointer OR some off-heap memory pointer.
int _scale; // multiplier for iv (in bytes), 0 if no loop iv
int _offset; // constant offset (in bytes)
@ -687,6 +725,13 @@ class VPointer : public ArenaObj {
Node* _debug_invar_scale; // multiplier for invariant
#endif
// The int_index components of the compound-long-int form. Used to decide if it is safe to use the
// simple form rather than the compound-long-int form that was parsed.
bool _has_int_index_after_convI2L;
int _int_index_after_convI2L_offset;
Node* _int_index_after_convI2L_invar;
int _int_index_after_convI2L_scale;
Node_Stack* _nstack; // stack used to record a vpointer trace of variants
bool _analyze_only; // Used in loop unrolling only for vpointer trace
uint _stack_idx; // Used in loop unrolling only for vpointer trace
@ -726,6 +771,8 @@ class VPointer : public ArenaObj {
VPointer(VPointer* p);
NONCOPYABLE(VPointer);
bool is_safe_to_use_as_simple_form(Node* base, Node* adr) const;
public:
bool valid() const { return _adr != nullptr; }
bool has_iv() const { return _scale != 0; }
@ -751,10 +798,43 @@ class VPointer : public ArenaObj {
return _invar == q._invar;
}
// We compute if and how two VPointers can alias at runtime, i.e. if the two addressed regions of memory can
// ever overlap. There are essentially 3 relevant return states:
// - NotComparable: Synonymous with "unknown aliasing".
// We have no information about how the two VPointers can alias. They could overlap, refer
// to another location in the same memory object, or point to a completely different object.
// -> Memory edge required. Aliasing unlikely but possible.
//
// - Less / Greater: Synonymous with "never aliasing".
// The two VPointers may point into the same memory object, but be non-aliasing (i.e. we
// know both address regions inside the same memory object, but these regions are non-
// overlapping), or the VPointers point to entirely different objects.
// -> No memory edge required. Aliasing impossible.
//
// - Equal: Synonymous with "overlap, or point to different memory objects".
// The two VPointers either overlap on the same memory object, or point to two different
// memory objects.
// -> Memory edge required. Aliasing likely.
//
// In a future refactoring, we can simplify to two states:
// - NeverAlias: instead of Less / Greater
// - MayAlias: instead of Equal / NotComparable
//
// Two VPointers are "comparable" (Less / Greater / Equal) iff all of these conditions apply:
// 1) Both are valid, i.e. expressible in the compound-long-int or simple form.
// 2) The adr are identical, or both are array bases of different arrays.
// 3) They have identical scale.
// 4) They have identical invar.
// 5) The difference in offsets is limited: abs(offset1 - offset2) < 2^31.
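//
// For example (hypothetical values): two VPointers with identical adr, scale, invar, and
// memory_size() = 8 are adjacent (Less) for offsets 0 and 8, overlapping (Equal) for
// offsets 0 and 4, and NotComparable for offsets 0 and 2^31 (condition 5 fails).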
int cmp(const VPointer& q) const {
if (valid() && q.valid() &&
(_adr == q._adr || (_base == _adr && q._base == q._adr)) &&
_scale == q._scale && invar_equals(q)) {
jlong difference = abs(java_subtract((jlong)_offset, (jlong)q._offset));
jlong max_diff = (jlong)1 << 31;
if (difference >= max_diff) {
return NotComparable;
}
bool overlap = q._offset < _offset + memory_size() &&
_offset < q._offset + q.memory_size();
return overlap ? Equal : (_offset < q._offset ? Less : Greater);
@ -859,6 +939,11 @@ class VPointer : public ArenaObj {
void maybe_add_to_invar(Node* new_invar, bool negate);
static bool try_AddI_no_overflow(int offset1, int offset2, int& result);
static bool try_SubI_no_overflow(int offset1, int offset2, int& result);
static bool try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result);
static bool try_LShiftI_no_overflow(int offset, int shift, int& result);
Node* register_if_new(Node* n) const;
};

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2023, Red Hat, Inc. All rights reserved.
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -35,7 +36,6 @@ import java.nio.ByteOrder;
* @test
* @bug 8300258
* @key randomness
* @summary C2: vectorization fails on simple ByteBuffer loop
* @modules java.base/jdk.internal.misc
* @library /test/lib /
@ -147,193 +147,420 @@ public class TestVectorizationMismatchedAccess {
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteLong1a(byte[] dest, long[] src) {
for (int i = 0; i < src.length; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i, src[i]);
}
}
@Run(test = "testByteLong1")
public static void testByteLong1_runner() {
runAndVerify(() -> testByteLong1(byteArray, longArray), 0);
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteLong1b(byte[] dest, long[] src) {
for (int i = 0; i < src.length; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i, src[i]);
}
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"})
public static void testByteLong1c(byte[] dest, long[] src) {
long base = 64; // make sure it is big enough and 8 byte aligned (required for 32-bit)
for (int i = 0; i < src.length - 8; i++) {
UNSAFE.putLongUnaligned(dest, base + 8 * i, src[i]);
}
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteLong1d(byte[] dest, long[] src) {
long base = 64; // make sure it is big enough and 8 byte aligned (required for 32-bit)
for (int i = 0; i < src.length - 8; i++) {
UNSAFE.putLongUnaligned(dest, base + 8L * i, src[i]);
}
}
@Run(test = {"testByteLong1a", "testByteLong1b", "testByteLong1c", "testByteLong1d"})
public static void testByteLong1_runner() {
runAndVerify(() -> testByteLong1a(byteArray, longArray), 0);
runAndVerify(() -> testByteLong1b(byteArray, longArray), 0);
testByteLong1c(byteArray, longArray);
testByteLong1d(byteArray, longArray);
}
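// Note on the a/b variants (our reading of the pattern, not a comment from the source): in the
// "a" tests the offset expression "8 * i" is computed in int arithmetic, so C2 parses
// int_scale = 8 and long_scale = 1; in the "b" tests "8L * i" widens i first, giving
// long_scale = 8 and int_index = iv. Both shapes must pass
// VPointer::is_safe_to_use_as_simple_form() to be vectorized.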
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteLong2a(byte[] dest, long[] src) {
for (int i = 1; i < src.length; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i - 1), src[i]);
}
}
@Run(test = "testByteLong2")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteLong2b(byte[] dest, long[] src) {
for (int i = 1; i < src.length; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i - 1), src[i]);
}
}
@Run(test = {"testByteLong2a", "testByteLong2b"})
public static void testByteLong2_runner() {
runAndVerify(() -> testByteLong2a(byteArray, longArray), -8);
runAndVerify(() -> testByteLong2b(byteArray, longArray), -8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteLong3a(byte[] dest, long[] src) {
for (int i = 0; i < src.length - 1; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + 1), src[i]);
}
}
@Run(test = "testByteLong3")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteLong3b(byte[] dest, long[] src) {
for (int i = 0; i < src.length - 1; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + 1), src[i]);
}
}
@Run(test = {"testByteLong3a", "testByteLong3b"})
public static void testByteLong3_runner() {
runAndVerify(() -> testByteLong3a(byteArray, longArray), 8);
runAndVerify(() -> testByteLong3b(byteArray, longArray), 8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIf = {"AlignVector", "false"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
// AlignVector cannot guarantee that invar is aligned.
public static void testByteLong4a(byte[] dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, 8 * i + baseOffset, src[i]);
}
}
@Run(test = "testByteLong4")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"},
applyIf = {"AlignVector", "false"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
// AlignVector cannot guarantee that invar is aligned.
public static void testByteLong4b(byte[] dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, 8L * i + baseOffset, src[i]);
}
}
@Run(test = {"testByteLong4a", "testByteLong4b"})
public static void testByteLong4_runner() {
baseOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET;
runAndVerify(() -> testByteLong4a(byteArray, longArray, 0, size), 0);
runAndVerify(() -> testByteLong4b(byteArray, longArray, 0, size), 0);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteLong5a(byte[] dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + baseOffset), src[i]);
}
}
@Run(test = "testByteLong5")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteLong5b(byte[] dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + baseOffset), src[i]);
}
}
@Run(test = {"testByteLong5a", "testByteLong5b"})
public static void testByteLong5_runner() {
baseOffset = 1;
runAndVerify(() -> testByteLong5a(byteArray, longArray, 0, size-1), 8);
runAndVerify(() -> testByteLong5b(byteArray, longArray, 0, size-1), 8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteByte1a(byte[] dest, byte[] src) {
for (int i = 0; i < src.length / 8; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i));
}
}
@Run(test = "testByteByte1")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteByte1b(byte[] dest, byte[] src) {
for (int i = 0; i < src.length / 8; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i));
}
}
@Run(test = {"testByteByte1a", "testByteByte1b"})
public static void testByteByte1_runner() {
runAndVerify2(() -> testByteByte1a(byteArray, byteArray), 0);
runAndVerify2(() -> testByteByte1b(byteArray, byteArray), 0);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: offsets are badly aligned (UNSAFE.ARRAY_BYTE_BASE_OFFSET is 4 byte aligned, but not 8 byte aligned).
// might get fixed with JDK-8325155.
public static void testByteByte2a(byte[] dest, byte[] src) {
for (int i = 1; i < src.length / 8; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i - 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i));
}
}
@Run(test = "testByteByte2")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
applyIfCPUFeatureOr = {"sse2", "true", "asimd", "true"},
applyIfPlatform = {"64-bit", "true"})
// 32-bit: address has ConvL2I for cast of long to address, not supported.
public static void testByteByte2b(byte[] dest, byte[] src) {
for (int i = 1; i < src.length / 8; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i - 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i));
}
}
@Run(test = {"testByteByte2a", "testByteByte2b"})
public static void testByteByte2_runner() {
runAndVerify2(() -> testByteByte2a(byteArray, byteArray), -8);
runAndVerify2(() -> testByteByte2b(byteArray, byteArray), -8);
}
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte3a(byte[] dest, byte[] src) {
for (int i = 0; i < src.length / 8 - 1; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i));
}
}
@Run(test = "testByteByte3")
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte3b(byte[] dest, byte[] src) {
for (int i = 0; i < src.length / 8 - 1; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + 1), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i));
}
}
@Run(test = {"testByteByte3a", "testByteByte3b"})
public static void testByteByte3_runner() {
runAndVerify2(() -> testByteByte3a(byteArray, byteArray), 8);
runAndVerify2(() -> testByteByte3b(byteArray, byteArray), 8);
}
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte4a(byte[] dest, byte[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, 8 * i + baseOffset, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i));
}
}
@Run(test = "testByteByte4")
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte4b(byte[] dest, byte[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, 8L * i + baseOffset, UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i));
}
}
@Run(test = {"testByteByte4a", "testByteByte4b"})
public static void testByteByte4_runner() {
baseOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET;
runAndVerify2(() -> testByteByte4a(byteArray, byteArray, 0, size), 0);
runAndVerify2(() -> testByteByte4b(byteArray, byteArray, 0, size), 0);
}
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte5a(byte[] dest, byte[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * (i + baseOffset), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8 * i));
}
}
@Run(test = "testByteByte5")
@Test
@IR(failOn = { IRNode.LOAD_VECTOR_L, IRNode.STORE_VECTOR })
public static void testByteByte5b(byte[] dest, byte[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(dest, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * (i + baseOffset), UNSAFE.getLongUnaligned(src, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 8L * i));
}
}
@Run(test = {"testByteByte5a", "testByteByte5b"})
public static void testByteByte5_runner() {
baseOffset = 1;
runAndVerify2(() -> testByteByte5a(byteArray, byteArray, 0, size-1), 8);
runAndVerify2(() -> testByteByte5b(byteArray, byteArray, 0, size-1), 8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P(dest + 8 * (i + int_con))
// See: JDK-8331576
public static void testOffHeapLong1a(long dest, long[] src) {
for (int i = 0; i < src.length; i++) {
UNSAFE.putLongUnaligned(null, dest + 8 * i, src[i]);
}
}
@Run(test = "testOffHeapLong1")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P(dest + 8L * (i + int_con))
// See: JDK-8331576
public static void testOffHeapLong1b(long dest, long[] src) {
for (int i = 0; i < src.length; i++) {
UNSAFE.putLongUnaligned(null, dest + 8L * i, src[i]);
}
}
@Run(test = {"testOffHeapLong1a", "testOffHeapLong1b"})
public static void testOffHeapLong1_runner() {
runAndVerify3(() -> testOffHeapLong1a(baseOffHeap, longArray), 0);
runAndVerify3(() -> testOffHeapLong1b(baseOffHeap, longArray), 0);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P
// See: JDK-8331576
public static void testOffHeapLong2a(long dest, long[] src) {
for (int i = 1; i < src.length; i++) {
UNSAFE.putLongUnaligned(null, dest + 8 * (i - 1), src[i]);
}
}
@Run(test = "testOffHeapLong2")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P
// See: JDK-8331576
public static void testOffHeapLong2b(long dest, long[] src) {
for (int i = 1; i < src.length; i++) {
UNSAFE.putLongUnaligned(null, dest + 8L * (i - 1), src[i]);
}
}
@Run(test = {"testOffHeapLong2a", "testOffHeapLong2b"})
public static void testOffHeapLong2_runner() {
runAndVerify3(() -> testOffHeapLong2a(baseOffHeap, longArray), -8);
runAndVerify3(() -> testOffHeapLong2b(baseOffHeap, longArray), -8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P
// See: JDK-8331576
public static void testOffHeapLong3a(long dest, long[] src) {
for (int i = 0; i < src.length - 1; i++) {
UNSAFE.putLongUnaligned(null, dest + 8 * (i + 1), src[i]);
}
}
@Run(test = "testOffHeapLong3")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" })
// FAILS: adr is CastX2P
// See: JDK-8331576
public static void testOffHeapLong3b(long dest, long[] src) {
for (int i = 0; i < src.length - 1; i++) {
UNSAFE.putLongUnaligned(null, dest + 8L * (i + 1), src[i]);
}
}
@Run(test = {"testOffHeapLong3a", "testOffHeapLong3b"})
public static void testOffHeapLong3_runner() {
runAndVerify3(() -> testOffHeapLong3a(baseOffHeap, longArray), 8);
runAndVerify3(() -> testOffHeapLong3b(baseOffHeap, longArray), 8);
}
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
// applyIf = {"AlignVector", "false"})
// FAILS: adr is CastX2P
// See: JDK-8331576
// AlignVector cannot guarantee that invar is aligned.
public static void testOffHeapLong4a(long dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(null, dest + 8 * i + baseOffset, src[i]);
}
}
@Run(test = "testOffHeapLong4")
@Test
@IR(counts = { IRNode.LOAD_VECTOR_L, "=0", IRNode.STORE_VECTOR, "=0" }) // temporary
// @IR(counts = { IRNode.LOAD_VECTOR_L, ">=1", IRNode.STORE_VECTOR, ">=1" },
// applyIf = {"AlignVector", "false"})
// FAILS: adr is CastX2P
// See: JDK-8331576
// AlignVector cannot guarantee that invar is aligned.
public static void testOffHeapLong4b(long dest, long[] src, int start, int stop) {
for (int i = start; i < stop; i++) {
UNSAFE.putLongUnaligned(null, dest + 8L * i + baseOffset, src[i]);
}
}
@Run(test = {"testOffHeapLong4a", "testOffHeapLong4b"})
public static void testOffHeapLong4_runner() {
baseOffset = 8;
runAndVerify3(() -> testOffHeapLong4a(baseOffHeap, longArray, 0, size-1), 8);
runAndVerify3(() -> testOffHeapLong4b(baseOffHeap, longArray, 0, size-1), 8);
}
}

View File

@ -1363,7 +1363,7 @@ public class TestAlignVector {
static Object[] test17a(long[] a) {
// Unsafe: vectorizes with profiling (not xcomp)
for (int i = 0; i < RANGE; i++) {
long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
long v = UNSAFE.getLongUnaligned(a, adr);
UNSAFE.putLongUnaligned(a, adr, v + 1);
}
@ -1375,7 +1375,7 @@ public class TestAlignVector {
static Object[] test17b(long[] a) {
// Not alignable
for (int i = 0; i < RANGE-1; i++) {
long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
long v = UNSAFE.getLongUnaligned(a, adr);
UNSAFE.putLongUnaligned(a, adr, v + 1);
}
@ -1392,7 +1392,7 @@ public class TestAlignVector {
static Object[] test17c(long[] a) {
// Unsafe: aligned vectorizes
for (int i = 0; i < RANGE-1; i+=4) {
long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i;
long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);
@ -1422,7 +1422,7 @@ public class TestAlignVector {
static Object[] test17d(long[] a) {
// Not alignable
for (int i = 0; i < RANGE-1; i+=4) {
long adr = UNSAFE.ARRAY_LONG_BASE_OFFSET + 8L * i + 1;
long v0 = UNSAFE.getLongUnaligned(a, adr + 0);
long v1 = UNSAFE.getLongUnaligned(a, adr + 8);
UNSAFE.putLongUnaligned(a, adr + 0, v0 + 1);

View File

@ -1090,11 +1090,11 @@ public class TestAlignVectorFuzzer {
int init = init_con_or_var();
int limit = limit_con_or_var();
int stride = stride_con();
long scale = scale_con();
long offset = offset1_con_or_var();
for (int i = init; i < limit; i += stride) {
long adr = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset + i * scale;
int v = UNSAFE.getIntUnaligned(a, adr);
UNSAFE.putIntUnaligned(a, adr, v + 1);
}
@ -1105,19 +1105,19 @@ public class TestAlignVectorFuzzer {
int init = init_con_or_var();
int limit = limit_con_or_var();
int stride = stride_con();
long scale = scale_con();
long offset1 = offset1_con_or_var();
long offset2 = offset2_con_or_var();
long offset3 = offset3_con_or_var();
int h1 = hand_unrolling1_con();
int h2 = hand_unrolling2_con();
int h3 = hand_unrolling3_con();
for (int i = init; i < limit; i += stride) {
long adr1 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset1 + i * scale;
long adr2 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset2 + i * scale;
long adr3 = UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset3 + i * scale;
if (h1 >= 1) { UNSAFE.putIntUnaligned(a, adr1 + 0*4, UNSAFE.getIntUnaligned(a, adr1 + 0*4) + 1); }
if (h1 >= 2) { UNSAFE.putIntUnaligned(a, adr1 + 1*4, UNSAFE.getIntUnaligned(a, adr1 + 1*4) + 1); }

View File

@ -172,10 +172,10 @@ public class TestIndependentPacksWithCyclicDependency {
static void test2(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) {
for (int i = 0; i < RANGE; i+=2) {
// int and float arrays are two slices. But we pretend both are of type int.
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1);
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1);
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0);
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4);
}
}
@ -248,10 +248,10 @@ public class TestIndependentPacksWithCyclicDependency {
for (int i = 0; i < RANGE; i+=2) {
// same as test2, except that reordering leads to different semantics
// explanation analogue to test4
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); // A
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] + 1); // B
}
}
@ -275,18 +275,18 @@ public class TestIndependentPacksWithCyclicDependency {
long[] dataLa, long[] dataLb) {
for (int i = 0; i < RANGE; i+=2) {
// Chain of parallelizable op and conversion
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
}
}
@ -307,18 +307,18 @@ public class TestIndependentPacksWithCyclicDependency {
long[] dataLa, long[] dataLb) {
for (int i = 0; i < RANGE; i+=2) {
// Cycle involving 3 memory slices
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // moved down
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
}
}
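
The `// moved down` load is the interesting part of this variant: once arrays alias, moving a load past a store to the same memory slice changes the value it observes, which is why the ordering of the packed and scalar memory ops matters here. A tiny illustration with plain arrays (hypothetical demo, assuming aliasing comparable to what the test runs arrange):

// AliasReorderDemo.java -- illustrative only.
public class AliasReorderDemo {
    public static void main(String[] args) {
        int[] a = {1, 2};
        int[] b = a;            // aliased references to the same array
        int before = a[0] + 3;  // load before the store: sees 1, computes 4
        b[0] = 100;             // store through the alias
        int after = a[0] + 3;   // the same load "moved down": sees 100, computes 103
        System.out.println(before + " vs " + after);  // 4 vs 103
    }
}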
@@ -340,19 +340,19 @@ public class TestIndependentPacksWithCyclicDependency {
long[] dataLa, long[] dataLb) {
for (int i = 0; i < RANGE; i+=2) {
// 2-cycle, with more ops after
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
// more stuff after
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
}
}
@@ -373,19 +373,19 @@ public class TestIndependentPacksWithCyclicDependency {
long[] dataLa, long[] dataLb) {
for (int i = 0; i < RANGE; i+=2) {
// 2-cycle, with more stuff before
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
// 2-cycle
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) * 45;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
}
}
@@ -423,18 +423,18 @@ public class TestIndependentPacksWithCyclicDependency {
//
// The cycle thus does not only go via packs, but also scalar ops.
//
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; // A
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; // R: constant mismatch
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43; // S
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; // U
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; // V
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // B: moved down
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3; // A
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45; // R: constant mismatch
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) + 43; // S
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f; // U
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f; // V
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // B: moved down
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
}
}
@@ -463,8 +463,8 @@ public class TestIndependentPacksWithCyclicDependency {
static void verify(String name, float[] data, float[] gold) {
for (int i = 0; i < RANGE; i++) {
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
if (datav != goldv) {
throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
}
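
Note that verify compares the float arrays by their raw 32-bit patterns rather than with `==`, so purely bitwise differences such as NaN payloads or 0.0f versus -0.0f are detected. An Unsafe-free equivalent (a hypothetical helper, not part of the test) could look like:

// Hypothetical Unsafe-free equivalent of the bitwise verify above.
static void verifyBitwise(String name, float[] data, float[] gold) {
    for (int i = 0; i < data.length; i++) {
        int datav = Float.floatToRawIntBits(data[i]);
        int goldv = Float.floatToRawIntBits(gold[i]);
        if (datav != goldv) {
            throw new RuntimeException("Invalid " + name + " result: dataF[" + i + "]: "
                                       + datav + " != " + goldv);
        }
    }
}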


@@ -58,18 +58,18 @@ public class TestIndependentPacksWithCyclicDependency2 {
long[] dataLa, long[] dataLb) {
for (int i = 0; i < RANGE; i+=2) {
// For explanation, see test 10 in TestIndependentPacksWithCyclicDependency.java
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // moved down
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01);
int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0) + 3;
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, v00);
int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0) * 45;
int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4) + 43;
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0, v10);
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4, v11);
float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 0) + 0.55f;
float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4L * i + 4) + 0.55f;
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0, v20);
unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4, v21);
int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4) + 3; // moved down
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, v01);
}
}
@@ -83,8 +83,8 @@ public class TestIndependentPacksWithCyclicDependency2 {
static void verify(String name, float[] data, float[] gold) {
for (int i = 0; i < RANGE; i++) {
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i);
int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i);
if (datav != goldv) {
throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv);
}


@@ -124,10 +124,10 @@ public class TestScheduleReordersScalarMemops {
for (int i = 0; i < RANGE; i+=2) {
// Do the same as test0, but without int-float conversion.
// This should reproduce on machines where conversion is not implemented.
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +1
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] * 11); // B *11
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 0, dataIa[i+0] + 1); // A +1
dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 0); // X
dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4L * i + 4); // Y
unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * i + 4, dataIa[i+1] * 11); // B *11
}
}
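
These tests rely on Unsafe to reinterpret the same array memory under a different primitive type, for example storing int bit patterns into a float[]. A self-contained sketch of that reinterpretation outside the test harness (the demo class is hypothetical; it assumes sun.misc.Unsafe is reachable, as it is on standard JDKs via the jdk.unsupported module):

import java.lang.reflect.Field;
import sun.misc.Unsafe;

// MixedAccessDemo.java -- illustrative only: type-punning an int store into a float[].
public class MixedAccessDemo {
    public static void main(String[] args) throws Exception {
        Field f = Unsafe.class.getDeclaredField("theUnsafe");
        f.setAccessible(true);
        Unsafe unsafe = (Unsafe) f.get(null);

        float[] dataF = new float[4];
        // Store the raw bits of the int 42 at element index 2:
        unsafe.putInt(dataF, Unsafe.ARRAY_FLOAT_BASE_OFFSET + 4L * 2, 42);
        // Typed access now sees the reinterpreted bit pattern:
        System.out.println(dataF[2]);                           // ~5.9E-44 (denormal)
        System.out.println(Float.floatToRawIntBits(dataF[2]));  // 42
    }
}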