8293188: x86_64: Introduce stubGenerator_x86_64.hpp

Reviewed-by: dholmes, kvn
Author: Vladimir Ivanov
Date:   2022-09-02 17:05:51 +00:00
parent 2baeebbc02
commit 0c6094e796
2 changed files with 7971 additions and 7486 deletions

File diff suppressed because it is too large

@@ -0,0 +1,454 @@
/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_X86_STUBGENERATOR_X86_64_HPP
#define CPU_X86_STUBGENERATOR_X86_64_HPP
#include "code/codeBlob.hpp"
#include "runtime/continuation.hpp"
#include "runtime/stubCodeGenerator.hpp"
// Stub Code definitions
class StubGenerator: public StubCodeGenerator {
private:
// Call stubs are used to call Java from C.
address generate_call_stub(address& return_address);
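// For reference, a sketch (not part of this header) of how the generated
// stub is reached: callers go through StubRoutines::call_stub(), whose C
// signature is declared in stubRoutines.hpp along these lines:
//
//   typedef void (*CallStub)(address   link,
//                            intptr_t* result,
//                            BasicType result_type,
//                            Method*   method,
//                            address   entry_point,
//                            intptr_t* parameters,
//                            int       size_of_parameters,
//                            TRAPS);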
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => we need to set up
// rsp ourselves.
//
// rax: exception oop
address generate_catch_exception();
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// rax: exception
// rdx: throwing pc
//
// NOTE: On entry to this stub, the exception PC must be on the stack!
address generate_forward_exception();
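// A hedged sketch of a typical call site in other stubs (the exact code
// varies): after returning from a runtime call, a pending exception is
// redirected here with something like
//
//   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
//   __ jump_cc(Assembler::notEqual,
//              RuntimeAddress(StubRoutines::forward_exception_entry()));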
// Support for intptr_t OrderAccess::fence()
address generate_orderaccess_fence();
// Support for intptr_t get_previous_sp()
//
// This routine is used to find the previous stack pointer for the
// caller.
address generate_get_previous_sp();
//----------------------------------------------------------------------------------------------------
// Support for void verify_mxcsr()
//
// This routine is used with -Xcheck:jni to verify that native
// JNI code does not return to Java code without restoring the
// MXCSR register to our expected state.
address generate_verify_mxcsr();
address generate_f2i_fixup();
address generate_f2l_fixup();
address generate_d2i_fixup();
address generate_d2l_fixup();
address generate_count_leading_zeros_lut(const char *stub_name);
address generate_popcount_avx_lut(const char *stub_name);
address generate_iota_indices(const char *stub_name);
address generate_vector_reverse_bit_lut(const char *stub_name);
address generate_vector_reverse_byte_perm_mask_long(const char *stub_name);
address generate_vector_reverse_byte_perm_mask_int(const char *stub_name);
address generate_vector_reverse_byte_perm_mask_short(const char *stub_name);
address generate_vector_byte_shuffle_mask(const char *stub_name);
address generate_fp_mask(const char *stub_name, int64_t mask);
address generate_vector_mask(const char *stub_name, int64_t mask);
address generate_vector_byte_perm_mask(const char *stub_name);
address generate_vector_fp_mask(const char *stub_name, int64_t mask);
address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
int32_t val0, int32_t val1, int32_t val2, int32_t val3,
int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0);
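// A hypothetical example (values illustrative only): an eight-lane iota
// constant could be materialized with
//
//   generate_vector_custom_i32("iota_8", Assembler::AVX_256bit,
//                              0, 1, 2, 3, 4, 5, 6, 7);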
// Non-destructive plausibility checks for oops
address generate_verify_oop();
// Verify that a register contains a clean 32-bit positive value
// (high 32 bits are 0) so it can be used in 64-bit shifts.
void assert_clean_int(Register Rint, Register Rtmp);
// Generate overlap test for array copy stubs
void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf);
void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
  assert(no_overlap_target != NULL, "must be generated");
  array_overlap_test(no_overlap_target, NULL, sf);
}
void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
  array_overlap_test(NULL, &L_no_overlap, sf);
}
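// Example (sketch): a conjoint short/char copy stub would dispatch with
//   array_overlap_test(nooverlap_target, Address::times_2);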
// Shuffle first three arg regs on Windows into Linux/Solaris locations.
void setup_arg_regs(int nargs = 3);
void restore_arg_regs();
#ifdef ASSERT
bool _regs_in_thread;
#endif
// This is used in places where r10 is a scratch register, and can
// be adapted if r9 is needed also.
void setup_arg_regs_using_thread();
void restore_arg_regs_using_thread();
// Copy big chunks forward
void copy_bytes_forward(Register end_from, Register end_to,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes);
// Copy big chunks backward
void copy_bytes_backward(Register from, Register dest,
Register qword_count, Register to,
Label& L_copy_bytes, Label& L_copy_8_bytes);
void setup_argument_regs(BasicType type);
void restore_argument_regs(BasicType type);
#if COMPILER2_OR_JVMCI
// The following rules apply to the AVX3-optimized arraycopy stubs:
// - If the target supports the AVX3 features (BW+VL+F), then the implementation
//   uses 32-byte vectors (YMMs) for both the special cases (various small block
//   sizes) and the aligned copy loop. This is the default configuration.
// - If the copy length is above AVX3Threshold, then the implementation uses
//   64-byte vectors (ZMMs) for the main copy loop (and the subsequent tail),
//   since the bulk of the cycles will be consumed there.
// - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
//   better performance for disjoint copies; for conjoint/backward copies,
//   vector-based copy performs better.
// - If the user sets AVX3Threshold=0, then the special cases for small block
//   sizes also operate on 64-byte vector registers (ZMMs).
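//
// In pseudo-code, the resulting dispatch is roughly (a sketch, not the
// literal control flow):
//   if (copy_length <= AVX3Threshold) { /* 32-byte (YMM) special cases + copy loop */ }
//   else                              { /* 64-byte (ZMM) main loop + tail */ }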
address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
bool aligned, bool is_oop, bool dest_uninitialized);
address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
address nooverlap_target, bool aligned, bool is_oop,
bool dest_uninitialized);
#endif // COMPILER2_OR_JVMCI
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name);
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
address* entry, const char *name);
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name);
address generate_fill(BasicType t, bool aligned, const char *name);
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
address *entry, const char *name);
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
const char *name, bool dest_uninitialized = false);
address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
bool dest_uninitialized = false);
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
const char *name, bool dest_uninitialized = false);
address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
address nooverlap_target, address *entry,
const char *name, bool dest_uninitialized = false);
// Helper for generating a dynamic type check.
// Smashes no registers.
void generate_type_check(Register sub_klass,
Register super_check_offset,
Register super_klass,
Label& L_success);
// Generate checkcasting array copy stub
address generate_checkcast_copy(const char *name, address *entry,
bool dest_uninitialized = false);
// Generate 'unsafe' array copy stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
address generate_unsafe_copy(const char *name,
address byte_copy_entry, address short_copy_entry,
address int_copy_entry, address long_copy_entry);
// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
Register src_pos, // source position (c_rarg1)
Register dst, // destination array oop (c_rarg2)
Register dst_pos, // destination position (c_rarg3)
Register length,
Register temp,
Label& L_failed);
// Generate generic array copy stubs
address generate_generic_copy(const char *name,
address byte_copy_entry, address short_copy_entry,
address int_copy_entry, address oop_copy_entry,
address long_copy_entry, address checkcast_copy_entry);
address generate_data_cache_writeback();
address generate_data_cache_writeback_sync();
void generate_arraycopy_stubs();
// AES intrinsic stubs
enum {
  AESBlockSize = 16
};
address generate_key_shuffle_mask();
address generate_counter_shuffle_mask();
// Utility routine for loading a 128-bit key word in little-endian format;
// the caller can optionally indicate that the shuffle mask is already in an XMM register.
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg);
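// Example use (sketch; register names illustrative): loading round key 1
// from offset 0x10 of the expanded key array:
//   load_key(xmm1, key, 0x10, xmm_key_shuf_mask);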
// Utility routine for increasing the 128-bit counter (the IV in CTR mode)
void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);
address generate_aescrypt_encryptBlock();
address generate_aescrypt_decryptBlock();
address generate_cipherBlockChaining_encryptAESCrypt();
// A version of CBC/AES Decrypt which does 4 blocks in a loop at a time
// to hide instruction latency
address generate_cipherBlockChaining_decryptAESCrypt_Parallel();
address generate_electronicCodeBook_encryptAESCrypt();
address generate_electronicCodeBook_decryptAESCrypt();
// ofs and limit are used for multi-block byte arrays.
// int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
address generate_md5_implCompress(bool multi_block, const char *name);
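// Both single-block and multi-block entry points come from this one
// generator; registration follows this pattern (a sketch of the .cpp side):
//   StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
//   StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");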
address generate_upper_word_mask();
address generate_shuffle_byte_flip_mask();
// ofs and limit are used for multi-block byte arrays.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
address generate_sha1_implCompress(bool multi_block, const char *name);
address generate_pshuffle_byte_flip_mask();
// Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
address generate_pshuffle_byte_flip_mask_sha512();
// ofs and limit are used for multi-block byte arrays.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
address generate_sha256_implCompress(bool multi_block, const char *name);
address generate_sha512_implCompress(bool multi_block, const char *name);
address ghash_polynomial512_addr();
// Vector AES Galois Counter Mode implementation
address generate_galoisCounterMode_AESCrypt();
// This mask is used for incrementing the counter value (linc0, linc4, etc.)
address counter_mask_addr();
// Vector AES Counter implementation
address generate_counterMode_VectorAESCrypt();
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
// to hide instruction latency
address generate_counterMode_AESCrypt_Parallel();
void roundDec(XMMRegister xmm_reg);
void roundDeclast(XMMRegister xmm_reg);
void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg);
address generate_cipherBlockChaining_decryptVectorAESCrypt();
// Polynomial x^128+x^127+x^126+x^121+1
address ghash_polynomial_addr();
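// (This is the GCM/GHASH reduction polynomial of NIST SP 800-38D; note that
// the exponents are the bit-reflected form of x^128+x^7+x^2+x+1.)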
address ghash_shufflemask_addr();
// GHASH single- and multi-block operations using AVX instructions
address generate_avx_ghash_processBlocks();
// byte swap x86 long
address generate_ghash_long_swap_mask();
// byte swap x86 byte array
address generate_ghash_byte_swap_mask();
// Single- and multi-block GHASH operations
address generate_ghash_processBlocks();
address base64_shuffle_addr();
address base64_avx2_shuffle_addr();
address base64_avx2_input_mask_addr();
address base64_avx2_lut_addr();
address base64_encoding_table_addr();
// Code for generating Base64 encoding.
// Intrinsic function prototype in Base64.java:
// private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
address generate_base64_encodeBlock();
// base64 AVX512vbmi tables
address base64_vbmi_lookup_lo_addr();
address base64_vbmi_lookup_hi_addr();
address base64_vbmi_lookup_lo_url_addr();
address base64_vbmi_lookup_hi_url_addr();
address base64_vbmi_pack_vec_addr();
address base64_vbmi_join_0_1_addr();
address base64_vbmi_join_1_2_addr();
address base64_vbmi_join_2_3_addr();
address base64_decoding_table_addr();
// Code for generating Base64 decoding.
//
// Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
//
// Intrinsic function prototype in Base64.java:
// private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME);
address generate_base64_decodeBlock();
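// Registration sketch (see the corresponding .cpp):
//   StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();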
address generate_updateBytesCRC32();
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported);
address generate_updateBytesAdler32();
address generate_multiplyToLen();
address generate_vectorizedMismatch();
address generate_squareToLen();
address generate_method_entry_barrier();
address generate_mulAdd();
address generate_bigIntegerRightShift();
address generate_bigIntegerLeftShift();
address generate_libmExp();
address generate_libmLog();
address generate_libmLog10();
address generate_libmPow();
address generate_libmSin();
address generate_libmCos();
address generate_libmTan();
address generate_cont_thaw(const char* label, Continuation::thaw_kind kind);
address generate_cont_thaw();
// TODO: will probably need multiple return barriers depending on return type
address generate_cont_returnBarrier();
address generate_cont_returnBarrier_exception();
#if INCLUDE_JFR
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* generate_jfr_write_checkpoint();
#endif // INCLUDE_JFR
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs. If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
address generate_throw_exception(const char* name,
address runtime_entry,
Register arg1 = noreg,
Register arg2 = noreg);
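// Example registration (a sketch of the .cpp side):
//   StubRoutines::_throw_AbstractMethodError_entry =
//       generate_throw_exception("AbstractMethodError throw_exception",
//                                CAST_FROM_FN_PTR(address,
//                                                 SharedRuntime::throw_AbstractMethodError));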
void create_control_words();
// Initialization
void generate_initial();
void generate_phase1();
void generate_all();
public:
StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
  DEBUG_ONLY( _regs_in_thread = false; )
  if (phase == 0) {
    generate_initial();
  } else if (phase == 1) {
    generate_phase1(); // stubs that must be available for the interpreter
  } else {
    generate_all();
  }
}
};
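// The platform entry point (defined in the corresponding .cpp file) simply
// instantiates this class; a sketch, assuming the existing
// StubGenerator_generate hook:
//
//   void StubGenerator_generate(CodeBuffer* code, int phase) {
//     StubGenerator g(code, phase);
//   }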
#endif // CPU_X86_STUBGENERATOR_X86_64_HPP