From 802664728f06384bedfe8e7761180d8496e9cee5 Mon Sep 17 00:00:00 2001 From: Kim Barrett Date: Tue, 8 Sep 2015 16:00:34 -0400 Subject: [PATCH 01/20] 8134797: Remove explicit casts in CollectorPolicy hierarchy Removed the explicit casts. Reviewed-by: jwilhelm, tschatzl, pliden --- .../gc/cms/concurrentMarkSweepGeneration.cpp | 3 +-- .../src/share/vm/gc/g1/g1CollectedHeap.cpp | 4 ++++ .../src/share/vm/gc/g1/g1CollectedHeap.hpp | 2 +- .../vm/gc/parallel/parallelScavengeHeap.hpp | 2 +- .../share/vm/gc/serial/defNewGeneration.cpp | 2 +- .../share/vm/gc/serial/tenuredGeneration.cpp | 3 +-- .../share/vm/gc/shared/genCollectedHeap.cpp | 19 +++++++++---------- .../share/vm/gc/shared/genCollectedHeap.hpp | 2 +- 8 files changed, 19 insertions(+), 18 deletions(-) diff --git a/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp b/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp index c1a4cdcc732..9806a5e59c5 100644 --- a/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp +++ b/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp @@ -304,8 +304,7 @@ AdaptiveSizePolicy* CMSCollector::size_policy() { void ConcurrentMarkSweepGeneration::initialize_performance_counters() { const char* gen_name = "old"; - GenCollectorPolicy* gcp = (GenCollectorPolicy*) GenCollectedHeap::heap()->collector_policy(); - + GenCollectorPolicy* gcp = GenCollectedHeap::heap()->gen_policy(); // Generation Counters - generation 1, 1 subspace _gen_counters = new GenerationCounters(gen_name, 1, 1, gcp->min_old_size(), gcp->max_old_size(), &_virtual_space); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp index 9deb40f90c6..ffa778af3ba 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp @@ -2397,6 +2397,10 @@ void G1CollectedHeap::ref_processing_init() { // (for efficiency/performance) } +CollectorPolicy* G1CollectedHeap::collector_policy() const { + 
return g1_policy(); +} + size_t G1CollectedHeap::capacity() const { return _hrm.length() * HeapRegion::GrainBytes; } diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp index 58231d29a68..c554d7582ac 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp @@ -1057,7 +1057,7 @@ public: // The current policy object for the collector. G1CollectorPolicy* g1_policy() const { return _g1_policy; } - virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) g1_policy(); } + virtual CollectorPolicy* collector_policy() const; // Adaptive size policy. No such thing for g1. virtual AdaptiveSizePolicy* size_policy() { return NULL; } diff --git a/hotspot/src/share/vm/gc/parallel/parallelScavengeHeap.hpp b/hotspot/src/share/vm/gc/parallel/parallelScavengeHeap.hpp index b8e4879a801..86953f7572d 100644 --- a/hotspot/src/share/vm/gc/parallel/parallelScavengeHeap.hpp +++ b/hotspot/src/share/vm/gc/parallel/parallelScavengeHeap.hpp @@ -87,7 +87,7 @@ class ParallelScavengeHeap : public CollectedHeap { return CollectedHeap::ParallelScavengeHeap; } - virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) _collector_policy; } + virtual CollectorPolicy* collector_policy() const { return _collector_policy; } static PSYoungGen* young_gen() { return _young_gen; } static PSOldGen* old_gen() { return _old_gen; } diff --git a/hotspot/src/share/vm/gc/serial/defNewGeneration.cpp b/hotspot/src/share/vm/gc/serial/defNewGeneration.cpp index f43f4ad5440..8744f74db88 100644 --- a/hotspot/src/share/vm/gc/serial/defNewGeneration.cpp +++ b/hotspot/src/share/vm/gc/serial/defNewGeneration.cpp @@ -213,7 +213,7 @@ DefNewGeneration::DefNewGeneration(ReservedSpace rs, _max_eden_size = size - (2*_max_survivor_size); // allocate the performance counters - GenCollectorPolicy* gcp = (GenCollectorPolicy*)gch->collector_policy(); + GenCollectorPolicy* gcp = 
gch->gen_policy(); // Generation counters -- generation 0, 3 subspaces _gen_counters = new GenerationCounters("new", 0, 3, diff --git a/hotspot/src/share/vm/gc/serial/tenuredGeneration.cpp b/hotspot/src/share/vm/gc/serial/tenuredGeneration.cpp index 4923022f20b..7f88c6c361e 100644 --- a/hotspot/src/share/vm/gc/serial/tenuredGeneration.cpp +++ b/hotspot/src/share/vm/gc/serial/tenuredGeneration.cpp @@ -57,8 +57,7 @@ TenuredGeneration::TenuredGeneration(ReservedSpace rs, // initialize performance counters const char* gen_name = "old"; - GenCollectorPolicy* gcp = (GenCollectorPolicy*) GenCollectedHeap::heap()->collector_policy(); - + GenCollectorPolicy* gcp = GenCollectedHeap::heap()->gen_policy(); // Generation Counters -- generation 1, 1 subspace _gen_counters = new GenerationCounters(gen_name, 1, 1, gcp->min_old_size(), gcp->max_old_size(), &_virtual_space); diff --git a/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp b/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp index e6649324282..6ceed796f6f 100644 --- a/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp @@ -172,8 +172,6 @@ char* GenCollectedHeap::allocate(size_t alignment, void GenCollectedHeap::post_initialize() { CollectedHeap::post_initialize(); ref_processing_init(); - GenCollectorPolicy *policy = (GenCollectorPolicy *)collector_policy(); - guarantee(policy->is_generation_policy(), "Illegal policy type"); assert((_young_gen->kind() == Generation::DefNew) || (_young_gen->kind() == Generation::ParNew), "Wrong youngest generation type"); @@ -183,10 +181,10 @@ void GenCollectedHeap::post_initialize() { _old_gen->kind() == Generation::MarkSweepCompact, "Wrong generation kind"); - policy->initialize_size_policy(def_new_gen->eden()->capacity(), - _old_gen->capacity(), - def_new_gen->from()->capacity()); - policy->initialize_gc_policy_counters(); + _gen_policy->initialize_size_policy(def_new_gen->eden()->capacity(), + _old_gen->capacity(), + 
def_new_gen->from()->capacity()); + _gen_policy->initialize_gc_policy_counters(); } void GenCollectedHeap::ref_processing_init() { @@ -822,10 +820,11 @@ bool GenCollectedHeap::create_cms_collector() { "Unexpected generation kinds"); // Skip two header words in the block content verification NOT_PRODUCT(_skip_header_HeapWords = CMSCollector::skip_header_HeapWords();) - CMSCollector* collector = new CMSCollector( - (ConcurrentMarkSweepGeneration*)_old_gen, - _rem_set->as_CardTableRS(), - (ConcurrentMarkSweepPolicy*) collector_policy()); + assert(_gen_policy->is_concurrent_mark_sweep_policy(), "Unexpected policy type"); + CMSCollector* collector = + new CMSCollector((ConcurrentMarkSweepGeneration*)_old_gen, + _rem_set->as_CardTableRS(), + _gen_policy->as_concurrent_mark_sweep_policy()); if (collector == NULL || !collector->completed_initialization()) { if (collector) { diff --git a/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp b/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp index bcef2c9c0fe..ac290c9332d 100644 --- a/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp +++ b/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp @@ -153,7 +153,7 @@ public: // The generational collector policy. 
GenCollectorPolicy* gen_policy() const { return _gen_policy; } - virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) gen_policy(); } + virtual CollectorPolicy* collector_policy() const { return gen_policy(); } // Adaptive size policy virtual AdaptiveSizePolicy* size_policy() { From 2dbd4dd578d6e21b772ac9994be5cd4c7c6bd515 Mon Sep 17 00:00:00 2001 From: Mikael Gerdin Date: Wed, 9 Sep 2015 10:34:22 +0200 Subject: [PATCH 02/20] 8135152: Create a G1ParScanThreadStateSet class for managing G1 GC per thread states Reviewed-by: tschatzl, ehelin --- .../src/share/vm/gc/g1/g1CollectedHeap.cpp | 70 ++++++++----------- .../src/share/vm/gc/g1/g1CollectedHeap.hpp | 11 ++- .../share/vm/gc/g1/g1CollectedHeap_ext.cpp | 4 -- .../src/share/vm/gc/g1/g1CollectorPolicy.hpp | 4 +- .../share/vm/gc/g1/g1ParScanThreadState.cpp | 28 +++++++- .../share/vm/gc/g1/g1ParScanThreadState.hpp | 32 +++++++++ .../vm/gc/g1/g1ParScanThreadState_ext.cpp | 31 ++++++++ hotspot/src/share/vm/gc/shared/ageTable.cpp | 7 -- hotspot/src/share/vm/gc/shared/ageTable.hpp | 1 - 9 files changed, 127 insertions(+), 61 deletions(-) create mode 100644 hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp index ffa778af3ba..b3d29bdc53f 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp @@ -4164,8 +4164,9 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { // Initialize the GC alloc regions. _allocator->init_gc_alloc_regions(evacuation_info); + G1ParScanThreadStateSet per_thread_states(this, workers()->active_workers()); // Actually do the work... 
- evacuate_collection_set(evacuation_info); + evacuate_collection_set(evacuation_info, &per_thread_states); free_collection_set(g1_policy()->collection_set(), evacuation_info); @@ -4545,15 +4546,15 @@ class G1KlassScanClosure : public KlassClosure { class G1ParTask : public AbstractGangTask { protected: - G1CollectedHeap* _g1h; - G1ParScanThreadState** _pss; - RefToScanQueueSet* _queues; - G1RootProcessor* _root_processor; - ParallelTaskTerminator _terminator; - uint _n_workers; + G1CollectedHeap* _g1h; + G1ParScanThreadStateSet* _pss; + RefToScanQueueSet* _queues; + G1RootProcessor* _root_processor; + ParallelTaskTerminator _terminator; + uint _n_workers; public: - G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers) + G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers) : AbstractGangTask("G1 collection"), _g1h(g1h), _pss(per_thread_states), @@ -4611,7 +4612,7 @@ public: ReferenceProcessor* rp = _g1h->ref_processor_stw(); - G1ParScanThreadState* pss = _pss[worker_id]; + G1ParScanThreadState* pss = _pss->state_for_worker(worker_id); pss->set_ref_processor(rp); bool only_young = _g1h->collector_state()->gcs_are_young(); @@ -5267,15 +5268,15 @@ public: class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor { private: - G1CollectedHeap* _g1h; - G1ParScanThreadState** _pss; - RefToScanQueueSet* _queues; - WorkGang* _workers; - uint _active_workers; + G1CollectedHeap* _g1h; + G1ParScanThreadStateSet* _pss; + RefToScanQueueSet* _queues; + WorkGang* _workers; + uint _active_workers; public: G1STWRefProcTaskExecutor(G1CollectedHeap* g1h, - G1ParScanThreadState** per_thread_states, + G1ParScanThreadStateSet* per_thread_states, WorkGang* workers, RefToScanQueueSet *task_queues, uint n_workers) : @@ -5299,14 +5300,14 @@ class G1STWRefProcTaskProxy: public 
AbstractGangTask { typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; ProcessTask& _proc_task; G1CollectedHeap* _g1h; - G1ParScanThreadState** _pss; + G1ParScanThreadStateSet* _pss; RefToScanQueueSet* _task_queues; ParallelTaskTerminator* _terminator; public: G1STWRefProcTaskProxy(ProcessTask& proc_task, G1CollectedHeap* g1h, - G1ParScanThreadState** per_thread_states, + G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, ParallelTaskTerminator* terminator) : AbstractGangTask("Process reference objects in parallel"), @@ -5324,7 +5325,7 @@ public: G1STWIsAliveClosure is_alive(_g1h); - G1ParScanThreadState* pss = _pss[worker_id]; + G1ParScanThreadState* pss = _pss->state_for_worker(worker_id); pss->set_ref_processor(NULL); G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, pss); @@ -5403,14 +5404,14 @@ void G1STWRefProcTaskExecutor::execute(EnqueueTask& enq_task) { class G1ParPreserveCMReferentsTask: public AbstractGangTask { protected: - G1CollectedHeap* _g1h; - G1ParScanThreadState** _pss; - RefToScanQueueSet* _queues; - ParallelTaskTerminator _terminator; - uint _n_workers; + G1CollectedHeap* _g1h; + G1ParScanThreadStateSet* _pss; + RefToScanQueueSet* _queues; + ParallelTaskTerminator _terminator; + uint _n_workers; public: - G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, int workers, RefToScanQueueSet *task_queues) : + G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, int workers, RefToScanQueueSet *task_queues) : AbstractGangTask("ParPreserveCMReferents"), _g1h(g1h), _pss(per_thread_states), @@ -5423,7 +5424,7 @@ public: ResourceMark rm; HandleMark hm; - G1ParScanThreadState* pss = _pss[worker_id]; + G1ParScanThreadState* pss = _pss->state_for_worker(worker_id); pss->set_ref_processor(NULL); assert(pss->queue_is_empty(), "both queue and overflow should be empty"); @@ -5484,7 +5485,7 @@ public: }; // Weak Reference processing during an 
evacuation pause (part 1). -void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_thread_states) { +void G1CollectedHeap::process_discovered_references(G1ParScanThreadStateSet* per_thread_states) { double ref_proc_start = os::elapsedTime(); ReferenceProcessor* rp = _ref_processor_stw; @@ -5529,7 +5530,7 @@ void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_t // JNI refs. // Use only a single queue for this PSS. - G1ParScanThreadState* pss = per_thread_states[0]; + G1ParScanThreadState* pss = per_thread_states->state_for_worker(0); pss->set_ref_processor(NULL); assert(pss->queue_is_empty(), "pre-condition"); @@ -5590,7 +5591,7 @@ void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_t } // Weak Reference processing during an evacuation pause (part 2). -void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_thread_states) { +void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadStateSet* per_thread_states) { double ref_enq_start = os::elapsedTime(); ReferenceProcessor* rp = _ref_processor_stw; @@ -5625,7 +5626,7 @@ void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_t g1_policy()->phase_times()->record_ref_enq_time(ref_enq_time * 1000.0); } -void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) { +void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states) { _expand_heap_after_alloc_failure = true; _evacuation_failed = false; @@ -5645,11 +5646,6 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) { double start_par_time_sec = os::elapsedTime(); double end_par_time_sec; - G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC); - for (uint i = 0; i < n_workers; i++) { - per_thread_states[i] = new_par_scan_state(i); - } - { G1RootProcessor root_processor(this, 
n_workers); G1ParTask g1_par_task(this, per_thread_states, _task_queues, &root_processor, n_workers); @@ -5703,11 +5699,7 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) { _allocator->release_gc_alloc_regions(evacuation_info); g1_rem_set()->cleanup_after_oops_into_collection_set_do(); - for (uint i = 0; i < n_workers; i++) { - G1ParScanThreadState* pss = per_thread_states[i]; - delete pss; - } - FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states); + per_thread_states->flush(); record_obj_copy_mem_stats(); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp index c554d7582ac..fcdc7146fb9 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp @@ -56,6 +56,7 @@ class HRRSCleanupTask; class GenerationSpec; class OopsInHeapRegionClosure; class G1ParScanThreadState; +class G1ParScanThreadStateSet; class G1KlassScanClosure; class G1ParScanThreadState; class ObjectClosure; @@ -192,6 +193,7 @@ class G1CollectedHeap : public CollectedHeap { // Closures used in implementation. friend class G1ParScanThreadState; + friend class G1ParScanThreadStateSet; friend class G1ParTask; friend class G1PLABAllocator; friend class G1PrepareCompactClosure; @@ -584,11 +586,11 @@ protected: // Process any reference objects discovered during // an incremental evacuation pause. - void process_discovered_references(G1ParScanThreadState** per_thread_states); + void process_discovered_references(G1ParScanThreadStateSet* per_thread_states); // Enqueue any remaining discovered references // after processing. - void enqueue_discovered_references(G1ParScanThreadState** per_thread_states); + void enqueue_discovered_references(G1ParScanThreadStateSet* per_thread_states); public: WorkGang* workers() const { return _workers; } @@ -683,9 +685,6 @@ public: // Allocates a new heap region instance. 
HeapRegion* new_heap_region(uint hrs_index, MemRegion mr); - // Allocates a new per thread par scan state for the given thread id. - G1ParScanThreadState* new_par_scan_state(uint worker_id); - // Allocate the highest free region in the reserved heap. This will commit // regions as necessary. HeapRegion* alloc_highest_free_region(); @@ -799,7 +798,7 @@ protected: bool do_collection_pause_at_safepoint(double target_pause_time_ms); // Actually do the work of evacuating the collection set. - void evacuate_collection_set(EvacuationInfo& evacuation_info); + void evacuate_collection_set(EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states); // Print the header for the per-thread termination statistics. static void print_termination_stats_hdr(outputStream* const st); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp index f05668e685a..0c04c106aad 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp @@ -38,7 +38,3 @@ HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index, MemRegion mr) { return new HeapRegion(hrs_index, bot_shared(), mr); } - -G1ParScanThreadState* G1CollectedHeap::new_par_scan_state(uint worker_id) { - return new G1ParScanThreadState(this, worker_id); -} diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp index 1ed47e6d6e2..27a7779b63c 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp @@ -865,8 +865,8 @@ public: return _recorded_survivor_regions; } - void record_thread_age_table(ageTable* age_table) { - _survivors_age_table.merge_par(age_table); + void record_age_table(ageTable* age_table) { + _survivors_age_table.merge(age_table); } void update_max_gc_locker_expansion(); diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp 
b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp index ef87c3666a5..660cfa76289 100644 --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp @@ -71,11 +71,16 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) _dest[InCSetState::Old] = InCSetState::Old; } -G1ParScanThreadState::~G1ParScanThreadState() { +// Pass locally gathered statistics to global state. +void G1ParScanThreadState::flush() { + _dcq.flush(); // Update allocation statistics. _plab_allocator->flush_and_retire_stats(); + _g1h->g1_policy()->record_age_table(&_age_table); +} + +G1ParScanThreadState::~G1ParScanThreadState() { delete _plab_allocator; - _g1h->g1_policy()->record_thread_age_table(&_age_table); // Update heap statistics. _g1h->update_surviving_young_words(_surviving_young_words); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base); @@ -314,6 +319,25 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state, } } +G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id) { + assert(worker_id < _n_workers, "out of bounds access"); + return _states[worker_id]; +} + +void G1ParScanThreadStateSet::flush() { + assert(!_flushed, "thread local state from the per thread states should be flushed once"); + + for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) { + G1ParScanThreadState* pss = _states[worker_index]; + + pss->flush(); + + delete pss; + _states[worker_index] = NULL; + } + _flushed = true; +} + oop G1ParScanThreadState::handle_evacuation_failure_par(oop old, markOop m) { assert(_g1h->obj_in_cs(old), err_msg("Object " PTR_FORMAT " should be in the CSet", p2i(old))); diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp index ee97257a516..31f4e1eb793 100644 --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp 
@@ -121,6 +121,8 @@ class G1ParScanThreadState : public CHeapObj { return _surviving_young_words + 1; } + void flush(); + private: #define G1_PARTIAL_ARRAY_MASK 0x2 @@ -189,4 +191,34 @@ class G1ParScanThreadState : public CHeapObj { oop handle_evacuation_failure_par(oop obj, markOop m); }; +class G1ParScanThreadStateSet : public StackObj { + G1CollectedHeap* _g1h; + G1ParScanThreadState** _states; + uint _n_workers; + bool _flushed; + + public: + G1ParScanThreadStateSet(G1CollectedHeap* g1h, uint n_workers) : + _g1h(g1h), + _states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC)), + _n_workers(n_workers), + _flushed(false) { + for (uint i = 0; i < n_workers; ++i) { + _states[i] = new_par_scan_state(i); + } + } + + ~G1ParScanThreadStateSet() { + assert(_flushed, "thread local state from the per thread states should have been flushed"); + FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states); + } + + void flush(); + + G1ParScanThreadState* state_for_worker(uint worker_id); + + private: + G1ParScanThreadState* new_par_scan_state(uint worker_id); +}; + #endif // SHARE_VM_GC_G1_G1PARSCANTHREADSTATE_HPP diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp new file mode 100644 index 00000000000..b63c07c260b --- /dev/null +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" + +#include "gc/g1/g1ParScanThreadState.hpp" + +G1ParScanThreadState* G1ParScanThreadStateSet::new_par_scan_state(uint worker_id) { + return new G1ParScanThreadState(_g1h, worker_id); +} diff --git a/hotspot/src/share/vm/gc/shared/ageTable.cpp b/hotspot/src/share/vm/gc/shared/ageTable.cpp index 0a6c189f94f..cfa2a648323 100644 --- a/hotspot/src/share/vm/gc/shared/ageTable.cpp +++ b/hotspot/src/share/vm/gc/shared/ageTable.cpp @@ -28,7 +28,6 @@ #include "gc/shared/collectorPolicy.hpp" #include "gc/shared/gcPolicyCounters.hpp" #include "memory/resourceArea.hpp" -#include "runtime/atomic.inline.hpp" #include "utilities/copy.hpp" /* Copyright (c) 1992, 2015, Oracle and/or its affiliates, and Stanford University. 
@@ -73,12 +72,6 @@ void ageTable::merge(ageTable* subTable) { } } -void ageTable::merge_par(ageTable* subTable) { - for (int i = 0; i < table_size; i++) { - Atomic::add_ptr(subTable->sizes[i], &sizes[i]); - } -} - uint ageTable::compute_tenuring_threshold(size_t survivor_capacity, GCPolicyCounters* gc_counters) { size_t desired_survivor_size = (size_t)((((double) survivor_capacity)*TargetSurvivorRatio)/100); uint result; diff --git a/hotspot/src/share/vm/gc/shared/ageTable.hpp b/hotspot/src/share/vm/gc/shared/ageTable.hpp index 2902822006b..588cd9e5c72 100644 --- a/hotspot/src/share/vm/gc/shared/ageTable.hpp +++ b/hotspot/src/share/vm/gc/shared/ageTable.hpp @@ -68,7 +68,6 @@ class ageTable VALUE_OBJ_CLASS_SPEC { // Merge another age table with the current one. Used // for parallel young generation gc. void merge(ageTable* subTable); - void merge_par(ageTable* subTable); // calculate new tenuring threshold based on age information uint compute_tenuring_threshold(size_t survivor_capacity, GCPolicyCounters* gc_counters); From 5ee47e4f95553f98619974122d670d4d7da6cd89 Mon Sep 17 00:00:00 2001 From: Kirill Zhaldybin Date: Wed, 9 Sep 2015 15:14:05 +0300 Subject: [PATCH 03/20] 8134523: Humongous object test fails with OOME Added Xms for runs with region' size 16M and 32M to prevent OOME Reviewed-by: mgerdin, dfazunen --- .../test/gc/g1/humongousObjects/TestHumongousThreshold.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hotspot/test/gc/g1/humongousObjects/TestHumongousThreshold.java b/hotspot/test/gc/g1/humongousObjects/TestHumongousThreshold.java index 5ce300c962d..ac632e53fe6 100644 --- a/hotspot/test/gc/g1/humongousObjects/TestHumongousThreshold.java +++ b/hotspot/test/gc/g1/humongousObjects/TestHumongousThreshold.java @@ -56,11 +56,11 @@ import sun.hotspot.WhiteBox; * gc.g1.humongousObjects.TestHumongousThreshold * * @run main/othervm -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. 
- * -XX:G1HeapRegionSize=16M + * -Xms128M -XX:G1HeapRegionSize=16M * gc.g1.humongousObjects.TestHumongousThreshold * * @run main/othervm -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:. - * -XX:G1HeapRegionSize=32M + * -Xms200M -XX:G1HeapRegionSize=32M * gc.g1.humongousObjects.TestHumongousThreshold * */ From 6f11efbbb4471df7d6ba79e9956c8cbb6ae37a62 Mon Sep 17 00:00:00 2001 From: Mikael Gerdin Date: Wed, 9 Sep 2015 14:22:45 +0200 Subject: [PATCH 04/20] 8135154: Move cards scanned and surviving young words aggregation to G1ParScanThreadStateSet Reviewed-by: tschatzl, ehelin --- .../src/share/vm/gc/g1/g1CollectedHeap.cpp | 60 ++++--------------- .../src/share/vm/gc/g1/g1CollectedHeap.hpp | 9 +-- .../src/share/vm/gc/g1/g1CollectorPolicy.cpp | 4 +- .../src/share/vm/gc/g1/g1CollectorPolicy.hpp | 2 +- .../share/vm/gc/g1/g1ParScanThreadState.cpp | 36 ++++++++--- .../share/vm/gc/g1/g1ParScanThreadState.hpp | 24 ++++++-- .../vm/gc/g1/g1ParScanThreadState_ext.cpp | 4 +- hotspot/src/share/vm/gc/g1/g1RemSet.cpp | 35 ++++------- hotspot/src/share/vm/gc/g1/g1RemSet.hpp | 19 +++--- 9 files changed, 84 insertions(+), 109 deletions(-) diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp index b3d29bdc53f..648605e1901 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp @@ -2025,7 +2025,6 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) : _survivor_evac_stats(YoungPLABSize, PLABWeight), _old_evac_stats(OldPLABSize, PLABWeight), _expand_heap_after_alloc_failure(true), - _surviving_young_words(NULL), _old_marking_cycles_started(0), _old_marking_cycles_completed(0), _heap_summary_sent(false), @@ -3698,10 +3697,6 @@ size_t G1CollectedHeap::pending_card_num() { return (buffer_size * buffer_num + extra_cards) / oopSize; } -size_t G1CollectedHeap::cards_scanned() { - return g1_rem_set()->cardsScanned(); -} - class 
RegisterHumongousWithInCSetFastTestClosure : public HeapRegionClosure { private: size_t _total_humongous; @@ -3842,36 +3837,6 @@ void G1CollectedHeap::register_humongous_regions_with_cset() { cl.flush_rem_set_entries(); } -void G1CollectedHeap::setup_surviving_young_words() { - assert(_surviving_young_words == NULL, "pre-condition"); - uint array_length = g1_policy()->young_cset_region_length(); - _surviving_young_words = NEW_C_HEAP_ARRAY(size_t, (size_t) array_length, mtGC); - if (_surviving_young_words == NULL) { - vm_exit_out_of_memory(sizeof(size_t) * array_length, OOM_MALLOC_ERROR, - "Not enough space for young surv words summary."); - } - memset(_surviving_young_words, 0, (size_t) array_length * sizeof(size_t)); -#ifdef ASSERT - for (uint i = 0; i < array_length; ++i) { - assert( _surviving_young_words[i] == 0, "memset above" ); - } -#endif // !ASSERT -} - -void G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) { - assert_at_safepoint(true); - uint array_length = g1_policy()->young_cset_region_length(); - for (uint i = 0; i < array_length; ++i) { - _surviving_young_words[i] += surv_young_words[i]; - } -} - -void G1CollectedHeap::cleanup_surviving_young_words() { - guarantee( _surviving_young_words != NULL, "pre-condition" ); - FREE_C_HEAP_ARRAY(size_t, _surviving_young_words); - _surviving_young_words = NULL; -} - #ifdef ASSERT class VerifyCSetClosure: public HeapRegionClosure { public: @@ -4159,23 +4124,20 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { collection_set_iterate(&cl); #endif // ASSERT - setup_surviving_young_words(); - // Initialize the GC alloc regions. _allocator->init_gc_alloc_regions(evacuation_info); - G1ParScanThreadStateSet per_thread_states(this, workers()->active_workers()); + G1ParScanThreadStateSet per_thread_states(this, workers()->active_workers(), g1_policy()->young_cset_region_length()); // Actually do the work... 
evacuate_collection_set(evacuation_info, &per_thread_states); - free_collection_set(g1_policy()->collection_set(), evacuation_info); + const size_t* surviving_young_words = per_thread_states.surviving_young_words(); + free_collection_set(g1_policy()->collection_set(), evacuation_info, surviving_young_words); eagerly_reclaim_humongous_regions(); g1_policy()->clear_collection_set(); - cleanup_surviving_young_words(); - // Start a new incremental collection set for the next pause. g1_policy()->start_incremental_cset_building(); @@ -4260,7 +4222,8 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { // investigate this in CR 7178365. double sample_end_time_sec = os::elapsedTime(); double pause_time_ms = (sample_end_time_sec - sample_start_time_sec) * MILLIUNITS; - g1_policy()->record_collection_pause_end(pause_time_ms); + size_t total_cards_scanned = per_thread_states.total_cards_scanned(); + g1_policy()->record_collection_pause_end(pause_time_ms, total_cards_scanned); evacuation_info.set_collectionset_used_before(g1_policy()->collection_set_bytes_used_before()); evacuation_info.set_bytes_copied(g1_policy()->bytes_copied_during_gc()); @@ -4669,9 +4632,12 @@ public: worker_id); G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss); - _g1h->g1_rem_set()->oops_into_collection_set_do(&push_heap_rs_cl, - weak_root_cl, - worker_id); + size_t cards_scanned = _g1h->g1_rem_set()->oops_into_collection_set_do(&push_heap_rs_cl, + weak_root_cl, + worker_id); + + _pss->add_cards_scanned(worker_id, cards_scanned); + double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec; double term_sec = 0.0; @@ -6050,7 +6016,7 @@ void G1CollectedHeap::cleanUpCardTable() { g1_policy()->phase_times()->record_clear_ct_time(elapsed * 1000.0); } -void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info) { +void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info, const size_t* 
surviving_young_words) { size_t pre_used = 0; FreeRegionList local_free_list("Local List for CSet Freeing"); @@ -6104,7 +6070,7 @@ void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& e int index = cur->young_index_in_cset(); assert(index != -1, "invariant"); assert((uint) index < policy->young_cset_region_length(), "invariant"); - size_t words_survived = _surviving_young_words[index]; + size_t words_survived = surviving_young_words[index]; cur->record_surv_words_in_group(words_survived); // At this point the we have 'popped' cur from the collection set diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp index fcdc7146fb9..5f47e21d34f 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp @@ -311,14 +311,8 @@ private: volatile unsigned _gc_time_stamp; - size_t* _surviving_young_words; - G1HRPrinter _hr_printer; - void setup_surviving_young_words(); - void update_surviving_young_words(size_t* surv_young_words); - void cleanup_surviving_young_words(); - // It decides whether an explicit GC should start a concurrent cycle // instead of doing a STW GC. Currently, a concurrent cycle is // explicitly started if: @@ -832,7 +826,7 @@ protected: // After a collection pause, make the regions in the CS into free // regions. - void free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info); + void free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info, const size_t* surviving_young_words); // Abandon the current collection set without recording policy // statistics or updating free lists. 
@@ -1609,7 +1603,6 @@ public: public: size_t pending_card_num(); - size_t cards_scanned(); protected: size_t _max_heap_capacity; diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp index aa868350e33..5fafa9e8cbd 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp @@ -923,7 +923,7 @@ bool G1CollectorPolicy::need_to_start_conc_mark(const char* source, size_t alloc // Anything below that is considered to be zero #define MIN_TIMER_GRANULARITY 0.0000001 -void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms) { +void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms, size_t cards_scanned) { double end_time_sec = os::elapsedTime(); assert(_cur_collection_pause_used_regions_at_start >= cset_region_length(), "otherwise, the subtraction below does not make sense"); @@ -1052,8 +1052,6 @@ void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms) { _cost_per_card_ms_seq->add(cost_per_card_ms); } - size_t cards_scanned = _g1->cards_scanned(); - double cost_per_entry_ms = 0.0; if (cards_scanned > 10) { cost_per_entry_ms = phase_times()->average_time_ms(G1GCPhaseTimes::ScanRS) / (double) cards_scanned; diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp index 27a7779b63c..29608203116 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp @@ -634,7 +634,7 @@ public: // Record the start and end of an evacuation pause. void record_collection_pause_start(double start_time_sec); - void record_collection_pause_end(double pause_time_ms); + void record_collection_pause_end(double pause_time_ms, size_t cards_scanned); // Record the start and end of a full collection. 
void record_full_collection_start(); diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp index 660cfa76289..99cb4b0bc1f 100644 --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp @@ -32,7 +32,7 @@ #include "oops/oop.inline.hpp" #include "runtime/prefetch.inline.hpp" -G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) +G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length) : _g1h(g1h), _refs(g1h->task_queue(worker_id)), _dcq(&g1h->dirty_card_queue_set()), @@ -51,8 +51,8 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) // non-young regions (where the age is -1) // We also add a few elements at the beginning and at the end in // an attempt to eliminate cache contention - uint real_length = 1 + _g1h->g1_policy()->young_cset_region_length(); - uint array_length = PADDING_ELEM_NUM + + size_t real_length = 1 + young_cset_length; + size_t array_length = PADDING_ELEM_NUM + real_length + PADDING_ELEM_NUM; _surviving_young_words_base = NEW_C_HEAP_ARRAY(size_t, array_length, mtGC); @@ -60,7 +60,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) vm_exit_out_of_memory(array_length * sizeof(size_t), OOM_MALLOC_ERROR, "Not enough space for young surv histo."); _surviving_young_words = _surviving_young_words_base + PADDING_ELEM_NUM; - memset(_surviving_young_words, 0, (size_t) real_length * sizeof(size_t)); + memset(_surviving_young_words, 0, real_length * sizeof(size_t)); _plab_allocator = G1PLABAllocator::create_allocator(_g1h->allocator()); @@ -72,17 +72,20 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) } // Pass locally gathered statistics to global state. 
-void G1ParScanThreadState::flush() { +void G1ParScanThreadState::flush(size_t* surviving_young_words) { _dcq.flush(); // Update allocation statistics. _plab_allocator->flush_and_retire_stats(); _g1h->g1_policy()->record_age_table(&_age_table); + + uint length = _g1h->g1_policy()->young_cset_region_length(); + for (uint region_index = 0; region_index < length; region_index++) { + surviving_young_words[region_index] += _surviving_young_words[region_index]; + } } G1ParScanThreadState::~G1ParScanThreadState() { delete _plab_allocator; - // Update heap statistics. - _g1h->update_surviving_young_words(_surviving_young_words); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base); } @@ -324,14 +327,31 @@ G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id) return _states[worker_id]; } +void G1ParScanThreadStateSet::add_cards_scanned(uint worker_id, size_t cards_scanned) { + assert(worker_id < _n_workers, "out of bounds access"); + _cards_scanned[worker_id] += cards_scanned; +} + +size_t G1ParScanThreadStateSet::total_cards_scanned() const { + assert(_flushed, "thread local state from the per thread states should have been flushed"); + return _total_cards_scanned; +} + +const size_t* G1ParScanThreadStateSet::surviving_young_words() const { + assert(_flushed, "thread local state from the per thread states should have been flushed"); + return _surviving_young_words_total; +} + void G1ParScanThreadStateSet::flush() { assert(!_flushed, "thread local state from the per thread states should be flushed once"); + assert(_total_cards_scanned == 0, "should have been cleared"); for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) { G1ParScanThreadState* pss = _states[worker_index]; - pss->flush(); + _total_cards_scanned += _cards_scanned[worker_index]; + pss->flush(_surviving_young_words_total); delete pss; _states[worker_index] = NULL; } diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp 
b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp index 31f4e1eb793..9986379c84e 100644 --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp @@ -82,7 +82,7 @@ class G1ParScanThreadState : public CHeapObj { } public: - G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id); + G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length); ~G1ParScanThreadState(); void set_ref_processor(ReferenceProcessor* rp) { _scanner.set_ref_processor(rp); } @@ -121,7 +121,7 @@ class G1ParScanThreadState : public CHeapObj { return _surviving_young_words + 1; } - void flush(); + void flush(size_t* surviving_young_words); private: #define G1_PARTIAL_ARRAY_MASK 0x2 @@ -194,31 +194,45 @@ class G1ParScanThreadState : public CHeapObj { class G1ParScanThreadStateSet : public StackObj { G1CollectedHeap* _g1h; G1ParScanThreadState** _states; + size_t* _surviving_young_words_total; + size_t* _cards_scanned; + size_t _total_cards_scanned; uint _n_workers; bool _flushed; public: - G1ParScanThreadStateSet(G1CollectedHeap* g1h, uint n_workers) : + G1ParScanThreadStateSet(G1CollectedHeap* g1h, uint n_workers, size_t young_cset_length) : _g1h(g1h), _states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC)), + _surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, young_cset_length, mtGC)), + _cards_scanned(NEW_C_HEAP_ARRAY(size_t, n_workers, mtGC)), + _total_cards_scanned(0), _n_workers(n_workers), _flushed(false) { for (uint i = 0; i < n_workers; ++i) { - _states[i] = new_par_scan_state(i); + _states[i] = new_par_scan_state(i, young_cset_length); } + memset(_surviving_young_words_total, 0, young_cset_length * sizeof(size_t)); + memset(_cards_scanned, 0, n_workers * sizeof(size_t)); } ~G1ParScanThreadStateSet() { assert(_flushed, "thread local state from the per thread states should have been flushed"); FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states); + FREE_C_HEAP_ARRAY(size_t, 
_surviving_young_words_total); + FREE_C_HEAP_ARRAY(size_t, _cards_scanned); } void flush(); G1ParScanThreadState* state_for_worker(uint worker_id); + void add_cards_scanned(uint worker_id, size_t cards_scanned); + size_t total_cards_scanned() const; + const size_t* surviving_young_words() const; + private: - G1ParScanThreadState* new_par_scan_state(uint worker_id); + G1ParScanThreadState* new_par_scan_state(uint worker_id, size_t young_cset_length); }; #endif // SHARE_VM_GC_G1_G1PARSCANTHREADSTATE_HPP diff --git a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp index b63c07c260b..0e91d3ae448 100644 --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState_ext.cpp @@ -26,6 +26,6 @@ #include "gc/g1/g1ParScanThreadState.hpp" -G1ParScanThreadState* G1ParScanThreadStateSet::new_par_scan_state(uint worker_id) { - return new G1ParScanThreadState(_g1h, worker_id); +G1ParScanThreadState* G1ParScanThreadStateSet::new_par_scan_state(uint worker_id, size_t young_cset_length) { + return new G1ParScanThreadState(_g1h, worker_id, young_cset_length); } diff --git a/hotspot/src/share/vm/gc/g1/g1RemSet.cpp b/hotspot/src/share/vm/gc/g1/g1RemSet.cpp index ebc87dd1016..0aa48e6d4c9 100644 --- a/hotspot/src/share/vm/gc/g1/g1RemSet.cpp +++ b/hotspot/src/share/vm/gc/g1/g1RemSet.cpp @@ -76,7 +76,6 @@ G1RemSet::G1RemSet(G1CollectedHeap* g1, CardTableModRefBS* ct_bs) _ct_bs(ct_bs), _g1p(_g1->g1_policy()), _cg1r(g1->concurrent_g1_refine()), _cset_rs_update_cl(NULL), - _cards_scanned(NULL), _total_cards_scanned(0), _prev_period_summary() { _cset_rs_update_cl = NEW_C_HEAP_ARRAY(G1ParPushHeapRSClosure*, n_workers(), mtGC); @@ -228,9 +227,9 @@ public: size_t cards_looked_up() { return _cards;} }; -void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc, - OopClosure* non_heap_roots, - uint worker_i) { +size_t G1RemSet::scanRS(G1ParPushHeapRSClosure* oc, + OopClosure* non_heap_roots, 
+ uint worker_i) { double rs_time_start = os::elapsedTime(); G1CodeBlobClosure code_root_cl(non_heap_roots); @@ -246,11 +245,10 @@ void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc, double scan_rs_time_sec = (os::elapsedTime() - rs_time_start) - scanRScl.strong_code_root_scan_time_sec(); - assert(_cards_scanned != NULL, "invariant"); - _cards_scanned[worker_i] = scanRScl.cards_done(); - _g1p->phase_times()->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, scan_rs_time_sec); _g1p->phase_times()->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, scanRScl.strong_code_root_scan_time_sec()); + + return scanRScl.cards_done(); } // Closure used for updating RSets and recording references that @@ -298,9 +296,9 @@ void G1RemSet::cleanupHRRS() { HeapRegionRemSet::cleanup(); } -void G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc, - OopClosure* non_heap_roots, - uint worker_i) { +size_t G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc, + OopClosure* non_heap_roots, + uint worker_i) { #if CARD_REPEAT_HISTO ct_freq_update_histo_and_reset(); #endif @@ -322,10 +320,11 @@ void G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc, DirtyCardQueue into_cset_dcq(&_g1->into_cset_dirty_card_queue_set()); updateRS(&into_cset_dcq, worker_i); - scanRS(oc, non_heap_roots, worker_i); + size_t cards_scanned = scanRS(oc, non_heap_roots, worker_i); // We now clear the cached values of _cset_rs_update_cl for this worker _cset_rs_update_cl[worker_i] = NULL; + return cards_scanned; } void G1RemSet::prepare_for_oops_into_collection_set_do() { @@ -333,23 +332,9 @@ void G1RemSet::prepare_for_oops_into_collection_set_do() { _g1->set_refine_cte_cl_concurrency(false); DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set(); dcqs.concatenate_logs(); - - guarantee( _cards_scanned == NULL, "invariant" ); - _cards_scanned = NEW_C_HEAP_ARRAY(size_t, n_workers(), mtGC); - for (uint i = 0; i < n_workers(); ++i) { - _cards_scanned[i] = 0; - } - 
_total_cards_scanned = 0; } void G1RemSet::cleanup_after_oops_into_collection_set_do() { - guarantee( _cards_scanned != NULL, "invariant" ); - _total_cards_scanned = 0; - for (uint i = 0; i < n_workers(); ++i) { - _total_cards_scanned += _cards_scanned[i]; - } - FREE_C_HEAP_ARRAY(size_t, _cards_scanned); - _cards_scanned = NULL; // Cleanup after copy _g1->set_refine_cte_cl_concurrency(true); // Set all cards back to clean. diff --git a/hotspot/src/share/vm/gc/g1/g1RemSet.hpp b/hotspot/src/share/vm/gc/g1/g1RemSet.hpp index 4360d04ff2c..8a670de58f9 100644 --- a/hotspot/src/share/vm/gc/g1/g1RemSet.hpp +++ b/hotspot/src/share/vm/gc/g1/g1RemSet.hpp @@ -62,9 +62,6 @@ protected: ConcurrentG1Refine* _cg1r; - size_t* _cards_scanned; - size_t _total_cards_scanned; - // Used for caching the closure that is responsible for scanning // references into the collection set. G1ParPushHeapRSClosure** _cset_rs_update_cl; @@ -94,9 +91,12 @@ public: // partitioning the work to be done. It should be the same as // the "i" passed to the calling thread's work(i) function. // In the sequential case this param will be ignored. - void oops_into_collection_set_do(G1ParPushHeapRSClosure* blk, - OopClosure* non_heap_roots, - uint worker_i); + // + // Returns the number of cards scanned while looking for pointers + // into the collection set. + size_t oops_into_collection_set_do(G1ParPushHeapRSClosure* blk, + OopClosure* non_heap_roots, + uint worker_i); // Prepare for and cleanup after an oops_into_collection_set_do // call. 
Must call each of these once before and after (in sequential @@ -106,14 +106,13 @@ public: void prepare_for_oops_into_collection_set_do(); void cleanup_after_oops_into_collection_set_do(); - void scanRS(G1ParPushHeapRSClosure* oc, - OopClosure* non_heap_roots, - uint worker_i); + size_t scanRS(G1ParPushHeapRSClosure* oc, + OopClosure* non_heap_roots, + uint worker_i); void updateRS(DirtyCardQueue* into_cset_dcq, uint worker_i); CardTableModRefBS* ct_bs() { return _ct_bs; } - size_t cardsScanned() { return _total_cards_scanned; } // Record, if necessary, the fact that *p (where "p" is in region "from", // which is required to be non-NULL) has changed to a new non-NULL value. From 7ec42b2f7a142d5fb0fa98657ec83a49f477a124 Mon Sep 17 00:00:00 2001 From: Sangheon Kim Date: Wed, 9 Sep 2015 09:19:32 -0700 Subject: [PATCH 05/20] 8135025: Error message is repeated for large value at G1ConcRefinementThreads Changed error handling when G1ConcRefinementThreads creation failed Reviewed-by: jwilhelm, kbarrett, tschatzl --- .../src/share/vm/gc/g1/concurrentG1Refine.cpp | 37 ++++++++++++++----- .../src/share/vm/gc/g1/concurrentG1Refine.hpp | 7 +++- .../src/share/vm/gc/g1/g1CollectedHeap.cpp | 6 ++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/hotspot/src/share/vm/gc/g1/concurrentG1Refine.cpp b/hotspot/src/share/vm/gc/g1/concurrentG1Refine.cpp index 577a3692e1b..ea67798bbb5 100644 --- a/hotspot/src/share/vm/gc/g1/concurrentG1Refine.cpp +++ b/hotspot/src/share/vm/gc/g1/concurrentG1Refine.cpp @@ -29,7 +29,7 @@ #include "gc/g1/g1HotCardCache.hpp" #include "runtime/java.hpp" -ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) : +ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) : _threads(NULL), _n_threads(0), _hot_card_cache(g1h) { @@ -48,29 +48,46 @@ ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosu FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2); } 
set_red_zone(MAX2(G1ConcRefinementRedZone, yellow_zone())); +} - _n_worker_threads = thread_num(); +ConcurrentG1Refine* ConcurrentG1Refine::create(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure, jint* ecode) { + ConcurrentG1Refine* cg1r = new ConcurrentG1Refine(g1h); + if (cg1r == NULL) { + *ecode = JNI_ENOMEM; + vm_shutdown_during_initialization("Could not create ConcurrentG1Refine"); + return NULL; + } + cg1r->_n_worker_threads = thread_num(); // We need one extra thread to do the young gen rset size sampling. - _n_threads = _n_worker_threads + 1; + cg1r->_n_threads = cg1r->_n_worker_threads + 1; - reset_threshold_step(); + cg1r->reset_threshold_step(); - _threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads, mtGC); + cg1r->_threads = NEW_C_HEAP_ARRAY_RETURN_NULL(ConcurrentG1RefineThread*, cg1r->_n_threads, mtGC); + if (cg1r->_threads == NULL) { + *ecode = JNI_ENOMEM; + vm_shutdown_during_initialization("Could not allocate an array for ConcurrentG1RefineThread"); + return NULL; + } uint worker_id_offset = DirtyCardQueueSet::num_par_ids(); ConcurrentG1RefineThread *next = NULL; - for (uint i = _n_threads - 1; i != UINT_MAX; i--) { - ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i); + for (uint i = cg1r->_n_threads - 1; i != UINT_MAX; i--) { + ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(cg1r, next, refine_closure, worker_id_offset, i); assert(t != NULL, "Conc refine should have been created"); if (t->osthread() == NULL) { - vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread"); + *ecode = JNI_ENOMEM; + vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread"); + return NULL; } - assert(t->cg1r() == this, "Conc refine thread should refer to this"); - _threads[i] = t; + assert(t->cg1r() == cg1r, "Conc refine thread should refer to this"); + cg1r->_threads[i] = t; next = t; } + *ecode = JNI_OK; + return cg1r; } void 
ConcurrentG1Refine::reset_threshold_step() { diff --git a/hotspot/src/share/vm/gc/g1/concurrentG1Refine.hpp b/hotspot/src/share/vm/gc/g1/concurrentG1Refine.hpp index ce0e13eaba7..ef8b7a58a62 100644 --- a/hotspot/src/share/vm/gc/g1/concurrentG1Refine.hpp +++ b/hotspot/src/share/vm/gc/g1/concurrentG1Refine.hpp @@ -71,10 +71,15 @@ class ConcurrentG1Refine: public CHeapObj { // Reset the threshold step value based of the current zone boundaries. void reset_threshold_step(); + ConcurrentG1Refine(G1CollectedHeap* g1h); + public: - ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure); ~ConcurrentG1Refine(); + // Returns ConcurrentG1Refine instance if succeeded to create/initialize ConcurrentG1Refine and ConcurrentG1RefineThread. + // Otherwise, returns NULL with error code. + static ConcurrentG1Refine* create(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure, jint* ecode); + void init(G1RegionToSpaceMapper* card_counts_storage); void stop(); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp index 648605e1901..631f29956ff 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp @@ -2125,7 +2125,11 @@ jint G1CollectedHeap::initialize() { _refine_cte_cl = new RefineCardTableEntryClosure(); - _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl); + jint ecode = JNI_OK; + _cg1r = ConcurrentG1Refine::create(this, _refine_cte_cl, &ecode); + if (_cg1r == NULL) { + return ecode; + } // Reserve the maximum. From fea40d07b495ca69eb3f1f8d7a6205d9c3ca9101 Mon Sep 17 00:00:00 2001 From: Kim Barrett Date: Wed, 9 Sep 2015 14:31:12 -0400 Subject: [PATCH 06/20] 8135209: Avoid abutting string literals and identifiers Add spaces between string literals and identifiers. 
Reviewed-by: brutisso, pliden --- hotspot/src/share/vm/gc/g1/g1EvacStats.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hotspot/src/share/vm/gc/g1/g1EvacStats.cpp b/hotspot/src/share/vm/gc/g1/g1EvacStats.cpp index 43dafc1cdbb..009c6dc5151 100644 --- a/hotspot/src/share/vm/gc/g1/g1EvacStats.cpp +++ b/hotspot/src/share/vm/gc/g1/g1EvacStats.cpp @@ -46,11 +46,11 @@ void G1EvacStats::adjust_desired_plab_sz() { if (_allocated == 0) { assert((_unused == 0), err_msg("Inconsistency in PLAB stats: " - "_allocated: "SIZE_FORMAT", " - "_wasted: "SIZE_FORMAT", " - "_region_end_waste: "SIZE_FORMAT", " - "_unused: "SIZE_FORMAT", " - "_used : "SIZE_FORMAT, + "_allocated: " SIZE_FORMAT ", " + "_wasted: " SIZE_FORMAT ", " + "_region_end_waste: " SIZE_FORMAT ", " + "_unused: " SIZE_FORMAT ", " + "_used : " SIZE_FORMAT, _allocated, _wasted, _region_end_waste, _unused, used())); _allocated = 1; } From e587cb7165719edfe928d76dcef3e9b95c9d2b11 Mon Sep 17 00:00:00 2001 From: Matthias Klose Date: Wed, 9 Sep 2015 23:47:32 -0400 Subject: [PATCH 07/20] 8135298: Fix zero builds for "unknown" architectures on linux Add zero architectures for default cases Reviewed-by: coleenp --- hotspot/src/os/linux/vm/os_linux.cpp | 32 ++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/hotspot/src/os/linux/vm/os_linux.cpp b/hotspot/src/os/linux/vm/os_linux.cpp index 769665642d9..5ddf9ac53af 100644 --- a/hotspot/src/os/linux/vm/os_linux.cpp +++ b/hotspot/src/os/linux/vm/os_linux.cpp @@ -2211,9 +2211,13 @@ void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) { } } -const char* search_string = IA32_ONLY("model name") AMD64_ONLY("model name") - IA64_ONLY("") SPARC_ONLY("cpu") - ARM32_ONLY("Processor") PPC_ONLY("Processor") AARCH64_ONLY("Processor"); +#if defined(AMD64) || defined(IA32) || defined(X32) +const char* search_string = "model name"; +#elif defined(SPARC) +const char* search_string = "cpu"; +#else +const char* 
search_string = "Processor"; +#endif // Parses the cpuinfo file for string representing the model name. void os::get_summary_cpu_info(char* cpuinfo, size_t length) { @@ -2248,9 +2252,25 @@ void os::get_summary_cpu_info(char* cpuinfo, size_t length) { } // cpuinfo not found or parsing failed, just print generic string. The entire // /proc/cpuinfo file will be printed later in the file (or enough of it for x86) - strncpy(cpuinfo, IA32_ONLY("x86_32") AMD64_ONLY("x86_32") - IA64_ONLY("IA64") SPARC_ONLY("sparcv9") - ARM32_ONLY("ARM") PPC_ONLY("PPC64") AARCH64_ONLY("AArch64"), length); +#if defined(AMD64) + strncpy(cpuinfo, "x86_64", length); +#elif defined(IA32) + strncpy(cpuinfo, "x86_32", length); +#elif defined(IA64) + strncpy(cpuinfo, "IA64", length); +#elif defined(SPARC) + strncpy(cpuinfo, "sparcv9", length); +#elif defined(AARCH64) + strncpy(cpuinfo, "AArch64", length); +#elif defined(ARM) + strncpy(cpuinfo, "ARM", length); +#elif defined(PPC) + strncpy(cpuinfo, "PPC64", length); +#elif defined(ZERO_LIBARCH) + strncpy(cpuinfo, ZERO_LIBARCH, length); +#else + strncpy(cpuinfo, "unknown", length); +#endif } void os::print_siginfo(outputStream* st, void* siginfo) { From 89ec770497807c602c9a97c1c462db5aaee073eb Mon Sep 17 00:00:00 2001 From: Erik Helin Date: Fri, 11 Sep 2015 10:02:35 +0200 Subject: [PATCH 08/20] 8135260: Split G1CollectorPolicy::finalize_cset into two parts Reviewed-by: tschatzl, mgerdin --- .../src/share/vm/gc/g1/g1CollectedHeap.cpp | 3 +- .../src/share/vm/gc/g1/g1CollectorPolicy.cpp | 34 ++++++++++--------- .../src/share/vm/gc/g1/g1CollectorPolicy.hpp | 5 +-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp index 631f29956ff..89dc4674120 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp @@ -4102,7 +4102,8 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double 
target_pause_time_ms) { g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); #endif // YOUNG_LIST_VERBOSE - g1_policy()->finalize_cset(target_pause_time_ms); + double time_remaining_ms = g1_policy()->finalize_young_cset_part(target_pause_time_ms); + g1_policy()->finalize_old_cset_part(time_remaining_ms); evacuation_info.set_collectionset_regions(g1_policy()->cset_region_length()); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp index 5fafa9e8cbd..0577c3fc80d 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp @@ -1869,7 +1869,7 @@ uint G1CollectorPolicy::calc_max_old_cset_length() { } -void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { +double G1CollectorPolicy::finalize_young_cset_part(double target_pause_time_ms) { double young_start_time_sec = os::elapsedTime(); YoungList* young_list = _g1->young_list(); @@ -1881,7 +1881,6 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { guarantee(_collection_set == NULL, "Precondition"); double base_time_ms = predict_base_elapsed_time_ms(_pending_cards); - double predicted_pause_time_ms = base_time_ms; double time_remaining_ms = MAX2(target_pause_time_ms - base_time_ms, 0.0); ergo_verbose4(ErgoCSetConstruction | ErgoHigh, @@ -1925,15 +1924,16 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { _collection_set = _inc_cset_head; _collection_set_bytes_used_before = _inc_cset_bytes_used_before; time_remaining_ms = MAX2(time_remaining_ms - _inc_cset_predicted_elapsed_time_ms, 0.0); - predicted_pause_time_ms += _inc_cset_predicted_elapsed_time_ms; - ergo_verbose3(ErgoCSetConstruction | ErgoHigh, + ergo_verbose4(ErgoCSetConstruction | ErgoHigh, "add young regions to CSet", ergo_format_region("eden") ergo_format_region("survivors") - ergo_format_ms("predicted young region time"), + ergo_format_ms("predicted young region 
time") + ergo_format_ms("target pause time"), eden_region_length, survivor_region_length, - _inc_cset_predicted_elapsed_time_ms); + _inc_cset_predicted_elapsed_time_ms, + target_pause_time_ms); // The number of recorded young regions is the incremental // collection set's current size @@ -1942,8 +1942,13 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { double young_end_time_sec = os::elapsedTime(); phase_times()->record_young_cset_choice_time_ms((young_end_time_sec - young_start_time_sec) * 1000.0); - // Set the start of the non-young choice time. - double non_young_start_time_sec = young_end_time_sec; + return time_remaining_ms; +} + +void G1CollectorPolicy::finalize_old_cset_part(double time_remaining_ms) { + double non_young_start_time_sec = os::elapsedTime(); + double predicted_old_time_ms = 0.0; + if (!collector_state()->gcs_are_young()) { CollectionSetChooser* cset_chooser = _collectionSetChooser; @@ -2031,7 +2036,7 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { // We will add this region to the CSet. 
time_remaining_ms = MAX2(time_remaining_ms - predicted_time_ms, 0.0); - predicted_pause_time_ms += predicted_time_ms; + predicted_old_time_ms += predicted_time_ms; cset_chooser->remove_and_move_to_next(hr); _g1->old_set_remove(hr); add_old_region_to_cset(hr); @@ -2066,16 +2071,13 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) { stop_incremental_cset_building(); - ergo_verbose5(ErgoCSetConstruction, + ergo_verbose3(ErgoCSetConstruction, "finish choosing CSet", - ergo_format_region("eden") - ergo_format_region("survivors") ergo_format_region("old") - ergo_format_ms("predicted pause time") - ergo_format_ms("target pause time"), - eden_region_length, survivor_region_length, + ergo_format_ms("predicted old region time") + ergo_format_ms("time remaining"), old_cset_region_length(), - predicted_pause_time_ms, target_pause_time_ms); + predicted_old_time_ms, time_remaining_ms); double non_young_end_time_sec = os::elapsedTime(); phase_times()->record_non_young_cset_choice_time_ms((non_young_end_time_sec - non_young_start_time_sec) * 1000.0); diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp index 29608203116..ab2b2fea961 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp @@ -473,7 +473,7 @@ private: // The number of bytes in the collection set before the pause. Set from // the incrementally built collection set at the start of an evacuation - // pause, and incremented in finalize_cset() when adding old regions + // pause, and incremented in finalize_old_cset_part() when adding old regions // (if any) to the collection set. size_t _collection_set_bytes_used_before; @@ -689,7 +689,8 @@ public: // Choose a new collection set. Marks the chosen regions as being // "in_collection_set", and links them together. The head and number of // the collection set are available via access methods. 
- void finalize_cset(double target_pause_time_ms); + double finalize_young_cset_part(double target_pause_time_ms); + virtual void finalize_old_cset_part(double time_remaining_ms); // The head of the list (via "next_in_collection_set()") representing the // current collection set. From c00b15bccdadf22832213a9f97e92c01b6d9d348 Mon Sep 17 00:00:00 2001 From: Erik Helin Date: Fri, 11 Sep 2015 13:20:05 +0200 Subject: [PATCH 09/20] 8135253: Add push method to CollectionSetChooser Reviewed-by: mgerdin, tschatzl --- .../share/vm/gc/g1/collectionSetChooser.cpp | 38 +++++++++++-------- .../share/vm/gc/g1/collectionSetChooser.hpp | 34 ++++++++--------- .../src/share/vm/gc/g1/g1CollectorPolicy.cpp | 2 +- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/hotspot/src/share/vm/gc/g1/collectionSetChooser.cpp b/hotspot/src/share/vm/gc/g1/collectionSetChooser.cpp index 79d47d5552e..d3ad72b38ef 100644 --- a/hotspot/src/share/vm/gc/g1/collectionSetChooser.cpp +++ b/hotspot/src/share/vm/gc/g1/collectionSetChooser.cpp @@ -83,7 +83,7 @@ CollectionSetChooser::CollectionSetChooser() : _regions((ResourceObj::set_allocation_type((address) &_regions, ResourceObj::C_HEAP), 100), true /* C_Heap */), - _curr_index(0), _length(0), _first_par_unreserved_idx(0), + _front(0), _end(0), _first_par_unreserved_idx(0), _region_live_threshold_bytes(0), _remaining_reclaimable_bytes(0) { _region_live_threshold_bytes = HeapRegion::GrainBytes * (size_t) G1MixedGCLiveThresholdPercent / 100; @@ -91,19 +91,19 @@ CollectionSetChooser::CollectionSetChooser() : #ifndef PRODUCT void CollectionSetChooser::verify() { - guarantee(_length <= regions_length(), - err_msg("_length: %u regions length: %u", _length, regions_length())); - guarantee(_curr_index <= _length, - err_msg("_curr_index: %u _length: %u", _curr_index, _length)); + guarantee(_end <= regions_length(), + err_msg("_end: %u regions length: %u", _end, regions_length())); + guarantee(_front <= _end, + err_msg("_front: %u _end: %u", _front, 
_end)); uint index = 0; size_t sum_of_reclaimable_bytes = 0; - while (index < _curr_index) { + while (index < _front) { guarantee(regions_at(index) == NULL, - "all entries before _curr_index should be NULL"); + "all entries before _front should be NULL"); index += 1; } HeapRegion *prev = NULL; - while (index < _length) { + while (index < _end) { HeapRegion *curr = regions_at(index++); guarantee(curr != NULL, "Regions in _regions array cannot be NULL"); guarantee(!curr->is_young(), "should not be young!"); @@ -132,15 +132,15 @@ void CollectionSetChooser::sort_regions() { regions_trunc_to(_first_par_unreserved_idx); } _regions.sort(order_regions); - assert(_length <= regions_length(), "Requirement"); + assert(_end <= regions_length(), "Requirement"); #ifdef ASSERT - for (uint i = 0; i < _length; i++) { + for (uint i = 0; i < _end; i++) { assert(regions_at(i) != NULL, "Should be true by sorting!"); } #endif // ASSERT if (G1PrintRegionLivenessInfo) { G1PrintRegionLivenessInfoClosure cl(gclog_or_tty, "Post-Sorting"); - for (uint i = 0; i < _length; ++i) { + for (uint i = 0; i < _end; ++i) { HeapRegion* r = regions_at(i); cl.doHeapRegion(r); } @@ -154,11 +154,19 @@ void CollectionSetChooser::add_region(HeapRegion* hr) { err_msg("Pinned region shouldn't be added to the collection set (index %u)", hr->hrm_index())); assert(!hr->is_young(), "should not be young!"); _regions.append(hr); - _length++; + _end++; _remaining_reclaimable_bytes += hr->reclaimable_bytes(); hr->calc_gc_efficiency(); } +void CollectionSetChooser::push(HeapRegion* hr) { + assert(hr != NULL, "Can't put back a NULL region"); + assert(_front >= 1, "Too many regions have been put back"); + _front--; + regions_at_put(_front, hr); + _remaining_reclaimable_bytes += hr->reclaimable_bytes(); +} + void CollectionSetChooser::prepare_for_par_region_addition(uint n_threads, uint n_regions, uint chunk_size) { @@ -193,7 +201,7 @@ void CollectionSetChooser::update_totals(uint region_num, // We could have just used 
atomics instead of taking the // lock. However, we currently don't have an atomic add for size_t. MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag); - _length += region_num; + _end += region_num; _remaining_reclaimable_bytes += reclaimable_bytes; } else { assert(reclaimable_bytes == 0, "invariant"); @@ -202,7 +210,7 @@ void CollectionSetChooser::update_totals(uint region_num, void CollectionSetChooser::clear() { _regions.clear(); - _curr_index = 0; - _length = 0; + _front = 0; + _end = 0; _remaining_reclaimable_bytes = 0; }; diff --git a/hotspot/src/share/vm/gc/g1/collectionSetChooser.hpp b/hotspot/src/share/vm/gc/g1/collectionSetChooser.hpp index aaa9db17a23..011c4ef1f2e 100644 --- a/hotspot/src/share/vm/gc/g1/collectionSetChooser.hpp +++ b/hotspot/src/share/vm/gc/g1/collectionSetChooser.hpp @@ -48,12 +48,10 @@ class CollectionSetChooser: public CHeapObj { // The index of the next candidate old region to be considered for // addition to the CSet. - uint _curr_index; + uint _front; - // The number of candidate old regions added to the CSet chooser. - // Note: this is not updated when removing a region using - // remove_and_move_to_next() below. - uint _length; + // The index one past the last candidate old region + uint _end; // Keeps track of the start of the next array chunk to be claimed by // parallel GC workers. @@ -73,31 +71,33 @@ public: // collection without removing it from the CSet chooser. HeapRegion* peek() { HeapRegion* res = NULL; - if (_curr_index < _length) { - res = regions_at(_curr_index); + if (_front < _end) { + res = regions_at(_front); assert(res != NULL, err_msg("Unexpected NULL hr in _regions at index %u", - _curr_index)); + _front)); } return res; } // Remove the given region from the CSet chooser and move to the - // next one. The given region should be the current candidate region - // in the CSet chooser. - void remove_and_move_to_next(HeapRegion* hr) { + // next one.
+ HeapRegion* pop() { + HeapRegion* hr = regions_at(_front); assert(hr != NULL, "pre-condition"); - assert(_curr_index < _length, "pre-condition"); - assert(regions_at(_curr_index) == hr, "pre-condition"); - regions_at_put(_curr_index, NULL); + assert(_front < _end, "pre-condition"); + regions_at_put(_front, NULL); assert(hr->reclaimable_bytes() <= _remaining_reclaimable_bytes, err_msg("remaining reclaimable bytes inconsistent " "from region: " SIZE_FORMAT " remaining: " SIZE_FORMAT, hr->reclaimable_bytes(), _remaining_reclaimable_bytes)); _remaining_reclaimable_bytes -= hr->reclaimable_bytes(); - _curr_index += 1; + _front += 1; + return hr; } + void push(HeapRegion* hr); + CollectionSetChooser(); void sort_regions(); @@ -113,7 +113,7 @@ public: } // Returns the number candidate old regions added - uint length() { return _length; } + uint length() { return _end; } // Serial version. void add_region(HeapRegion *hr); @@ -135,7 +135,7 @@ public: void clear(); // Return the number of candidate regions that remain to be collected. - uint remaining_regions() { return _length - _curr_index; } + uint remaining_regions() { return _end - _front; } // Determine whether the CSet chooser has more candidate regions or not. bool is_empty() { return remaining_regions() == 0; } diff --git a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp index 0577c3fc80d..c9a635f208a 100644 --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp @@ -2037,7 +2037,7 @@ void G1CollectorPolicy::finalize_old_cset_part(double time_remaining_ms) { // We will add this region to the CSet. 
time_remaining_ms = MAX2(time_remaining_ms - predicted_time_ms, 0.0); predicted_old_time_ms += predicted_time_ms; - cset_chooser->remove_and_move_to_next(hr); + cset_chooser->pop(); // already have region via peek() _g1->old_set_remove(hr); add_old_region_to_cset(hr); From 807c69046a1d7fb3df0faf33b5baa2f32cbf4888 Mon Sep 17 00:00:00 2001 From: Jon Masamitsu Date: Mon, 25 May 2015 19:26:23 -0700 Subject: [PATCH 10/20] 8081629: CMS split_block() does not correctly fix up block-offset-table for large blocks Reviewed-by: tschatzl, ysr --- hotspot/src/share/vm/gc/shared/blockOffsetTable.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hotspot/src/share/vm/gc/shared/blockOffsetTable.cpp b/hotspot/src/share/vm/gc/shared/blockOffsetTable.cpp index 3dcdf9c137a..36876f0a0b5 100644 --- a/hotspot/src/share/vm/gc/shared/blockOffsetTable.cpp +++ b/hotspot/src/share/vm/gc/shared/blockOffsetTable.cpp @@ -447,14 +447,16 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk, } else { // Unilaterally fix the first (num_pref_cards - 1) following // the "offset card" in the suffix block. + const size_t right_most_fixed_index = suff_index + num_pref_cards - 1; set_remainder_to_point_to_start_incl(suff_index + 1, - suff_index + num_pref_cards - 1, true /* reducing */); + right_most_fixed_index, true /* reducing */); // Fix the appropriate cards in the remainder of the // suffix block -- these are the last num_pref_cards // cards in each power block of the "new" range plumbed // from suff_addr. bool more = true; uint i = 1; + // Fix the first power block with back_by > num_pref_cards. 
while (more && (i < N_powers)) { size_t back_by = power_to_cards_back(i); size_t right_index = suff_index + back_by - 1; @@ -463,6 +465,9 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk, right_index = end_index - 1; more = false; } + if (left_index <= right_most_fixed_index) { + left_index = right_most_fixed_index + 1; + } if (back_by > num_pref_cards) { // Fill in the remainder of this "power block", if it // is non-null. @@ -471,12 +476,14 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk, N_words + i - 1, true /* reducing */); } else { more = false; // we are done + assert((end_index - 1) == right_index, "Must be at the end."); } i++; break; } i++; } + // Fix the rest of the power blocks. while (more && (i < N_powers)) { size_t back_by = power_to_cards_back(i); size_t right_index = suff_index + back_by - 1; From e75f5a5cdeaedd9d202651ca0d78a5cfed0ebd4e Mon Sep 17 00:00:00 2001 From: Michael Berg Date: Wed, 9 Sep 2015 10:34:17 -0700 Subject: [PATCH 11/20] 8135028: support for vectorizing double precision sqrt Reviewed-by: kvn, twisti --- hotspot/src/cpu/x86/vm/assembler_x86.cpp | 20 ++++ hotspot/src/cpu/x86/vm/assembler_x86.hpp | 4 + hotspot/src/cpu/x86/vm/x86.ad | 73 ++++++++++++++ hotspot/src/share/vm/adlc/formssel.cpp | 1 + hotspot/src/share/vm/opto/classes.hpp | 1 + hotspot/src/share/vm/opto/superword.cpp | 5 + hotspot/src/share/vm/opto/vectornode.cpp | 6 ++ hotspot/src/share/vm/opto/vectornode.hpp | 8 ++ .../loopopts/superword/SumRedSqrt_Double.java | 95 +++++++++++++++++++ 9 files changed, 213 insertions(+) create mode 100644 hotspot/test/compiler/loopopts/superword/SumRedSqrt_Double.java diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index 2df0fca3de6..fcbfce628ed 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -3993,6 +3993,26 @@ void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector 
emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len); } +void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx(), ""); + if (VM_Version::supports_evex()) { + emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); + } +} + +void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) { + assert(VM_Version::supports_avx(), ""); + if (VM_Version::supports_evex()) { + tuple_type = EVEX_FV; + input_size_in_bits = EVEX_64bit; + emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); + } else { + emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); + } +} + void Assembler::andpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index 3cd5dec6314..2d5e7f8c495 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -1920,6 +1920,10 @@ private: void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + // Sqrt Packed Floating-Point Values - Double precision only + void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len); + void vsqrtpd(XMMRegister dst, Address src, int vector_len); + // Bitwise Logical AND of Packed Floating-Point Values void andpd(XMMRegister dst, XMMRegister src); void andps(XMMRegister dst, XMMRegister src); diff --git a/hotspot/src/cpu/x86/vm/x86.ad b/hotspot/src/cpu/x86/vm/x86.ad index c8aa50a106c..39ee7c03d63 100644 --- a/hotspot/src/cpu/x86/vm/x86.ad +++ b/hotspot/src/cpu/x86/vm/x86.ad @@ -1691,6 +1691,10 @@ const bool Matcher::match_rule_supported(int opcode) { if (UseSSE < 1) // requires at least SSE return false; 
break; + case Op_SqrtVD: + if (UseAVX < 1) // enabled for AVX only + return false; + break; case Op_CompareAndSwapL: #ifdef _LP64 case Op_CompareAndSwapP: @@ -7474,6 +7478,75 @@ instruct vshiftcnt(vecS dst, rRegI cnt) %{ ins_pipe( pipe_slow ); %} +// --------------------------------- Sqrt -------------------------------------- + +// Floating point vector sqrt - double precision only +instruct vsqrt2D_reg(vecX dst, vecX src) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SqrtVD src)); + format %{ "vsqrtpd $dst,$src\t! sqrt packed2D" %} + ins_encode %{ + int vector_len = 0; + __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsqrt2D_mem(vecX dst, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SqrtVD (LoadVector mem))); + format %{ "vsqrtpd $dst,$mem\t! sqrt packed2D" %} + ins_encode %{ + int vector_len = 0; + __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsqrt4D_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SqrtVD src)); + format %{ "vsqrtpd $dst,$src\t! sqrt packed4D" %} + ins_encode %{ + int vector_len = 1; + __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsqrt4D_mem(vecY dst, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SqrtVD (LoadVector mem))); + format %{ "vsqrtpd $dst,$mem\t! sqrt packed4D" %} + ins_encode %{ + int vector_len = 1; + __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsqrt8D_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SqrtVD src)); + format %{ "vsqrtpd $dst,$src\t! 
sqrt packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsqrt8D_mem(vecZ dst, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (SqrtVD (LoadVector mem))); + format %{ "vsqrtpd $dst,$mem\t! sqrt packed8D" %} + ins_encode %{ + int vector_len = 2; + __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + // ------------------------------ LeftShift ----------------------------------- // Shorts/Chars vector left shift diff --git a/hotspot/src/share/vm/adlc/formssel.cpp b/hotspot/src/share/vm/adlc/formssel.cpp index 30a02fa441f..141f7c08c70 100644 --- a/hotspot/src/share/vm/adlc/formssel.cpp +++ b/hotspot/src/share/vm/adlc/formssel.cpp @@ -4143,6 +4143,7 @@ bool MatchRule::is_vector() const { "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD", "MulVS","MulVI","MulVL","MulVF","MulVD", "DivVF","DivVD", + "SqrtVD", "AndV" ,"XorV" ,"OrV", "AddReductionVI", "AddReductionVL", "AddReductionVF", "AddReductionVD", diff --git a/hotspot/src/share/vm/opto/classes.hpp b/hotspot/src/share/vm/opto/classes.hpp index 936af318373..5e361177e86 100644 --- a/hotspot/src/share/vm/opto/classes.hpp +++ b/hotspot/src/share/vm/opto/classes.hpp @@ -290,6 +290,7 @@ macro(MulVD) macro(MulReductionVD) macro(DivVF) macro(DivVD) +macro(SqrtVD) macro(LShiftCntV) macro(RShiftCntV) macro(LShiftVB) diff --git a/hotspot/src/share/vm/opto/superword.cpp b/hotspot/src/share/vm/opto/superword.cpp index dda417e6466..791624e5511 100644 --- a/hotspot/src/share/vm/opto/superword.cpp +++ b/hotspot/src/share/vm/opto/superword.cpp @@ -1858,6 +1858,11 @@ void SuperWord::output() { vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } + } else if (opc == Op_SqrtD) { + // Promote operand to vector (Sqrt is a 2 address instruction) + Node* in = vector_opd(p, 1); + vn = 
VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n)); + vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else { ShouldNotReachHere(); } diff --git a/hotspot/src/share/vm/opto/vectornode.cpp b/hotspot/src/share/vm/opto/vectornode.cpp index 8f88329bd05..274e2b7e759 100644 --- a/hotspot/src/share/vm/opto/vectornode.cpp +++ b/hotspot/src/share/vm/opto/vectornode.cpp @@ -92,6 +92,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { case Op_DivD: assert(bt == T_DOUBLE, "must be"); return Op_DivVD; + case Op_SqrtD: + assert(bt == T_DOUBLE, "must be"); + return Op_SqrtVD; case Op_LShiftI: switch (bt) { case T_BOOLEAN: @@ -277,6 +280,9 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b case Op_DivVF: return new DivVFNode(n1, n2, vt); case Op_DivVD: return new DivVDNode(n1, n2, vt); + // Currently only supports double precision sqrt + case Op_SqrtVD: return new SqrtVDNode(n1, vt); + case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt); case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt); case Op_LShiftVI: return new LShiftVINode(n1, n2, vt); diff --git a/hotspot/src/share/vm/opto/vectornode.hpp b/hotspot/src/share/vm/opto/vectornode.hpp index 572c44c8648..4652730d1dc 100644 --- a/hotspot/src/share/vm/opto/vectornode.hpp +++ b/hotspot/src/share/vm/opto/vectornode.hpp @@ -309,6 +309,14 @@ class DivVDNode : public VectorNode { virtual int Opcode() const; }; +//------------------------------SqrtVDNode-------------------------------------- +// Vector Sqrt double +class SqrtVDNode : public VectorNode { + public: + SqrtVDNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {} + virtual int Opcode() const; +}; + //------------------------------LShiftVBNode----------------------------------- // Vector left shift bytes class LShiftVBNode : public VectorNode { diff --git a/hotspot/test/compiler/loopopts/superword/SumRedSqrt_Double.java b/hotspot/test/compiler/loopopts/superword/SumRedSqrt_Double.java new file mode 100644 index 
00000000000..75a2c14e36f --- /dev/null +++ b/hotspot/test/compiler/loopopts/superword/SumRedSqrt_Double.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** +* @test +* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double sqrt test +* +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double +*/ + +public class SumRedSqrt_Double +{ + public static void main(String[] args) throws Exception { + double[] a = new double[256*1024]; + double[] b = new double[256*1024]; + double[] c = new double[256*1024]; + double[] d = new double[256*1024]; + sumReductionInit(a,b,c); + double total = 0; + double valid = 2.06157643776E14; + for(int j = 0; j < 2000; j++) { + total = sumReductionImplement(a,b,c,d,total); + } + if(total == valid) { + System.out.println("Success"); + } 
else { + System.out.println("Invalid sum of elements variable in total: " + total); + System.out.println("Expected value = " + valid); + throw new Exception("Failed"); + } + } + + public static void sumReductionInit( + double[] a, + double[] b, + double[] c) + { + for(int j = 0; j < 1; j++) + { + for(int i = 0; i < a.length; i++) + { + a[i] = i * 1 + j; + b[i] = i * 1 - j; + c[i] = i + j; + } + } + } + + public static double sumReductionImplement( + double[] a, + double[] b, + double[] c, + double[] d, + double total) + { + for(int i = 0; i < a.length; i++) + { + d[i]= Math.sqrt(a[i] * b[i]) + Math.sqrt(a[i] * c[i]) + Math.sqrt(b[i] * c[i]); + total += d[i]; + } + return total; + } + +} From 518c5cacbcba96bc33683cf3779fc8661dd21d6d Mon Sep 17 00:00:00 2001 From: Andrew Haley Date: Tue, 8 Sep 2015 14:08:58 +0100 Subject: [PATCH 12/20] 8135157: DMB elimination in AArch64 C2 synchronization implementation Reduce memory barrier usage in C2 fast lock and unlock. Co-authored-by: Wei Tang Reviewed-by: kvn --- hotspot/src/cpu/aarch64/vm/aarch64.ad | 125 ++++++------------ .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 19 +++ 2 files changed, 61 insertions(+), 83 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/aarch64.ad b/hotspot/src/cpu/aarch64/vm/aarch64.ad index e02aed6d937..eb42df45806 100644 --- a/hotspot/src/cpu/aarch64/vm/aarch64.ad +++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad @@ -3803,82 +3803,38 @@ encode %{ enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{ MacroAssembler _masm(&cbuf); - Register old_reg = as_Register($oldval$$reg); - Register new_reg = as_Register($newval$$reg); - Register base = as_Register($mem$$base); - Register addr_reg; - int index = $mem$$index; - int scale = $mem$$scale; - int disp = $mem$$disp; - if (index == -1) { - if (disp != 0) { - __ lea(rscratch2, Address(base, disp)); - addr_reg = rscratch2; - } else { - // TODO - // should we ever get anything other than this case? 
- addr_reg = base; - } - } else { - Register index_reg = as_Register(index); - if (disp == 0) { - __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } else { - __ lea(rscratch2, Address(base, disp)); - __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } - } - Label retry_load, done; - __ bind(retry_load); - __ ldxr(rscratch1, addr_reg); - __ cmp(rscratch1, old_reg); - __ br(Assembler::NE, done); - __ stlxr(rscratch1, new_reg, addr_reg); - __ cbnzw(rscratch1, retry_load); - __ bind(done); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr); %} enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{ MacroAssembler _masm(&cbuf); - Register old_reg = as_Register($oldval$$reg); - Register new_reg = as_Register($newval$$reg); - Register base = as_Register($mem$$base); - Register addr_reg; - int index = $mem$$index; - int scale = $mem$$scale; - int disp = $mem$$disp; - if (index == -1) { - if (disp != 0) { - __ lea(rscratch2, Address(base, disp)); - addr_reg = rscratch2; - } else { - // TODO - // should we ever get anything other than this case? 
- addr_reg = base; - } - } else { - Register index_reg = as_Register(index); - if (disp == 0) { - __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } else { - __ lea(rscratch2, Address(base, disp)); - __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } - } - Label retry_load, done; - __ bind(retry_load); - __ ldxrw(rscratch1, addr_reg); - __ cmpw(rscratch1, old_reg); - __ br(Assembler::NE, done); - __ stlxrw(rscratch1, new_reg, addr_reg); - __ cbnzw(rscratch1, retry_load); - __ bind(done); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw); %} + + // The only difference between aarch64_enc_cmpxchg and + // aarch64_enc_cmpxchg_acq is that we use load-acquire in the + // CompareAndSwap sequence to serve as a barrier on acquiring a + // lock. + enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{ + MacroAssembler _masm(&cbuf); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr); + %} + + enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{ + MacroAssembler _masm(&cbuf); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw); + %} + + // auxiliary used for CompareAndSwapX to set result register enc_class aarch64_enc_cset_eq(iRegINoSp res) %{ MacroAssembler _masm(&cbuf); @@ -4398,13 +4354,10 @@ encode %{ // Compare object markOop with mark and if equal exchange scratch1 // with object markOop. 
- // Note that this is simply a CAS: it does not generate any - // barriers. These are separately generated by - // membar_acquire_lock(). { Label retry_load; __ bind(retry_load); - __ ldxr(tmp, oop); + __ ldaxr(tmp, oop); __ cmp(tmp, disp_hdr); __ br(Assembler::NE, cas_failed); // use stlxr to ensure update is immediately visible @@ -4454,7 +4407,7 @@ encode %{ { Label retry_load, fail; __ bind(retry_load); - __ ldxr(rscratch1, tmp); + __ ldaxr(rscratch1, tmp); __ cmp(disp_hdr, rscratch1); __ br(Assembler::NE, fail); // use stlxr to ensure update is immediately visible @@ -8017,10 +7970,10 @@ instruct membar_acquire_lock() %{ match(MemBarAcquireLock); ins_cost(VOLATILE_REF_COST); - format %{ "membar_acquire_lock" %} + format %{ "membar_acquire_lock (elided)" %} ins_encode %{ - __ membar(Assembler::LoadLoad|Assembler::LoadStore); + __ block_comment("membar_acquire_lock (elided)"); %} ins_pipe(pipe_serial); @@ -8080,10 +8033,10 @@ instruct membar_release_lock() %{ match(MemBarReleaseLock); ins_cost(VOLATILE_REF_COST); - format %{ "membar_release_lock" %} + format %{ "membar_release_lock (elided)" %} ins_encode %{ - __ membar(Assembler::LoadStore|Assembler::StoreStore); + __ block_comment("membar_release_lock (elided)"); %} ins_pipe(pipe_serial); @@ -8369,7 +8322,11 @@ instruct storePConditional(memory heap_top_ptr, iRegP oldval, iRegP newval, rFla ins_pipe(pipe_serial); %} -// this has to be implemented as a CAS + +// storeLConditional is used by PhaseMacroExpand::expand_lock_node +// when attempting to rebias a lock towards the current thread. We +// must use the acquire form of cmpxchg in order to guarantee acquire +// semantics in this case. 
instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{ match(Set cr (StoreLConditional mem (Binary oldval newval))); @@ -8381,12 +8338,14 @@ instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFl "cmpw rscratch1, zr\t# EQ on successful write" %} - ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval)); + ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval)); ins_pipe(pipe_slow); %} -// this has to be implemented as a CAS +// storeIConditional also has acquire semantics, for no better reason +// than matching storeLConditional. At the time of writing this +// comment storeIConditional was not used anywhere by AArch64. instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{ match(Set cr (StoreIConditional mem (Binary oldval newval))); @@ -8398,7 +8357,7 @@ instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFl "cmpw rscratch1, zr\t# EQ on successful write" %} - ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval)); + ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval)); ins_pipe(pipe_slow); %} diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp index bfe8fef5463..1fbba6f9b18 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp @@ -917,6 +917,8 @@ public: void cmpptr(Register src1, Address src2); + // Various forms of CAS + void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, Label &suceed, Label *fail); @@ -938,6 +940,23 @@ public: str(rscratch2, adr); } + // A generic CAS; success or failure is in the EQ flag. 
+ template <class T1, class T2> + void cmpxchg(Register addr, Register expected, Register new_val, + T1 load_insn, + void (MacroAssembler::*cmp_insn)(Register, Register), + T2 store_insn, + Register tmp = rscratch1) { + Label retry_load, done; + bind(retry_load); + (this->*load_insn)(tmp, addr); + (this->*cmp_insn)(tmp, expected); + br(Assembler::NE, done); + (this->*store_insn)(tmp, new_val, addr); + cbnzw(tmp, retry_load); + bind(done); + } + // Calls address trampoline_call(Address entry, CodeBuffer *cbuf = NULL); From 9f9739c156578d988daf98d5d7b02a98bb8366c0 Mon Sep 17 00:00:00 2001 From: Kirill Zhaldybin Date: Fri, 11 Sep 2015 16:11:07 +0300 Subject: [PATCH 13/20] 8132980: Improve stability of whitebox methods getCodeBlob and freeCodeBlob Added checks for negative and null size and address values (where applicable) for getCodeBlob, allocateCodeBlob and freeCodeBlob, added regression test Reviewed-by: iignatyev --- hotspot/src/share/vm/prims/whitebox.cpp | 21 +++++-- .../test/testlibrary/jdk/test/lib/Utils.java | 24 ++++++++ .../whitebox/BlobSanityTest.java | 60 +++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 hotspot/test/testlibrary_tests/whitebox/BlobSanityTest.java diff --git a/hotspot/src/share/vm/prims/whitebox.cpp b/hotspot/src/share/vm/prims/whitebox.cpp index a9c50888c41..5a44b5645a0 100644 --- a/hotspot/src/share/vm/prims/whitebox.cpp +++ b/hotspot/src/share/vm/prims/whitebox.cpp @@ -1041,11 +1041,18 @@ CodeBlob* WhiteBox::allocate_code_blob(int size, int blob_type) { } WB_ENTRY(jlong, WB_AllocateCodeBlob(JNIEnv* env, jobject o, jint size, jint blob_type)) - return (jlong) WhiteBox::allocate_code_blob(size, blob_type); + if (size < 0) { + THROW_MSG_0(vmSymbols::java_lang_IllegalArgumentException(), + err_msg("WB_AllocateCodeBlob: size is negative: " INT32_FORMAT, size)); + } + return (jlong) WhiteBox::allocate_code_blob(size, blob_type); WB_END WB_ENTRY(void, WB_FreeCodeBlob(JNIEnv* env, jobject o, jlong addr)) -
BufferBlob::free((BufferBlob*) addr); + if (addr == 0) { + return; + } + BufferBlob::free((BufferBlob*) addr); WB_END WB_ENTRY(jobjectArray, WB_GetCodeHeapEntries(JNIEnv* env, jobject o, jint blob_type)) @@ -1090,9 +1097,13 @@ WB_ENTRY(jint, WB_GetCompilationActivityMode(JNIEnv* env, jobject o)) WB_END WB_ENTRY(jobjectArray, WB_GetCodeBlob(JNIEnv* env, jobject o, jlong addr)) - ThreadToNativeFromVM ttn(thread); - CodeBlobStub stub((CodeBlob*) addr); - return codeBlob2objectArray(thread, env, &stub); + if (addr == 0) { + THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(), + "WB_GetCodeBlob: addr is null"); + } + ThreadToNativeFromVM ttn(thread); + CodeBlobStub stub((CodeBlob*) addr); + return codeBlob2objectArray(thread, env, &stub); WB_END WB_ENTRY(jlong, WB_GetThreadStackSize(JNIEnv* env, jobject o)) diff --git a/hotspot/test/testlibrary/jdk/test/lib/Utils.java b/hotspot/test/testlibrary/jdk/test/lib/Utils.java index eb9ae00017c..a15967bb77b 100644 --- a/hotspot/test/testlibrary/jdk/test/lib/Utils.java +++ b/hotspot/test/testlibrary/jdk/test/lib/Utils.java @@ -428,4 +428,28 @@ public final class Utils { public static long adjustTimeout(long tOut) { return Math.round(tOut * Utils.TIMEOUT_FACTOR); } + + /** + * Runs runnable and checks that it throws expected exception. If expectedException is null it means + * that we expect no exception to be thrown.
+ * @param runnable what we run + * @param expectedException expected exception + */ + public static void runAndCheckException(Runnable runnable, Class expectedException) { + try { + runnable.run(); + if (expectedException != null) { + throw new AssertionError("Didn't get expected exception " + expectedException.getSimpleName()); + } + } catch (Throwable t) { + if (expectedException == null) { + throw new AssertionError("Got unexpected exception ", t); + } + if (!expectedException.isAssignableFrom(t.getClass())) { + throw new AssertionError(String.format("Got unexpected exception %s instead of %s", + t.getClass().getSimpleName(), expectedException.getSimpleName()), t); + } + } + } + } diff --git a/hotspot/test/testlibrary_tests/whitebox/BlobSanityTest.java b/hotspot/test/testlibrary_tests/whitebox/BlobSanityTest.java new file mode 100644 index 00000000000..d8bf7970715 --- /dev/null +++ b/hotspot/test/testlibrary_tests/whitebox/BlobSanityTest.java @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test BlobSanityTest + * @bug 8132980 + * @library /testlibrary /../../test/lib + * @modules java.management/sun.management + * @build BlobSanityTest + * @run main ClassFileInstaller sun.hotspot.WhiteBox + * sun.hotspot.WhiteBox$WhiteBoxPermission + * @run main/othervm -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI BlobSanityTest + * @summary sanity testing of allocateCodeBlob, freeCodeBlob and getCodeBlob + */ + + +import sun.hotspot.WhiteBox; + +import java.util.function.Consumer; +import jdk.test.lib.Utils; + +public class BlobSanityTest { + + private static void runTest(Consumer consumer, int val, String testCaseName, Class + expectedException) { + System.out.println("Calling " + testCaseName); + Utils.runAndCheckException(() -> consumer.accept(val), expectedException); + System.out.println("Looks ok"); + } + + public static void main(String[] args) throws Exception { + System.out.println("Crash means that sanity check failed"); + + WhiteBox wb = WhiteBox.getWhiteBox(); + + runTest(wb::freeCodeBlob, 0, "wb::freeCodeBlob(0)", null); + runTest(wb::getCodeBlob, 0, "wb::getCodeBlob(0)", NullPointerException.class); + runTest(x -> wb.allocateCodeBlob(x, 0), -1, "wb::allocateCodeBlob(-1,0)", IllegalArgumentException.class); + } +} From d67924dc8eff76ec067a74e9fb9fdda6aa7e383d Mon Sep 17 00:00:00 2001 From: Michael Berg Date: Fri, 11 Sep 2015 17:02:44 -0700 Subject: [PATCH 14/20] 8132160: support for AVX 512 call frames and stack management Simplify save/restore frame on x86 systems which support EVEX. 
Reviewed-by: kvn, iveresov --- hotspot/src/cpu/x86/vm/assembler_x86.cpp | 1266 ++++++++++------- hotspot/src/cpu/x86/vm/assembler_x86.hpp | 56 +- .../src/cpu/x86/vm/c1_LIRAssembler_x86.cpp | 20 +- hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp | 141 +- hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp | 332 +++-- hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp | 3 + .../src/cpu/x86/vm/sharedRuntime_x86_32.cpp | 215 ++- .../src/cpu/x86/vm/sharedRuntime_x86_64.cpp | 311 +--- .../src/cpu/x86/vm/stubGenerator_x86_32.cpp | 52 +- .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 80 +- .../src/cpu/x86/vm/stubRoutines_x86_32.hpp | 2 +- .../src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- hotspot/src/cpu/x86/vm/vm_version_x86.cpp | 20 +- hotspot/src/cpu/x86/vm/vm_version_x86.hpp | 24 +- hotspot/src/cpu/x86/vm/x86.ad | 295 ++-- hotspot/src/cpu/x86/vm/x86_32.ad | 8 +- hotspot/src/cpu/x86/vm/x86_64.ad | 8 +- 17 files changed, 1578 insertions(+), 1257 deletions(-) diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index fcbfce628ed..c7d743c9324 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -394,25 +394,25 @@ bool Assembler::emit_compressed_disp_byte(int &disp) { int mod_idx = 0; // We will test if the displacement fits the compressed format and if so // apply the compression to the displacment iff the result is8bit. - if (VM_Version::supports_evex() && is_evex_instruction) { - switch (tuple_type) { + if (VM_Version::supports_evex() && _is_evex_instruction) { + switch (_tuple_type) { case EVEX_FV: - if ((evex_encoding & VEX_W) == VEX_W) { - mod_idx += 2 + ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + if ((_evex_encoding & VEX_W) == VEX_W) { + mod_idx += 2 + ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; } else { - mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 
1 : 0; } break; case EVEX_HV: - mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; + mod_idx = ((_evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0; break; case EVEX_FVM: break; case EVEX_T1S: - switch (input_size_in_bits) { + switch (_input_size_in_bits) { case EVEX_8bit: break; @@ -433,7 +433,7 @@ bool Assembler::emit_compressed_disp_byte(int &disp) { case EVEX_T1F: case EVEX_T2: case EVEX_T4: - mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0; + mod_idx = (_input_size_in_bits == EVEX_64bit) ? 1 : 0; break; case EVEX_T8: @@ -459,8 +459,8 @@ bool Assembler::emit_compressed_disp_byte(int &disp) { break; } - if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) { - int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len]; + if (_avx_vector_len >= AVX_128bit && _avx_vector_len <= AVX_512bit) { + int disp_factor = tuple_table[_tuple_type + mod_idx][_avx_vector_len]; if ((disp % disp_factor) == 0) { int new_disp = disp / disp_factor; if (is8bit(new_disp)) { @@ -591,7 +591,7 @@ void Assembler::emit_operand(Register reg, Register base, Register index, emit_data(disp, rspec, disp32_operand); } } - is_evex_instruction = false; + _is_evex_instruction = false; } void Assembler::emit_operand(XMMRegister reg, Register base, Register index, @@ -1229,8 +1229,8 @@ void Assembler::addsd(XMMRegister dst, XMMRegister src) { void Assembler::addsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2); } else { emit_simd_arith(0x58, dst, src, VEX_SIMD_F2); @@ -1245,8 +1245,8 @@ void Assembler::addss(XMMRegister dst, XMMRegister src) { void Assembler::addss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = 
EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_simd_arith(0x58, dst, src, VEX_SIMD_F3); } @@ -1254,16 +1254,16 @@ void Assembler::addss(XMMRegister dst, Address src) { void Assembler::aesdec(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDE); emit_operand(dst, src); } void Assembler::aesdec(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDE); emit_int8(0xC0 | encode); } @@ -1271,16 +1271,16 @@ void Assembler::aesdec(XMMRegister dst, XMMRegister src) { void Assembler::aesdeclast(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDF); emit_operand(dst, src); } void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDF); 
emit_int8((unsigned char)(0xC0 | encode)); } @@ -1288,16 +1288,16 @@ void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) { void Assembler::aesenc(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDC); emit_operand(dst, src); } void Assembler::aesenc(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDC); emit_int8(0xC0 | encode); } @@ -1305,21 +1305,20 @@ void Assembler::aesenc(XMMRegister dst, XMMRegister src) { void Assembler::aesenclast(XMMRegister dst, Address src) { assert(VM_Version::supports_aes(), ""); InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDD); emit_operand(dst, src); } void Assembler::aesenclast(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_aes(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8((unsigned char)0xDD); emit_int8((unsigned char)(0xC0 | encode)); } - void Assembler::andl(Address dst, 
int32_t imm32) { InstructionMark im(this); prefix(dst); @@ -1347,7 +1346,7 @@ void Assembler::andl(Register dst, Register src) { void Assembler::andnl(Register dst, Register src1, Register src2) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false); + int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2); emit_int8((unsigned char)0xF2); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1355,7 +1354,7 @@ void Assembler::andnl(Register dst, Register src1, Register src2) { void Assembler::andnl(Register dst, Register src1, Address src2) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38_legacy(dst, src1, src2, false); + vex_prefix_0F38_legacy(dst, src1, src2); emit_int8((unsigned char)0xF2); emit_operand(dst, src2); } @@ -1382,7 +1381,7 @@ void Assembler::bswapl(Register reg) { // bswap void Assembler::blsil(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false); + int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src); emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1390,14 +1389,14 @@ void Assembler::blsil(Register dst, Register src) { void Assembler::blsil(Register dst, Address src) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38_legacy(rbx, dst, src, false); + vex_prefix_0F38_legacy(rbx, dst, src); emit_int8((unsigned char)0xF3); emit_operand(rbx, src); } void Assembler::blsmskl(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false); + int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src); 
emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1405,14 +1404,14 @@ void Assembler::blsmskl(Register dst, Register src) { void Assembler::blsmskl(Register dst, Address src) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38(rdx, dst, src, false); + vex_prefix_0F38_legacy(rdx, dst, src); emit_int8((unsigned char)0xF3); emit_operand(rdx, src); } void Assembler::blsrl(Register dst, Register src) { assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false); + int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src); emit_int8((unsigned char)0xF3); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1420,7 +1419,7 @@ void Assembler::blsrl(Register dst, Register src) { void Assembler::blsrl(Register dst, Address src) { InstructionMark im(this); assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported"); - vex_prefix_0F38_legacy(rcx, dst, src, false); + vex_prefix_0F38_legacy(rcx, dst, src); emit_int8((unsigned char)0xF3); emit_operand(rcx, src); } @@ -1569,9 +1568,9 @@ void Assembler::comisd(XMMRegister dst, Address src) { // 0x66 is there. 
Strangly ucomisd comes out correct NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; - emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } else { emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); } @@ -1580,7 +1579,7 @@ void Assembler::comisd(XMMRegister dst, Address src) { void Assembler::comisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true); + emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } else { emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66); } @@ -1588,16 +1587,16 @@ void Assembler::comisd(XMMRegister dst, XMMRegister src) { void Assembler::comiss(XMMRegister dst, Address src) { if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true); } void Assembler::comiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true); + emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true); } void Assembler::cpuid() { @@ -1607,12 +1606,12 @@ void Assembler::cpuid() { void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3); + emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true); } void 
Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE); + emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ true); } void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { @@ -1627,8 +1626,8 @@ void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { void Assembler::cvtsd2ss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1F; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1F; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2); } else { emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2); @@ -1637,12 +1636,7 @@ void Assembler::cvtsd2ss(XMMRegister dst, Address src) { void Assembler::cvtsi2sdl(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = 0; - if (VM_Version::supports_evex()) { - encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true); - } else { - encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false); - } + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VM_Version::supports_evex()); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1650,9 +1644,9 @@ void Assembler::cvtsi2sdl(XMMRegister dst, Register src) { void Assembler::cvtsi2sdl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; - emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true); } else { emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2); } @@ -1660,23 +1654,23 @@ void Assembler::cvtsi2sdl(XMMRegister dst, Address src) { void Assembler::cvtsi2ssl(XMMRegister dst, 
Register src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvtsi2ssl(XMMRegister dst, Address src) { if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true); + emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); } void Assembler::cvtsi2ssq(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true); + int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1688,8 +1682,8 @@ void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) { void Assembler::cvtss2sd(XMMRegister dst, Address src) { if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3); @@ -1698,14 +1692,14 @@ void Assembler::cvtss2sd(XMMRegister dst, Address src) { void Assembler::cvttsd2sil(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvttss2sil(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = 
simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } @@ -1721,8 +1715,8 @@ void Assembler::decl(Address dst) { void Assembler::divsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2); } else { emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2); @@ -1740,8 +1734,8 @@ void Assembler::divsd(XMMRegister dst, XMMRegister src) { void Assembler::divss(XMMRegister dst, Address src) { if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } NOT_LP64(assert(VM_Version::supports_sse(), "")); emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3); @@ -1995,8 +1989,16 @@ void Assembler::mov(Register dst, Register src) { void Assembler::movapd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - if (VM_Version::supports_evex()) { - emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true); + if (VM_Version::supports_avx512novl()) { + int vector_len = AVX_512bit; + int dst_enc = dst->encoding(); + int src_enc = src->encoding(); + int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); + emit_int8(0x28); + emit_int8((unsigned char)(0xC0 | encode)); + } else if (VM_Version::supports_evex()) { + emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66); } else { emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66); } @@ -2004,13 +2006,19 @@ void Assembler::movapd(XMMRegister dst, XMMRegister src) { void Assembler::movaps(XMMRegister dst, XMMRegister 
src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE); + if (VM_Version::supports_avx512novl()) { + int vector_len = AVX_512bit; + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, vector_len); + emit_int8(0x28); + emit_int8((unsigned char)(0xC0 | encode)); + } else { + emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE); + } } void Assembler::movlhps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F, - false, AVX_128bit); + int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, /* no_mask_reg */ true); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2023,48 +2031,54 @@ void Assembler::movb(Register dst, Address src) { emit_operand(dst, src); } -void Assembler::kmovq(KRegister dst, KRegister src) { +void Assembler::kmovql(KRegister dst, KRegister src) { NOT_LP64(assert(VM_Version::supports_evex(), "")); int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, - true, VEX_OPCODE_0F, true); + /* no_mask_reg */ true, VEX_OPCODE_0F, /* rex_w */ true); emit_int8((unsigned char)0x90); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::kmovq(KRegister dst, Address src) { +void Assembler::kmovql(KRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_evex(), "")); int dst_enc = dst->encoding(); int nds_enc = 0; vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE, - VEX_OPCODE_0F, true, AVX_128bit, true, true); + VEX_OPCODE_0F, /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true); emit_int8((unsigned char)0x90); emit_operand((Register)dst, src); } -void Assembler::kmovq(Address dst, KRegister src) { +void Assembler::kmovql(Address dst, KRegister src) { NOT_LP64(assert(VM_Version::supports_evex(), "")); int src_enc = src->encoding(); int nds_enc = 0; vex_prefix(dst, nds_enc, src_enc, 
VEX_SIMD_NONE, - VEX_OPCODE_0F, true, AVX_128bit, true, true); + VEX_OPCODE_0F, /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_reg_mask */ true); emit_int8((unsigned char)0x90); emit_operand((Register)src, dst); } void Assembler::kmovql(KRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_evex(), "")); - bool supports_bw = VM_Version::supports_avx512bw(); - VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; - int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, - VEX_OPCODE_0F, supports_bw); + VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; + int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true, + VEX_OPCODE_0F, /* legacy_mode */ !_legacy_mode_bw); emit_int8((unsigned char)0x92); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::kmovdl(KRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_evex(), "")); - VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE; - int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false); + VexSimdPrefix pre = !_legacy_mode_bw ? 
VEX_SIMD_F2 : VEX_SIMD_NONE; + int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, /* no_mask_reg */ true); + emit_int8((unsigned char)0x92); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::kmovwl(KRegister dst, Register src) { + NOT_LP64(assert(VM_Version::supports_evex(), "")); + int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, /* no_mask_reg */ true); emit_int8((unsigned char)0x92); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2088,7 +2102,7 @@ void Assembler::movb(Address dst, Register src) { void Assembler::movdl(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true); + int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, /* no_mask_reg */ true); emit_int8(0x6E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2096,7 +2110,7 @@ void Assembler::movdl(XMMRegister dst, Register src) { void Assembler::movdl(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // swap src/dst to get correct prefix - int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true); + int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, /* no_mask_reg */ true); emit_int8(0x7E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2104,11 +2118,11 @@ void Assembler::movdl(Register dst, XMMRegister src) { void Assembler::movdl(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F); + simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true); emit_int8(0x6E); emit_operand(dst, src); } @@ -2116,58 +2130,61 @@ void Assembler::movdl(XMMRegister dst, Address src) { void Assembler::movdl(Address dst, XMMRegister src) { 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, true); + simd_prefix(dst, src, VEX_SIMD_66, /* no_reg_mask */ true); emit_int8(0x7E); emit_operand(src, dst); } void Assembler::movdqa(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66); } void Assembler::movdqa(XMMRegister dst, Address src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66); } void Assembler::movdqu(XMMRegister dst, Address src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3); } void Assembler::movdqu(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3); } void Assembler::movdqu(Address dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3, false); + simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false); emit_int8(0x7F); emit_operand(src, dst); } // Move Unaligned 256bit Vector void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); - if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; - } int vector_len = AVX_256bit; int encode = 
vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len); emit_int8(0x6F); @@ -2175,67 +2192,100 @@ void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { } void Assembler::vmovdqu(XMMRegister dst, Address src) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } InstructionMark im(this); int vector_len = AVX_256bit; - vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false); + vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len); emit_int8(0x6F); emit_operand(dst, src); } void Assembler::vmovdqu(Address dst, XMMRegister src) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } InstructionMark im(this); int vector_len = AVX_256bit; // swap src<->dst for encoding assert(src != xnoreg, "sanity"); - vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); + vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len); emit_int8(0x7F); emit_operand(src, dst); } // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64) -void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) { +void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); int src_enc = src->encoding(); int dst_enc = dst->encoding(); int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F, - true, vector_len, false, false); + /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x6F); emit_int8((unsigned char)(0xC0 | encode)); } -void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) { +void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); InstructionMark im(this); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; - vex_prefix_q(dst, xnoreg, src, 
VEX_SIMD_F3, vector_len, false); - } else { - vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false); + _tuple_type = EVEX_FVM; } + vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len); emit_int8(0x6F); emit_operand(dst, src); } -void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) { +void Assembler::evmovdqul(Address dst, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, ""); InstructionMark im(this); assert(src != xnoreg, "sanity"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; - // swap src<->dst for encoding - vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); - } else { - // swap src<->dst for encoding - vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false); + _tuple_type = EVEX_FVM; } + // swap src<->dst for encoding + vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len); + emit_int8(0x7F); + emit_operand(src, dst); +} + +void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; + assert(UseAVX > 0, ""); + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); + emit_int8(0x6F); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) { + _instruction_uses_vl = true; + assert(UseAVX > 2, ""); + InstructionMark im(this); + _tuple_type = EVEX_FVM; + vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len); + emit_int8(0x6F); + emit_operand(dst, src); +} + +void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; + assert(UseAVX > 2, ""); + InstructionMark im(this); + assert(src != xnoreg, "sanity"); + _tuple_type = EVEX_FVM; + // swap src<->dst for encoding + vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len); emit_int8(0x7F); 
emit_operand(src, dst); } @@ -2282,10 +2332,12 @@ void Assembler::movl(Address dst, Register src) { void Assembler::movlpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; + emit_simd_arith_q(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); + } else { + emit_simd_arith(0x12, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } - emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true); } void Assembler::movq( MMXRegister dst, Address src ) { @@ -2312,11 +2364,11 @@ void Assembler::movq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; - simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, /* no_mask_reg */ true); } else { - simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F); + simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); } emit_int8(0x7E); emit_operand(dst, src); @@ -2326,12 +2378,12 @@ void Assembler::movq(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; - simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true, - VEX_OPCODE_0F, true, AVX_128bit); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F, /* rex_w */ true); } else { - simd_prefix(dst, src, VEX_SIMD_66, true); + simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } emit_int8((unsigned char)0xD6); emit_operand(src, dst); @@ -2356,7 +2408,7 @@ void Assembler::movsbl(Register dst, Register src) { // 
movsxb void Assembler::movsd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true); + emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true); } else { emit_simd_arith(0x10, dst, src, VEX_SIMD_F2); } @@ -2365,9 +2417,9 @@ void Assembler::movsd(XMMRegister dst, XMMRegister src) { void Assembler::movsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; - emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, true); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true); } else { emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2); } @@ -2377,11 +2429,11 @@ void Assembler::movsd(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionMark im(this); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2); } else { - simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false); + simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, /* no_mask_reg */ false); } emit_int8(0x11); emit_operand(src, dst); @@ -2389,26 +2441,26 @@ void Assembler::movsd(Address dst, XMMRegister src) { void Assembler::movss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true); + emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); } void Assembler::movss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } - 
emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true); + emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); } void Assembler::movss(Address dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_F3, false); + simd_prefix(dst, src, VEX_SIMD_F3, /* no_mask_reg */ false); emit_int8(0x11); emit_operand(src, dst); } @@ -2501,8 +2553,8 @@ void Assembler::mull(Register src) { void Assembler::mulsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2); } else { emit_simd_arith(0x59, dst, src, VEX_SIMD_F2); @@ -2521,8 +2573,8 @@ void Assembler::mulsd(XMMRegister dst, XMMRegister src) { void Assembler::mulss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_simd_arith(0x59, dst, src, VEX_SIMD_F3); } @@ -2831,29 +2883,27 @@ void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } - emit_simd_arith(0x67, dst, src, VEX_SIMD_66, - false, (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::packuswb(XMMRegister 
dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x67, dst, src, VEX_SIMD_66, - false, (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x67, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "some form of AVX must be enabled"); - emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, - false, (VM_Version::supports_avx512dq() == false)); + emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx2(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_3A, true, vector_len); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_3A, /* rex_w */ true, vector_len); emit_int8(0x00); emit_int8(0xC0 | encode); emit_int8(imm8); @@ -2867,8 +2917,8 @@ void Assembler::pause() { void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse4_2(), ""); InstructionMark im(this); - simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A, - false, AVX_128bit, true); + simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_3A, + /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x61); emit_operand(dst, src); emit_int8(imm8); @@ -2876,8 +2926,8 @@ void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) { void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_2(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_3A, false, AVX_128bit, true); + int encode = 
simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x61); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2885,8 +2935,8 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A, - false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2894,8 +2944,8 @@ void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A, - false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); + int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2903,8 +2953,8 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A, - false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); + int encode = simd_prefix_and_encode(dst, 
dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_dq); emit_int8(0x22); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2912,8 +2962,8 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A, - false, AVX_128bit, (VM_Version::supports_avx512dq() == false)); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F_3A, /* rex_w */ true, AVX_128bit, /* legacy_mode */ _legacy_mode_dq); emit_int8(0x22); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); @@ -2922,17 +2972,17 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_HVM; + _tuple_type = EVEX_HVM; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38); + simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38); emit_int8(0x30); emit_operand(dst, src); } void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F_38); emit_int8(0x30); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3035,8 +3085,8 @@ void Assembler::prefix(Prefix p) { void Assembler::pshufb(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_ssse3(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, 
VEX_OPCODE_0F_38, - false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x00); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3044,33 +3094,34 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) { void Assembler::pshufb(XMMRegister dst, Address src) { assert(VM_Version::supports_ssse3(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } InstructionMark im(this); - simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38, - false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + simd_prefix(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x00); emit_operand(dst, src); } void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) { + _instruction_uses_vl = true; assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66); emit_int8(mode & 0xFF); - } void Assembler::pshufd(XMMRegister dst, Address src, int mode) { + _instruction_uses_vl = true; assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix(dst, src, VEX_SIMD_66, false); + simd_prefix(dst, src, VEX_SIMD_66, /* no_mask_reg */ false); emit_int8(0x70); emit_operand(dst, src); emit_int8(mode & 0xFF); @@ -3079,8 +3130,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) { void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 
assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false, - (VM_Version::supports_avx512bw() == false)); + emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); emit_int8(mode & 0xFF); } @@ -3089,29 +3139,33 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } InstructionMark im(this); - simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F, - false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, /* no_mask_reg */ false, + VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x70); emit_operand(dst, src); emit_int8(mode & 0xFF); } void Assembler::psrldq(XMMRegister dst, int shift) { - // Shift 128 bit value in xmm register by number of bytes. + // Shift left 128 bit value in dst XMMRegister by shift number of bytes. NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + // XMM3 is for /3 encoding: 66 0F 73 /3 ib + int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift); } void Assembler::pslldq(XMMRegister dst, int shift) { - // Shift left 128 bit value in xmm register by number of bytes. + // Shift left 128 bit value in dst XMMRegister by shift number of bytes. 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + // XMM7 is for /7 encoding: 66 0F 73 /7 ib + int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, /* no_mask_reg */ true, + VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift); @@ -3121,16 +3175,16 @@ void Assembler::ptest(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); InstructionMark im(this); - simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + simd_prefix(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x17); emit_operand(dst, src); } void Assembler::ptest(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x17); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3142,7 +3196,8 @@ void Assembler::vptest(XMMRegister dst, Address src) { assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false); + vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* rex_w */ false, + vector_len, /* legacy_mode */ true, /* no_mask_reg */ false); emit_int8(0x17); emit_operand(dst, src); } @@ -3150,8 +3205,7 @@ void Assembler::vptest(XMMRegister dst, Address src) { void 
Assembler::vptest(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); int vector_len = AVX_256bit; - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38, true, false); + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true); emit_int8(0x17); emit_int8((unsigned char)(0xC0 | encode)); } @@ -3160,34 +3214,41 @@ void Assembler::punpcklbw(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false)); + emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw); } void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false)); + emit_simd_arith(0x60, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_vlbw); } void Assembler::punpckldq(XMMRegister dst, Address src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_simd_arith(0x62, dst, src, VEX_SIMD_66); } void Assembler::punpckldq(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x62, dst, src, VEX_SIMD_66); } void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), 
"")); - emit_simd_arith(0x6C, dst, src, VEX_SIMD_66); + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x6C, dst, src, VEX_SIMD_66); + } else { + emit_simd_arith(0x6C, dst, src, VEX_SIMD_66); + } } void Assembler::push(int32_t imm32) { @@ -3396,8 +3457,8 @@ void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) { void Assembler::sqrtsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2); } else { emit_simd_arith(0x51, dst, src, VEX_SIMD_F2); @@ -3416,8 +3477,8 @@ void Assembler::std() { void Assembler::sqrtss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_simd_arith(0x51, dst, src, VEX_SIMD_F3); } @@ -3479,10 +3540,14 @@ void Assembler::subsd(XMMRegister dst, XMMRegister src) { void Assembler::subsd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + } + if (VM_Version::supports_evex()) { + emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2); + } else { + emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2); } - emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2); } void Assembler::subss(XMMRegister dst, XMMRegister src) { @@ -3493,8 +3558,8 @@ void Assembler::subss(XMMRegister dst, XMMRegister src) { void Assembler::subss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } 
emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3); } @@ -3553,9 +3618,9 @@ void Assembler::tzcntq(Register dst, Register src) { void Assembler::ucomisd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; - emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; + emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } else { emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); } @@ -3564,7 +3629,7 @@ void Assembler::ucomisd(XMMRegister dst, Address src) { void Assembler::ucomisd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true); + emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, /* no_mask_reg */ true); } else { emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66); } @@ -3573,15 +3638,15 @@ void Assembler::ucomisd(XMMRegister dst, XMMRegister src) { void Assembler::ucomiss(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true); } void Assembler::ucomiss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true); + emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ true); } void Assembler::xabort(int8_t imm8) { @@ -3664,8 +3729,8 @@ void Assembler::xorl(Register dst, Register src) { void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if 
(VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); } else { emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit); @@ -3684,8 +3749,8 @@ void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } @@ -3698,8 +3763,8 @@ void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); } else { emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit); @@ -3718,8 +3783,8 @@ void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } @@ -3732,8 +3797,8 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + 
_input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); } else { emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit); @@ -3752,8 +3817,8 @@ void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } @@ -3766,8 +3831,8 @@ void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); } else { emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit); @@ -3786,8 +3851,8 @@ void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit); } @@ -3802,6 +3867,7 @@ void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { // Float-point vector arithmetic void Assembler::addpd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66); @@ -3811,11 +3877,13 @@ void Assembler::addpd(XMMRegister dst, XMMRegister src) { } void 
Assembler::addps(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE); } void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3825,15 +3893,17 @@ void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve } void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3841,15 +3911,17 @@ void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector } void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::subpd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66); @@ -3859,11 +3931,13 @@ void 
Assembler::subpd(XMMRegister dst, XMMRegister src) { } void Assembler::subps(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE); } void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3873,15 +3947,17 @@ void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve } void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3889,15 +3965,17 @@ void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector } void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::mulpd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0x59, dst, src, 
VEX_SIMD_66); @@ -3907,11 +3985,13 @@ void Assembler::mulpd(XMMRegister dst, XMMRegister src) { } void Assembler::mulps(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE); } void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3921,15 +4001,17 @@ void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve } void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3937,15 +4019,17 @@ void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector } void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::divpd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { 
emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66); @@ -3955,11 +4039,13 @@ void Assembler::divpd(XMMRegister dst, XMMRegister src) { } void Assembler::divps(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE); } void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3969,15 +4055,17 @@ void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve } void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len); @@ -3985,15 +4073,17 @@ void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector } void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len); } void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), 
""); if (VM_Version::supports_evex()) { emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); @@ -4003,10 +4093,11 @@ void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) { } void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0x51, dst, xnoreg, src, VEX_SIMD_66, vector_len); @@ -4015,154 +4106,145 @@ void Assembler::vsqrtpd(XMMRegister dst, Address src, int vector_len) { void Assembler::andpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + if (VM_Version::supports_avx512dq()) { emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66); } else { - emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true); + emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::andps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false, - (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::andps(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } - emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, - false, (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void 
Assembler::andpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + if (VM_Version::supports_avx512dq()) { + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66); } else { - emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true); + emit_simd_arith(0x54, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + if (VM_Version::supports_avx512dq()) { emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len); } else { - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - bool legacy_mode = (VM_Version::supports_avx512dq() == false); - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + if (VM_Version::supports_avx512dq()) { + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len); } else { - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true); + emit_vex_arith(0x54, dst, nds, 
src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } - emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, - (VM_Version::supports_avx512dq() == false)); + emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::xorpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + if (VM_Version::supports_avx512dq()) { emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66); } else { - emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true); + emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::xorps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, - false, (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::xorpd(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + if (VM_Version::supports_avx512dq()) { + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66); } else { - emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true); + emit_simd_arith(0x57, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::xorps(XMMRegister dst, Address src) { 
NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } - emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false, - (VM_Version::supports_avx512dq() == false)); + emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { + if (VM_Version::supports_avx512dq()) { emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len); } else { - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, - (VM_Version::supports_avx512dq() == false)); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + if (VM_Version::supports_avx512dq()) { + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len); } else { - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ true); } } void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int 
vector_len) { assert(VM_Version::supports_avx(), ""); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } - emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, - (VM_Version::supports_avx512dq() == false)); + emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_dq); } // Integer vector arithmetic void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx() && (vector_len == 0) || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, - VEX_OPCODE_0F_38, true, false); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true); emit_int8(0x01); emit_int8((unsigned char)(0xC0 | encode)); } @@ -4170,28 +4252,29 @@ void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int v void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx() && (vector_len == 0) || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, - VEX_OPCODE_0F_38, true, false); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38, /* legacy_mode */ true); emit_int8(0x02); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::paddb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xFC, dst, src, VEX_SIMD_66); + emit_simd_arith(0xFC, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::paddw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - 
emit_simd_arith(0xFD, dst, src, VEX_SIMD_66); + emit_simd_arith(0xFD, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::paddd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xFE, dst, src, VEX_SIMD_66); } void Assembler::paddq(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66); @@ -4202,38 +4285,38 @@ void Assembler::paddq(XMMRegister dst, XMMRegister src) { void Assembler::phaddw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse3(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x01); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::phaddd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse3(), "")); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_38, false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_38, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x02); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, 
XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); @@ -4245,33 +4328,35 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len); + emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len); + emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - 
input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len); @@ -4280,20 +4365,22 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::psubb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xF8, dst, src, VEX_SIMD_66); + emit_simd_arith(0xF8, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::psubw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xF9, dst, src, VEX_SIMD_66); + emit_simd_arith(0xF9, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::psubd(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xFA, dst, src, VEX_SIMD_66); } void Assembler::psubq(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66); @@ -4304,22 +4391,22 @@ void Assembler::psubq(XMMRegister dst, XMMRegister src) { void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == 
false)); + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); @@ -4331,35 +4418,35 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg 
*/ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); } else { emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len); @@ -4368,28 +4455,27 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::pmullw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, - (VM_Version::supports_avx512bw() == false)); + emit_simd_arith(0xD5, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::pmulld(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; assert(VM_Version::supports_sse4_1(), ""); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, - false, VEX_OPCODE_0F_38); + /* no_mask_reg */ false, VEX_OPCODE_0F_38); emit_int8(0x40); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* 
legacy_mode */ _legacy_mode_bw); } void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38); emit_int8(0x40); emit_int8((unsigned char)(0xC0 | encode)); } @@ -4399,8 +4485,8 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int v int src_enc = src->encoding(); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, - VEX_OPCODE_0F_38, true, vector_len, false, false); + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ false); emit_int8(0x40); emit_int8((unsigned char)(0xC0 | encode)); } @@ -4408,22 +4494,23 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int v void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FVM; + _tuple_type = EVEX_FVM; } - emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len); + emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? 
nds->encoding() : 0; vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, - VEX_OPCODE_0F_38, false, vector_len); + VEX_OPCODE_0F_38, /* vex_w */ false, vector_len); emit_int8(0x40); emit_operand(dst, src); } @@ -4431,13 +4518,14 @@ void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vecto void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_64bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_64bit; } InstructionMark im(this); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len); + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, + VEX_OPCODE_0F_38, /* vex_w */ true, vector_len, /* legacy_mode */ _legacy_mode_dq); emit_int8(0x40); emit_operand(dst, src); } @@ -4446,26 +4534,28 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vecto void Assembler::psllw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 71 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, - false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F, + /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); } void Assembler::pslld(XMMRegister dst, int shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 72 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false); 
emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); } void Assembler::psllq(XMMRegister dst, int shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM6 is for /6 encoding: 66 0F 73 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true); + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, VEX_OPCODE_0F, /* rex_w */ true); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -4473,16 +4563,17 @@ void Assembler::psllq(XMMRegister dst, int shift) { void Assembler::psllw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false, - (VM_Version::supports_avx512bw() == false)); + emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::pslld(XMMRegister dst, XMMRegister shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66); } void Assembler::psllq(XMMRegister dst, XMMRegister shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66); @@ -4494,12 +4585,12 @@ void Assembler::psllq(XMMRegister dst, XMMRegister shift) { void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 71 /6 ib - emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); emit_int8(shift & 0xFF); } void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int 
vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 72 /6 ib emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len); @@ -4507,6 +4598,7 @@ void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_l } void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); // XMM6 is for /6 encoding: 66 0F 73 /6 ib if (VM_Version::supports_evex()) { @@ -4519,16 +4611,17 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len); } void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len); @@ -4541,33 +4634,31 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int void Assembler::psrlw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 71 /2 ib - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, - (VM_Version::supports_avx512bw() == false)); + int encode = simd_prefix_and_encode(xmm2, dst, dst, 
VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); } void Assembler::psrld(XMMRegister dst, int shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 72 /2 ib - int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false); + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false); emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); } void Assembler::psrlq(XMMRegister dst, int shift) { + _instruction_uses_vl = true; // Do not confuse it with psrldq SSE2 instruction which // shifts 128 bit value in xmm register by number of bytes. NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM2 is for /2 encoding: 66 0F 73 /2 ib - int encode = 0; - if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) { - encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false); - } else { - encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true); - } + int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F, /* rex_w */ VM_Version::supports_evex()); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -4575,16 +4666,17 @@ void Assembler::psrlq(XMMRegister dst, int shift) { void Assembler::psrlw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false, - (VM_Version::supports_avx512bw() == false)); + emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::psrld(XMMRegister dst, XMMRegister shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), 
"")); emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66); } void Assembler::psrlq(XMMRegister dst, XMMRegister shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66); @@ -4595,20 +4687,21 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister shift) { void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - // XMM2 is for /2 encoding: 66 0F 73 /2 ib - emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + // XMM2 is for /2 encoding: 66 0F 71 /2 ib + emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); emit_int8(shift & 0xFF); } void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); - // XMM2 is for /2 encoding: 66 0F 73 /2 ib + // XMM2 is for /2 encoding: 66 0F 72 /2 ib emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len); emit_int8(shift & 0xFF); } void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); // XMM2 is for /2 encoding: 66 0F 73 /2 ib if (VM_Version::supports_evex()) { @@ -4621,16 +4714,17 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsrld(XMMRegister dst, XMMRegister src, 
XMMRegister shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len); } void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len); @@ -4643,17 +4737,18 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int void Assembler::psraw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM4 is for /4 encoding: 66 0F 71 /4 ib - int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, - (VM_Version::supports_avx512bw() == false)); + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F, /* rex_w */ false, AVX_128bit, /* legacy_mode */ _legacy_mode_bw); emit_int8(0x71); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); } void Assembler::psrad(XMMRegister dst, int shift) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); // XMM4 is for /4 encoding: 66 0F 72 /4 ib - int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false); + int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, /* no_mask_reg */ false); emit_int8(0x72); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift & 0xFF); @@ -4661,11 +4756,11 @@ void Assembler::psrad(XMMRegister dst, int shift) { void Assembler::psraw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, - (VM_Version::supports_avx512bw() == false)); + emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::psrad(XMMRegister dst, XMMRegister shift) { + 
_instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66); } @@ -4673,12 +4768,12 @@ void Assembler::psrad(XMMRegister dst, XMMRegister shift) { void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); // XMM4 is for /4 encoding: 66 0F 71 /4 ib - emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); emit_int8(shift & 0xFF); } void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); // XMM4 is for /4 encoding: 66 0F 71 /4 ib emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len); @@ -4687,11 +4782,11 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, - (VM_Version::supports_avx512bw() == false)); + emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len, /* no_mask_reg */ false, /* legacy_mode */ _legacy_mode_bw); } void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len); } @@ -4704,53 +4799,61 @@ void Assembler::pand(XMMRegister dst, XMMRegister src) { } void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len); } void 
Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::por(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xEB, dst, src, VEX_SIMD_66); } void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::pxor(XMMRegister dst, XMMRegister src) { + _instruction_uses_vl = true; NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xEF, dst, src, VEX_SIMD_66); } void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len); } void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + _instruction_uses_vl = true; assert(UseAVX > 0, "requires some form of AVX"); if (VM_Version::supports_evex()) { - tuple_type = EVEX_FV; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_FV; + _input_size_in_bits = EVEX_32bit; } emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len); } 
@@ -4759,6 +4862,9 @@ void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_ void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx(), ""); int vector_len = AVX_256bit; + if (VM_Version::supports_evex()) { + vector_len = AVX_512bit; + } int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x18); emit_int8((unsigned char)(0xC0 | encode)); @@ -4773,8 +4879,8 @@ void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) int src_enc = src->encoding(); int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; - int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, - VEX_OPCODE_0F_3A, true, vector_len, false, false); + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x1A); emit_int8((unsigned char)(0xC0 | encode)); // 0x00 - insert into lower 256 bits @@ -4783,35 +4889,70 @@ void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) } void Assembler::vinsertf64x4h(XMMRegister dst, Address src) { - assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex()) { - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_64bit; - } + assert(VM_Version::supports_evex(), ""); + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_64bit; InstructionMark im(this); int vector_len = AVX_512bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ true, vector_len); emit_int8(0x1A); emit_operand(dst, src); // 0x01 - insert into upper 128 bits emit_int8(0x01); } -void Assembler::vinsertf128h(XMMRegister dst, Address 
src) { - assert(VM_Version::supports_avx(), ""); - if (VM_Version::supports_evex()) { - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_32bit; - } +void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) { + assert(VM_Version::supports_evex(), ""); + int vector_len = AVX_512bit; + int src_enc = src->encoding(); + int dst_enc = dst->encoding(); + int nds_enc = nds->is_valid() ? nds->encoding() : 0; + int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); + emit_int8(0x18); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - insert into q0 128 bits (0..127) + // 0x01 - insert into q1 128 bits (128..255) + // 0x02 - insert into q2 128 bits (256..383) + // 0x03 - insert into q3 128 bits (384..511) + emit_int8(value & 0x3); +} + +void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) { + assert(VM_Version::supports_evex(), ""); + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_32bit; InstructionMark im(this); - int vector_len = AVX_256bit; + int vector_len = AVX_512bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); + emit_int8(0x18); + emit_operand(dst, src); + // 0x00 - insert into q0 128 bits (0..127) + // 0x01 - insert into q1 128 bits (128..255) + // 0x02 - insert into q2 128 bits (256..383) + // 0x03 - insert into q3 128 bits (384..511) + emit_int8(value & 0x3); +} + +void Assembler::vinsertf128h(XMMRegister dst, Address src) { + assert(VM_Version::supports_avx(), ""); + int vector_len = AVX_256bit; + if (VM_Version::supports_evex()) { + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_32bit; + vector_len = AVX_512bit; + } + InstructionMark im(this); 
+ assert(dst != xnoreg, "sanity"); + int dst_enc = dst->encoding(); + // swap src<->dst for encoding + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); emit_int8(0x18); emit_operand(dst, src); // 0x01 - insert into upper 128 bits @@ -4821,6 +4962,9 @@ void Assembler::vinsertf128h(XMMRegister dst, Address src) { void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); int vector_len = AVX_256bit; + if (VM_Version::supports_evex()) { + vector_len = AVX_512bit; + } int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x19); emit_int8((unsigned char)(0xC0 | encode)); @@ -4831,15 +4975,16 @@ void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) { void Assembler::vextractf128h(Address dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); + int vector_len = AVX_256bit; if (VM_Version::supports_evex()) { - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_32bit; + vector_len = AVX_512bit; } InstructionMark im(this); - int vector_len = AVX_256bit; assert(src != xnoreg, "sanity"); int src_enc = src->encoding(); - vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); emit_int8(0x19); emit_operand(src, dst); // 0x01 - extract from upper 128 bits @@ -4849,6 +4994,9 @@ void Assembler::vextractf128h(Address dst, XMMRegister src) { void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); int vector_len = AVX_256bit; + if (VM_Version::supports_evex()) { + vector_len = AVX_512bit; + } int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x38); emit_int8((unsigned char)(0xC0 | encode)); @@ -4864,7 +5012,7 @@ void 
Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) int dst_enc = dst->encoding(); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - VM_Version::supports_avx512dq(), vector_len, false, false); + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_reg_mask */ false); emit_int8(0x38); emit_int8((unsigned char)(0xC0 | encode)); // 0x00 - insert into lower 256 bits @@ -4874,16 +5022,17 @@ void Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) void Assembler::vinserti128h(XMMRegister dst, Address src) { assert(VM_Version::supports_avx2(), ""); + int vector_len = AVX_256bit; if (VM_Version::supports_evex()) { - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_32bit; + vector_len = AVX_512bit; } InstructionMark im(this); - int vector_len = AVX_256bit; assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); emit_int8(0x38); emit_operand(dst, src); // 0x01 - insert into upper 128 bits @@ -4893,6 +5042,9 @@ void Assembler::vinserti128h(XMMRegister dst, Address src) { void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); int vector_len = AVX_256bit; + if (VM_Version::supports_evex()) { + vector_len = AVX_512bit; + } int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A); emit_int8(0x39); emit_int8((unsigned char)(0xC0 | encode)); @@ -4903,15 +5055,16 @@ void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) { void Assembler::vextracti128h(Address dst, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); + int vector_len = AVX_256bit; 
if (VM_Version::supports_evex()) { - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_32bit; + vector_len = AVX_512bit; } InstructionMark im(this); - int vector_len = AVX_256bit; assert(src != xnoreg, "sanity"); int src_enc = src->encoding(); - vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); emit_int8(0x39); emit_operand(src, dst); // 0x01 - extract from upper 128 bits @@ -4924,7 +5077,7 @@ void Assembler::vextracti64x4h(XMMRegister dst, XMMRegister src) { int src_enc = src->encoding(); int dst_enc = dst->encoding(); int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - true, vector_len, false, false); + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x3B); emit_int8((unsigned char)(0xC0 | encode)); // 0x01 - extract from upper 256 bits @@ -4936,8 +5089,14 @@ void Assembler::vextracti64x2h(XMMRegister dst, XMMRegister src, int value) { int vector_len = AVX_512bit; int src_enc = src->encoding(); int dst_enc = dst->encoding(); - int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - VM_Version::supports_avx512dq(), vector_len, false, false); + int encode; + if (VM_Version::supports_avx512dq()) { + encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); + } else { + encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + /* vex_w */ false, vector_len, /* legacy_mode */ true, /* no_mask_reg */ false); + } emit_int8(0x39); emit_int8((unsigned char)(0xC0 | encode)); // 0x01 - extract from bits 255:128 @@ -4952,7 +5111,7 @@ void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src) { int src_enc = src->encoding(); int dst_enc = 
dst->encoding(); int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - VM_Version::supports_avx512dq(), vector_len, false, false); + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x1B); emit_int8((unsigned char)(0xC0 | encode)); // 0x01 - extract from upper 256 bits @@ -4960,18 +5119,18 @@ void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src) { } void Assembler::vextractf64x4h(Address dst, XMMRegister src) { - assert(VM_Version::supports_avx2(), ""); - tuple_type = EVEX_T4; - input_size_in_bits = EVEX_64bit; + assert(VM_Version::supports_evex(), ""); + _tuple_type = EVEX_T4; + _input_size_in_bits = EVEX_64bit; InstructionMark im(this); int vector_len = AVX_512bit; assert(src != xnoreg, "sanity"); int src_enc = src->encoding(); vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - VM_Version::supports_avx512dq(), vector_len); + /* vex_w */ true, vector_len); emit_int8(0x1B); emit_operand(src, dst); - // 0x01 - extract from upper 128 bits + // 0x01 - extract from upper 256 bits emit_int8(0x01); } @@ -4980,10 +5139,29 @@ void Assembler::vextractf32x4h(XMMRegister dst, XMMRegister src, int value) { int vector_len = AVX_512bit; int src_enc = src->encoding(); int dst_enc = dst->encoding(); - int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, - VEX_OPCODE_0F_3A, false, vector_len, false, false); + int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, + /* vex_w */ false, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x19); emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - extract from bits 127:0 + // 0x01 - extract from bits 255:128 + // 0x02 - extract from bits 383:256 + // 0x03 - extract from bits 511:384 + emit_int8(value & 0x3); +} + +void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) { + assert(VM_Version::supports_evex(), ""); + _tuple_type = EVEX_T4; + 
_input_size_in_bits = EVEX_32bit; + InstructionMark im(this); + int vector_len = AVX_512bit; + assert(src != xnoreg, "sanity"); + int src_enc = src->encoding(); + vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, /* vex_w */ false, vector_len); + emit_int8(0x19); + emit_operand(src, dst); + // 0x00 - extract from bits 127:0 // 0x01 - extract from bits 255:128 // 0x02 - extract from bits 383:256 // 0x03 - extract from bits 511:384 @@ -4996,7 +5174,7 @@ void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) { int src_enc = src->encoding(); int dst_enc = dst->encoding(); int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, - VM_Version::supports_avx512dq(), vector_len, false, false); + /* vex_w */ !_legacy_mode_dq, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x19); emit_int8((unsigned char)(0xC0 | encode)); // 0x01 - extract from bits 255:128 @@ -5007,178 +5185,190 @@ void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) { // duplicate 4-bytes integer data from src into 8 locations in dest void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) { - assert(VM_Version::supports_avx2(), ""); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); int vector_len = AVX_256bit; - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38, false); + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38); emit_int8(0x58); emit_int8((unsigned char)(0xC0 | encode)); } // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = 
vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38); emit_int8(0x78); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_8bit; + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_8bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len); emit_int8(0x78); emit_operand(dst, src); } // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38); emit_int8(0x79); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_16bit; + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_16bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len); emit_int8(0x79); emit_operand(dst, src); 
} // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_38, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_38); emit_int8(0x58); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len); emit_int8(0x58); emit_operand(dst, src); } // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, true, vector_len, false, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /* no_mask_reg */ false); emit_int8(0x59); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = 
EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len); + vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len); emit_int8(0x59); emit_operand(dst, src); } // duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, false, vector_len, false, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x18); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len); + vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ false, vector_len); emit_int8(0x18); emit_operand(dst, src); } // duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), 
""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, true, vector_len, false, false); + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /*vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x19); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_64bit; + _instruction_uses_vl = true; + assert(UseAVX > 1, ""); + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_64bit; InstructionMark im(this); assert(dst != xnoreg, "sanity"); int dst_enc = dst->encoding(); // swap src<->dst for encoding - vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len); + vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, /* vex_w */ true, vector_len); emit_int8(0x19); emit_operand(dst, src); } // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, false, vector_len, false, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /*vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x7A); emit_int8((unsigned char)(0xC0 | encode)); } // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_evex(), ""); - int 
encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, false, vector_len, false, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x7B); emit_int8((unsigned char)(0xC0 | encode)); } // duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, false, vector_len, false, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ false, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x7C); emit_int8((unsigned char)(0xC0 | encode)); } // duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) { + _instruction_uses_vl = true; assert(VM_Version::supports_evex(), ""); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, - VEX_OPCODE_0F_38, true, vector_len, false, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, + /* vex_w */ true, vector_len, /* legacy_mode */ false, /*no_mask_reg */ false); emit_int8(0x7C); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5186,8 +5376,8 @@ void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) { // Carry-Less Multiplication Quadword void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { assert(VM_Version::supports_clmul(), ""); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, - VEX_OPCODE_0F_3A, 
false, AVX_128bit, true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, /* no_mask_reg */ false, + VEX_OPCODE_0F_3A, /* rex_w */ false, AVX_128bit, /* legacy_mode */ true); emit_int8(0x44); emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)mask); @@ -5197,8 +5387,7 @@ void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) { assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), ""); int vector_len = AVX_128bit; - int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, - vector_len, VEX_OPCODE_0F_3A, true); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A, /* legacy_mode */ true); emit_int8(0x44); emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)mask); @@ -5757,7 +5946,7 @@ void Assembler::evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool int vector_len, bool no_mask_reg ){ // EVEX 0x62 prefix prefix(EVEX_4bytes); - evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0); + _evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0); // P0: byte 2, initialized to RXBR`00mm // instead of not'd @@ -5796,10 +5985,10 @@ void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix bool vex_r = ((xreg_enc & 8) == 8) ? 
1 : 0; bool vex_b = adr.base_needs_rex(); bool vex_x = adr.index_needs_rex(); - avx_vector_len = vector_len; + _avx_vector_len = vector_len; - // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit - if (VM_Version::supports_avx512vl() == false) { + // if vector length is turned off, revert to AVX for vectors smaller than 512-bit + if (_legacy_mode_vl && _instruction_uses_vl) { switch (vector_len) { case AVX_128bit: case AVX_256bit: @@ -5812,11 +6001,12 @@ void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix { bool evex_r = (xreg_enc >= 16); bool evex_v = (nds_enc >= 16); - is_evex_instruction = true; + _is_evex_instruction = true; evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg); } else { vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len); } + _instruction_uses_vl = false; } int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, @@ -5824,10 +6014,10 @@ int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexS bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0; bool vex_b = ((src_enc & 8) == 8) ? 
1 : 0; bool vex_x = false; - avx_vector_len = vector_len; + _avx_vector_len = vector_len; - // if vector length is turned off, revert to AVX for vectors smaller than AVX_512bit - if (VM_Version::supports_avx512vl() == false) { + // if vector length is turned off, revert to AVX for vectors smaller than 512-bit + if (_legacy_mode_vl && _instruction_uses_vl) { switch (vector_len) { case AVX_128bit: case AVX_256bit: @@ -5847,6 +6037,8 @@ int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexS vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len); } + _instruction_uses_vl = false; + // return modrm byte components for operands return (((dst_enc & 7) << 3) | (src_enc & 7)); } @@ -5935,13 +6127,13 @@ void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src } void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) { - int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit); + int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) { - int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit); + int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } @@ -5965,7 +6157,7 @@ void Assembler::emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) { - int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, 
VEX_OPCODE_0F, false, no_mask_reg); + int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, legacy_mode, no_mask_reg); emit_int8(opcode); emit_int8((unsigned char)(0xC0 | encode)); } @@ -6614,7 +6806,7 @@ void Assembler::cmpxchgq(Register reg, Address adr) { void Assembler::cvtsi2sdq(XMMRegister dst, Register src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true); + int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true); emit_int8(0x2A); emit_int8((unsigned char)(0xC0 | encode)); } @@ -6622,11 +6814,11 @@ void Assembler::cvtsi2sdq(XMMRegister dst, Register src) { void Assembler::cvtsi2sdq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true); + simd_prefix_q(dst, dst, src, VEX_SIMD_F2, /* no_mask_reg */ true); emit_int8(0x2A); emit_operand(dst, src); } @@ -6634,25 +6826,25 @@ void Assembler::cvtsi2sdq(XMMRegister dst, Address src) { void Assembler::cvtsi2ssq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { - tuple_type = EVEX_T1S; - input_size_in_bits = EVEX_32bit; + _tuple_type = EVEX_T1S; + _input_size_in_bits = EVEX_32bit; } InstructionMark im(this); - simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true); + simd_prefix_q(dst, dst, src, VEX_SIMD_F3, /* no_mask_reg */ true); emit_int8(0x2A); emit_operand(dst, src); } void Assembler::cvttsd2siq(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, /* no_mask_reg */ 
true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::cvttss2siq(Register dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, /* no_mask_reg */ true); emit_int8(0x2C); emit_int8((unsigned char)(0xC0 | encode)); } @@ -6688,6 +6880,13 @@ void Assembler::fxrstor(Address src) { emit_operand(as_Register(1), src); } +void Assembler::xrstor(Address src) { + prefixq(src); + emit_int8(0x0F); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(5), src); +} + void Assembler::fxsave(Address dst) { prefixq(dst); emit_int8(0x0F); @@ -6695,6 +6894,13 @@ void Assembler::fxsave(Address dst) { emit_operand(as_Register(0), dst); } +void Assembler::xsave(Address dst) { + prefixq(dst); + emit_int8(0x0F); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(4), dst); +} + void Assembler::idivq(Register src) { int encode = prefixq_and_encode(src->encoding()); emit_int8((unsigned char)0xF7); @@ -6821,7 +7027,7 @@ void Assembler::lzcntq(Register dst, Register src) { void Assembler::movdq(XMMRegister dst, Register src) { // table D-1 says MMX/SSE2 NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true); + int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, /* no_mask_reg */ true); emit_int8(0x6E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -6830,7 +7036,7 @@ void Assembler::movdq(Register dst, XMMRegister src) { // table D-1 says MMX/SSE2 NOT_LP64(assert(VM_Version::supports_sse2(), "")); // swap src/dst to get correct prefix - int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true); + int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, /* no_mask_reg */ true); emit_int8(0x7E); emit_int8((unsigned char)(0xC0 | encode)); } @@ -6963,8 +7169,8 @@ void 
Assembler::mulq(Register src) { void Assembler::mulxq(Register dst1, Register dst2, Register src) { assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); - int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), - VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false); + int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, + /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false); emit_int8((unsigned char)0xF6); emit_int8((unsigned char)(0xC0 | encode)); } @@ -7126,8 +7332,8 @@ void Assembler::rorq(Register dst, int imm8) { void Assembler::rorxq(Register dst, Register src, int imm8) { assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, - VEX_OPCODE_0F_3A, true, AVX_128bit, true, false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, + /* vex_w */ true, AVX_128bit, /* legacy_mode */ true, /* no_mask_reg */ false); emit_int8((unsigned char)0xF0); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index 2d5e7f8c495..3b3781c58f5 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -438,7 +438,9 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC { }; -const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize); +// 64-bit refect the fxsave size which is 512 bytes and the new xsave area on EVEX which is another 2176 bytes +// See fxsave and xsave(EVEX enabled) documentation for layout +const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize); // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction // level (e.g. 
mov rax, 0 is not translated into xor rax, rax!); i.e., what you write @@ -594,11 +596,16 @@ class Assembler : public AbstractAssembler { private: - int evex_encoding; - int input_size_in_bits; - int avx_vector_len; - int tuple_type; - bool is_evex_instruction; + int _evex_encoding; + int _input_size_in_bits; + int _avx_vector_len; + int _tuple_type; + bool _is_evex_instruction; + bool _legacy_mode_bw; + bool _legacy_mode_dq; + bool _legacy_mode_vl; + bool _legacy_mode_vlbw; + bool _instruction_uses_vl; // 64bit prefixes int prefix_and_encode(int reg_enc, bool byteinst = false); @@ -972,11 +979,16 @@ private: // belong in macro assembler but there is no need for both varieties to exist void init_attributes(void) { - evex_encoding = 0; - input_size_in_bits = 0; - avx_vector_len = AVX_NoVec; - tuple_type = EVEX_ETUP; - is_evex_instruction = false; + _evex_encoding = 0; + _input_size_in_bits = 0; + _avx_vector_len = AVX_NoVec; + _tuple_type = EVEX_ETUP; + _is_evex_instruction = false; + _legacy_mode_bw = (VM_Version::supports_avx512bw() == false); + _legacy_mode_dq = (VM_Version::supports_avx512dq() == false); + _legacy_mode_vl = (VM_Version::supports_avx512vl() == false); + _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false); + _instruction_uses_vl = false; } void lea(Register dst, Address src); @@ -1344,8 +1356,10 @@ private: void fxch(int i = 1); void fxrstor(Address src); + void xrstor(Address src); void fxsave(Address dst); + void xsave(Address dst); void fyl2x(); void frndint(); @@ -1479,11 +1493,12 @@ private: void movb(Address dst, int imm8); void movb(Register dst, Address src); - void kmovq(KRegister dst, KRegister src); + void kmovql(KRegister dst, KRegister src); void kmovql(KRegister dst, Register src); void kmovdl(KRegister dst, Register src); - void kmovq(Address dst, KRegister src); - void kmovq(KRegister dst, Address src); + void kmovwl(KRegister dst, Register src); + void kmovql(Address dst, KRegister src); + void kmovql(KRegister dst, 
Address src); void movdl(XMMRegister dst, Register src); void movdl(Register dst, XMMRegister src); @@ -1509,9 +1524,12 @@ private: void vmovdqu(XMMRegister dst, XMMRegister src); // Move Unaligned 512bit Vector - void evmovdqu(Address dst, XMMRegister src, int vector_len); - void evmovdqu(XMMRegister dst, Address src, int vector_len); - void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len); + void evmovdqul(Address dst, XMMRegister src, int vector_len); + void evmovdqul(XMMRegister dst, Address src, int vector_len); + void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len); + void evmovdquq(Address dst, XMMRegister src, int vector_len); + void evmovdquq(XMMRegister dst, Address src, int vector_len); + void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len); // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); @@ -1643,6 +1661,7 @@ private: // Pemutation of 64bit words void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); + void vpermq(XMMRegister dst, XMMRegister src, int imm8); void pause(); @@ -2061,6 +2080,9 @@ private: void vextracti64x2h(XMMRegister dst, XMMRegister src, int value); void vextractf64x2h(XMMRegister dst, XMMRegister src, int value); void vextractf32x4h(XMMRegister dst, XMMRegister src, int value); + void vextractf32x4h(Address dst, XMMRegister src, int value); + void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value); + void vinsertf32x4h(XMMRegister dst, Address src, int value); // duplicate 4-bytes integer data from src into 8 locations in dest void vpbroadcastd(XMMRegister dst, XMMRegister src); diff --git a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp index d43a14be1bf..ff07711acc0 100644 --- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp @@ -3798,16 +3798,24 @@ void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) { 
if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) { __ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg()); } - __ xorps(dest->as_xmm_float_reg(), - ExternalAddress((address)float_signflip_pool)); - + if (UseAVX > 1) { + __ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(), + ExternalAddress((address)float_signflip_pool)); + } else { + __ xorps(dest->as_xmm_float_reg(), + ExternalAddress((address)float_signflip_pool)); + } } else if (dest->is_double_xmm()) { if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) { __ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg()); } - __ xorpd(dest->as_xmm_double_reg(), - ExternalAddress((address)double_signflip_pool)); - + if (UseAVX > 1) { + __ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(), + ExternalAddress((address)double_signflip_pool)); + } else { + __ xorpd(dest->as_xmm_double_reg(), + ExternalAddress((address)double_signflip_pool)); + } } else if (left->is_single_fpu() || left->is_double_fpu()) { assert(left->fpu() == 0, "arg must be on TOS"); assert(dest->fpu() == 0, "dest must be TOS"); diff --git a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp index 7818ddd2c95..02c0bed270d 100644 --- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp +++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp @@ -401,11 +401,9 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args, } else if (UseSSE == 1) { int xmm_off = xmm_regs_as_doubles_off; - for (int n = 0; n < FrameMap::nof_xmm_regs; n++) { - if (n < xmm_bypass_limit) { - VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); - map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); - } + for (int n = 0; n < FrameMap::nof_fpu_regs; n++) { + VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg(); + map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0); xmm_off += 2; } assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm 
registers"); @@ -452,14 +450,11 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args, __ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size)); // Save the FPU registers in de-opt-able form - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48)); - __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56)); + int offset = 0; + for (int n = 0; n < FrameMap::nof_fpu_regs; n++) { + __ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset)); + offset += 8; + } } if (UseSSE >= 2) { @@ -468,52 +463,26 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args, // so always save them as doubles. // note that float values are _not_ converted automatically, so for float values // the second word contains only garbage data. 
- __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7); + int xmm_bypass_limit = FrameMap::nof_xmm_regs; + int offset = 0; #ifdef _LP64 - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15); - if (UseAVX > 2) { - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19); - __ movdbl(Address(rsp, 
xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30); - __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31); + if (UseAVX < 3) { + xmm_bypass_limit = xmm_bypass_limit / 2; + } +#endif + for (int n = 0; n < xmm_bypass_limit; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name); + offset += 8; } -#endif // _LP64 } else if (UseSSE == 1) { - // save XMM registers as float because double not supported without SSE2 - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4); - __ movflt(Address(rsp, 
xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6); - __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7); + // save XMM registers as float because double not supported without SSE2(num MMX == num fpu) + int offset = 0; + for (int n = 0; n < FrameMap::nof_fpu_regs; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + __ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name); + offset += 8; + } } } @@ -528,52 +497,26 @@ static void restore_fpu(StubAssembler* sasm, bool restore_fpu_registers = true) if (restore_fpu_registers) { if (UseSSE >= 2) { // restore XMM registers - __ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0)); - __ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8)); - __ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16)); - __ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24)); - __ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32)); - __ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40)); - __ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48)); - __ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56)); + int xmm_bypass_limit = FrameMap::nof_xmm_regs; #ifdef _LP64 - __ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64)); - __ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72)); - __ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80)); - __ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88)); - __ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * 
VMRegImpl::stack_slot_size + 96)); - __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104)); - __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112)); - __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120)); - if (UseAVX > 2) { - __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128)); - __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136)); - __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144)); - __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152)); - __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160)); - __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168)); - __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176)); - __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184)); - __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192)); - __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200)); - __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208)); - __ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216)); - __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224)); - __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232)); - __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240)); - __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248)); + if (UseAVX < 3) { + xmm_bypass_limit = xmm_bypass_limit / 2; + } +#endif + int offset = 0; + for (int n = 0; n < xmm_bypass_limit; n++) { + XMMRegister 
xmm_name = as_XMMRegister(n); + __ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset)); + offset += 8; } -#endif // _LP64 } else if (UseSSE == 1) { - // restore XMM registers - __ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0)); - __ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8)); - __ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16)); - __ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24)); - __ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32)); - __ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40)); - __ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48)); - __ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56)); + // restore XMM registers(num MMX == num fpu) + int offset = 0; + for (int n = 0; n < FrameMap::nof_fpu_regs; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + __ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset)); + offset += 8; + } } if (UseSSE < 2) { diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp index 6065d192220..eb6e422a854 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -3751,8 +3751,31 @@ void MacroAssembler::pop_CPU_state() { } void MacroAssembler::pop_FPU_state() { - NOT_LP64(frstor(Address(rsp, 0));) - LP64_ONLY(fxrstor(Address(rsp, 0));) +#ifndef _LP64 + frstor(Address(rsp, 0)); +#else + // AVX will continue to use the fxsave area. + // EVEX needs to utilize the xsave area, which is under different + // management. + if(VM_Version::supports_evex()) { + // EDX:EAX describe the XSAVE header and + // are obtained while fetching info for XCR0 via cpuid. 
+ // These two registers make up 64-bits in the header for which bits + // 62:10 are currently reserved for future implementations and unused. Bit 63 + // is unused for our implementation as we do not utilize + // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use + // the functionality for PKRU state and MSR tracing. + // Ergo we are primarily concerned with bits 7..0, which define + // which ISA extensions and features are enabled for a given machine and are + // defined in XemXcr0Eax and is used to map the XSAVE area + // for restoring registers as described via XCR0. + movl(rdx,VM_Version::get_xsave_header_upper_segment()); + movl(rax,VM_Version::get_xsave_header_lower_segment()); + xrstor(Address(rsp, 0)); + } else { + fxrstor(Address(rsp, 0)); + } +#endif addptr(rsp, FPUStateSizeInWords * wordSize); } @@ -3769,13 +3792,49 @@ void MacroAssembler::push_CPU_state() { push_FPU_state(); } +#ifdef _LP64 +#define XSTATE_BV 0x200 +#endif + void MacroAssembler::push_FPU_state() { subptr(rsp, FPUStateSizeInWords * wordSize); #ifndef _LP64 fnsave(Address(rsp, 0)); fwait(); #else - fxsave(Address(rsp, 0)); + // AVX will continue to use the fxsave area. + // EVEX needs to utilize the xsave area, which is under different + // management. + if(VM_Version::supports_evex()) { + // Save a copy of EAX and EDX + push(rax); + push(rdx); + // EDX:EAX describe the XSAVE header and + // are obtained while fetching info for XCR0 via cpuid. + // These two registers make up 64-bits in the header for which bits + // 62:10 are currently reserved for future implementations and unused. Bit 63 + // is unused for our implementation as we do not utilize + // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use + // the functionality for PKRU state and MSR tracing. 
+ // Ergo we are primarily concerned with bits 7..0, which define + // which ISA extensions and features are enabled for a given machine and are + // defined in XemXcr0Eax and is used to program XSAVE area + // for saving the required registers as defined in XCR0. + int xcr0_edx = VM_Version::get_xsave_header_upper_segment(); + int xcr0_eax = VM_Version::get_xsave_header_lower_segment(); + movl(rdx,xcr0_edx); + movl(rax,xcr0_eax); + xsave(Address(rsp, wordSize*2)); + // now Apply control bits and clear bytes 8..23 in the header + pop(rdx); + pop(rax); + movl(Address(rsp, XSTATE_BV), xcr0_eax); + movl(Address(rsp, XSTATE_BV+4), xcr0_edx); + andq(Address(rsp, XSTATE_BV+8), 0); + andq(Address(rsp, XSTATE_BV+16), 0); + } else { + fxsave(Address(rsp, 0)); + } #endif // LP64 } @@ -4082,6 +4141,84 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src } } +void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) { + int nds_enc = nds->encoding(); + int dst_enc = dst->encoding(); + bool dst_upper_bank = (dst_enc > 15); + bool nds_upper_bank = (nds_enc > 15); + if (VM_Version::supports_avx512novl() && + (nds_upper_bank || dst_upper_bank)) { + if (dst_upper_bank) { + subptr(rsp, 64); + evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit); + movflt(xmm0, nds); + if (reachable(src)) { + vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit); + } + movflt(dst, xmm0); + evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit); + addptr(rsp, 64); + } else { + movflt(dst, nds); + if (reachable(src)) { + vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit); + } + } + } else { + if (reachable(src)) { + vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorps(dst, nds, 
Address(rscratch1, 0), Assembler::AVX_128bit); + } + } +} + +void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { + int nds_enc = nds->encoding(); + int dst_enc = dst->encoding(); + bool dst_upper_bank = (dst_enc > 15); + bool nds_upper_bank = (nds_enc > 15); + if (VM_Version::supports_avx512novl() && + (nds_upper_bank || dst_upper_bank)) { + if (dst_upper_bank) { + subptr(rsp, 64); + evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit); + movdbl(xmm0, nds); + if (reachable(src)) { + vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit); + } + movdbl(dst, xmm0); + evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit); + addptr(rsp, 64); + } else { + movdbl(dst, nds); + if (reachable(src)) { + vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit); + } + } + } else { + if (reachable(src)) { + vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit); + } else { + lea(rscratch1, src); + vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit); + } + } +} + void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { if (reachable(src)) { vxorpd(dst, nds, as_Address(src), vector_len); @@ -4318,7 +4455,6 @@ void MacroAssembler::store_check(Register obj, Address dst) { void MacroAssembler::store_check(Register obj) { // Does a store check for the oop in register obj. The content of // register obj is destroyed afterwards. 
- BarrierSet* bs = Universe::heap()->barrier_set(); assert(bs->kind() == BarrierSet::CardTableForRS || bs->kind() == BarrierSet::CardTableExtension, @@ -4572,69 +4708,58 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int // if we are coming from c1, xmm registers may be live int off = 0; + int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8); + if (UseAVX > 2) { + num_xmm_regs = LP64_ONLY(32) NOT_LP64(8); + } + if (UseSSE == 1) { subptr(rsp, sizeof(jdouble)*8); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm0); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm1); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm2); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm3); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm4); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm5); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm6); - movflt(Address(rsp,off++*sizeof(jdouble)),xmm7); + for (int n = 0; n < 8; n++) { + movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n)); + } } else if (UseSSE >= 2) { if (UseAVX > 2) { + push(rbx); movl(rbx, 0xffff); -#ifdef _LP64 - kmovql(k1, rbx); -#else - kmovdl(k1, rbx); -#endif + kmovwl(k1, rbx); + pop(rbx); } #ifdef COMPILER2 if (MaxVectorSize > 16) { - assert(UseAVX > 0, "256bit vectors are supported only with AVX"); + if(UseAVX > 2) { + // Save upper half of ZMM registes + subptr(rsp, 32*num_xmm_regs); + for (int n = 0; n < num_xmm_regs; n++) { + vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n)); + } + off = 0; + } + assert(UseAVX > 0, "256 bit vectors are supported only with AVX"); // Save upper half of YMM registes - subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); - vextractf128h(Address(rsp, 0),xmm0); - vextractf128h(Address(rsp, 16),xmm1); - vextractf128h(Address(rsp, 32),xmm2); - vextractf128h(Address(rsp, 48),xmm3); - vextractf128h(Address(rsp, 64),xmm4); - vextractf128h(Address(rsp, 80),xmm5); - vextractf128h(Address(rsp, 96),xmm6); - vextractf128h(Address(rsp,112),xmm7); -#ifdef _LP64 - 
vextractf128h(Address(rsp,128),xmm8); - vextractf128h(Address(rsp,144),xmm9); - vextractf128h(Address(rsp,160),xmm10); - vextractf128h(Address(rsp,176),xmm11); - vextractf128h(Address(rsp,192),xmm12); - vextractf128h(Address(rsp,208),xmm13); - vextractf128h(Address(rsp,224),xmm14); - vextractf128h(Address(rsp,240),xmm15); -#endif + subptr(rsp, 16*num_xmm_regs); + for (int n = 0; n < num_xmm_regs; n++) { + vextractf128h(Address(rsp, off++*16), as_XMMRegister(n)); + } } #endif - // Save whole 128bit (16 bytes) XMM regiters - subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); - movdqu(Address(rsp,off++*16),xmm0); - movdqu(Address(rsp,off++*16),xmm1); - movdqu(Address(rsp,off++*16),xmm2); - movdqu(Address(rsp,off++*16),xmm3); - movdqu(Address(rsp,off++*16),xmm4); - movdqu(Address(rsp,off++*16),xmm5); - movdqu(Address(rsp,off++*16),xmm6); - movdqu(Address(rsp,off++*16),xmm7); + // Save whole 128bit (16 bytes) XMM registers + subptr(rsp, 16*num_xmm_regs); + off = 0; #ifdef _LP64 - movdqu(Address(rsp,off++*16),xmm8); - movdqu(Address(rsp,off++*16),xmm9); - movdqu(Address(rsp,off++*16),xmm10); - movdqu(Address(rsp,off++*16),xmm11); - movdqu(Address(rsp,off++*16),xmm12); - movdqu(Address(rsp,off++*16),xmm13); - movdqu(Address(rsp,off++*16),xmm14); - movdqu(Address(rsp,off++*16),xmm15); + if (VM_Version::supports_avx512novl()) { + for (int n = 0; n < num_xmm_regs; n++) { + vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0); + } + } else { + for (int n = 0; n < num_xmm_regs; n++) { + movdqu(Address(rsp, off++*16), as_XMMRegister(n)); + } + } +#else + for (int n = 0; n < num_xmm_regs; n++) { + movdqu(Address(rsp, off++*16), as_XMMRegister(n)); + } #endif } @@ -4689,7 +4814,7 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int movsd(Address(rsp, 0), xmm0); fld_d(Address(rsp, 0)); #endif // _LP64 - addptr(rsp, sizeof(jdouble) * nb_args); + addptr(rsp, sizeof(jdouble)*nb_args); if (num_fpu_regs_in_use > 1) { // Must save return value to stack 
and then restore entire FPU // stack except incoming arguments @@ -4699,63 +4824,50 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int addptr(rsp, sizeof(jdouble)); } fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble))); - addptr(rsp, sizeof(jdouble) * nb_args); + addptr(rsp, sizeof(jdouble)*nb_args); } off = 0; if (UseSSE == 1) { - movflt(xmm0, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm1, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm2, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm3, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm4, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm5, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm6, Address(rsp,off++*sizeof(jdouble))); - movflt(xmm7, Address(rsp,off++*sizeof(jdouble))); + for (int n = 0; n < 8; n++) { + movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble))); + } addptr(rsp, sizeof(jdouble)*8); } else if (UseSSE >= 2) { // Restore whole 128bit (16 bytes) XMM regiters - movdqu(xmm0, Address(rsp,off++*16)); - movdqu(xmm1, Address(rsp,off++*16)); - movdqu(xmm2, Address(rsp,off++*16)); - movdqu(xmm3, Address(rsp,off++*16)); - movdqu(xmm4, Address(rsp,off++*16)); - movdqu(xmm5, Address(rsp,off++*16)); - movdqu(xmm6, Address(rsp,off++*16)); - movdqu(xmm7, Address(rsp,off++*16)); #ifdef _LP64 - movdqu(xmm8, Address(rsp,off++*16)); - movdqu(xmm9, Address(rsp,off++*16)); - movdqu(xmm10, Address(rsp,off++*16)); - movdqu(xmm11, Address(rsp,off++*16)); - movdqu(xmm12, Address(rsp,off++*16)); - movdqu(xmm13, Address(rsp,off++*16)); - movdqu(xmm14, Address(rsp,off++*16)); - movdqu(xmm15, Address(rsp,off++*16)); + if (VM_Version::supports_avx512novl()) { + for (int n = 0; n < num_xmm_regs; n++) { + vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0); + } + } + else { + for (int n = 0; n < num_xmm_regs; n++) { + movdqu(as_XMMRegister(n), Address(rsp, off++*16)); + } + } +#else + for (int n = 0; n < num_xmm_regs; n++) { + movdqu(as_XMMRegister(n), Address(rsp, off++ * 
16)); + } #endif - addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); + addptr(rsp, 16*num_xmm_regs); + #ifdef COMPILER2 if (MaxVectorSize > 16) { // Restore upper half of YMM registes. - vinsertf128h(xmm0, Address(rsp, 0)); - vinsertf128h(xmm1, Address(rsp, 16)); - vinsertf128h(xmm2, Address(rsp, 32)); - vinsertf128h(xmm3, Address(rsp, 48)); - vinsertf128h(xmm4, Address(rsp, 64)); - vinsertf128h(xmm5, Address(rsp, 80)); - vinsertf128h(xmm6, Address(rsp, 96)); - vinsertf128h(xmm7, Address(rsp,112)); -#ifdef _LP64 - vinsertf128h(xmm8, Address(rsp,128)); - vinsertf128h(xmm9, Address(rsp,144)); - vinsertf128h(xmm10, Address(rsp,160)); - vinsertf128h(xmm11, Address(rsp,176)); - vinsertf128h(xmm12, Address(rsp,192)); - vinsertf128h(xmm13, Address(rsp,208)); - vinsertf128h(xmm14, Address(rsp,224)); - vinsertf128h(xmm15, Address(rsp,240)); -#endif - addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16)); + } + addptr(rsp, 16*num_xmm_regs); + if(UseAVX > 2) { + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32)); + } + addptr(rsp, 32*num_xmm_regs); + } } #endif } @@ -7095,11 +7207,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; if (UseAVX > 2) { movl(rtmp, 0xffff); -#ifdef _LP64 - kmovql(k1, rtmp); -#else - kmovdl(k1, rtmp); -#endif + kmovwl(k1, rtmp); } movdl(xtmp, value); if (UseAVX > 2 && UseUnalignedLoadStores) { @@ -7112,7 +7220,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, align(16); BIND(L_fill_64_bytes_loop); - evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit); + evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); addptr(to, 64); subl(count, 16 << shift); jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); @@ -7120,7 +7228,7 @@ void MacroAssembler::generate_fill(BasicType t, bool 
aligned, BIND(L_check_fill_32_bytes); addl(count, 8 << shift); jccb(Assembler::less, L_check_fill_8_bytes); - evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit); + evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit); addptr(to, 32); subl(count, 8 << shift); @@ -8399,6 +8507,14 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + movl(tmp, 0xffff); + kmovwl(k1, tmp); + } + lea(table, ExternalAddress(StubRoutines::crc_table_addr())); notl(crc); // ~crc cmpl(len, 16); diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp index df06cffa662..1a2fa3c163a 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp @@ -1069,6 +1069,9 @@ public: void vsubss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vsubss(dst, nds, src); } void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src); + void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src); + // AVX Vector instructions void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } diff --git a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp index 397d643f639..80f2873c04f 100644 --- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp @@ -115,6 +115,7 @@ class RegisterSaver { 
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool verify_fpu, bool save_vectors) { int vect_words = 0; + int num_xmm_regs = XMMRegisterImpl::number_of_registers; #ifdef COMPILER2 if (save_vectors) { assert(UseAVX > 0, "512bit vectors are supported only with EVEX"); @@ -173,59 +174,50 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); } + int off = st0_off; + int delta = st1_off - off; + // Save the FPU registers in de-opt-able form + for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) { + __ fstp_d(Address(rsp, off*wordSize)); + off += delta; + } - __ fstp_d(Address(rsp, st0_off*wordSize)); // st(0) - __ fstp_d(Address(rsp, st1_off*wordSize)); // st(1) - __ fstp_d(Address(rsp, st2_off*wordSize)); // st(2) - __ fstp_d(Address(rsp, st3_off*wordSize)); // st(3) - __ fstp_d(Address(rsp, st4_off*wordSize)); // st(4) - __ fstp_d(Address(rsp, st5_off*wordSize)); // st(5) - __ fstp_d(Address(rsp, st6_off*wordSize)); // st(6) - __ fstp_d(Address(rsp, st7_off*wordSize)); // st(7) - - if( UseSSE == 1 ) { // Save the XMM state - __ movflt(Address(rsp,xmm0_off*wordSize),xmm0); - __ movflt(Address(rsp,xmm1_off*wordSize),xmm1); - __ movflt(Address(rsp,xmm2_off*wordSize),xmm2); - __ movflt(Address(rsp,xmm3_off*wordSize),xmm3); - __ movflt(Address(rsp,xmm4_off*wordSize),xmm4); - __ movflt(Address(rsp,xmm5_off*wordSize),xmm5); - __ movflt(Address(rsp,xmm6_off*wordSize),xmm6); - __ movflt(Address(rsp,xmm7_off*wordSize),xmm7); - } else if( UseSSE >= 2 ) { + off = xmm0_off; + delta = xmm1_off - off; + if(UseSSE == 1) { // Save the XMM state + for (int n = 0; n < num_xmm_regs; n++) { + __ movflt(Address(rsp, off*wordSize), as_XMMRegister(n)); + off += delta; + } + } else if(UseSSE >= 2) { // Save whole 128bit (16 bytes) XMM regiters - __ movdqu(Address(rsp,xmm0_off*wordSize),xmm0); - __ 
movdqu(Address(rsp,xmm1_off*wordSize),xmm1); - __ movdqu(Address(rsp,xmm2_off*wordSize),xmm2); - __ movdqu(Address(rsp,xmm3_off*wordSize),xmm3); - __ movdqu(Address(rsp,xmm4_off*wordSize),xmm4); - __ movdqu(Address(rsp,xmm5_off*wordSize),xmm5); - __ movdqu(Address(rsp,xmm6_off*wordSize),xmm6); - __ movdqu(Address(rsp,xmm7_off*wordSize),xmm7); + if (VM_Version::supports_avx512novl()) { + for (int n = 0; n < num_xmm_regs; n++) { + __ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0); + off += delta; + } + } else { + for (int n = 0; n < num_xmm_regs; n++) { + __ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n)); + off += delta; + } + } } if (vect_words > 0) { assert(vect_words*wordSize == 128, ""); __ subptr(rsp, 128); // Save upper half of YMM registes - __ vextractf128h(Address(rsp, 0),xmm0); - __ vextractf128h(Address(rsp, 16),xmm1); - __ vextractf128h(Address(rsp, 32),xmm2); - __ vextractf128h(Address(rsp, 48),xmm3); - __ vextractf128h(Address(rsp, 64),xmm4); - __ vextractf128h(Address(rsp, 80),xmm5); - __ vextractf128h(Address(rsp, 96),xmm6); - __ vextractf128h(Address(rsp,112),xmm7); + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n)); + } if (UseAVX > 2) { __ subptr(rsp, 256); // Save upper half of ZMM registes - __ vextractf64x4h(Address(rsp, 0), xmm0); - __ vextractf64x4h(Address(rsp, 32), xmm1); - __ vextractf64x4h(Address(rsp, 64), xmm2); - __ vextractf64x4h(Address(rsp, 96), xmm3); - __ vextractf64x4h(Address(rsp, 128), xmm4); - __ vextractf64x4h(Address(rsp, 160), xmm5); - __ vextractf64x4h(Address(rsp, 192), xmm6); - __ vextractf64x4h(Address(rsp, 224), xmm7); + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + __ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n)); + } } } @@ -238,58 +230,40 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ OopMap* map = new OopMap( frame_words, 0 ); #define STACK_OFFSET(x) 
VMRegImpl::stack2reg((x) + additional_frame_words) - - map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg()); - map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg()); - map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg()); - map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg()); - // rbp, location is known implicitly, no oopMap - map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg()); - map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg()); - // %%% This is really a waste but we'll keep things as they were for now - if (true) { #define NEXTREG(x) (x)->as_VMReg()->next() - map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0))); - map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1))); - 
map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2))); - map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3))); - map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4))); - map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5))); - map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6))); - map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7))); - map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0)); - map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1)); - map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2)); - map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3)); - map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4)); - map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5)); - map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6)); - map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7)); + + map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg()); + // rbp, location is known implicitly, no oopMap + map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg()); + // %%% This is really a waste but we'll keep things as they were for now for the upper component + off = st0_off; + delta = st1_off - off; + for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) { + FloatRegister freg_name = as_FloatRegister(n); + map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name)); + off += delta; + } + off = xmm0_off; + delta = xmm1_off - off; + for (int n = 0; n < num_xmm_regs; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + 
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); + map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name)); + off += delta; + } #undef NEXTREG #undef STACK_OFFSET - } return map; - } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { + int num_xmm_regs = XMMRegisterImpl::number_of_registers; // Recover XMM & FPU state int additional_frame_bytes = 0; #ifdef COMPILER2 @@ -301,52 +275,43 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve #else assert(!restore_vectors, "vectors are generated only by C2"); #endif + int off = xmm0_off; + int delta = xmm1_off - off; + if (UseSSE == 1) { assert(additional_frame_bytes == 0, ""); - __ movflt(xmm0,Address(rsp,xmm0_off*wordSize)); - __ movflt(xmm1,Address(rsp,xmm1_off*wordSize)); - __ movflt(xmm2,Address(rsp,xmm2_off*wordSize)); - __ movflt(xmm3,Address(rsp,xmm3_off*wordSize)); - __ movflt(xmm4,Address(rsp,xmm4_off*wordSize)); - __ movflt(xmm5,Address(rsp,xmm5_off*wordSize)); - __ movflt(xmm6,Address(rsp,xmm6_off*wordSize)); - __ movflt(xmm7,Address(rsp,xmm7_off*wordSize)); + for (int n = 0; n < num_xmm_regs; n++) { + __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize)); + off += delta; + } } else if (UseSSE >= 2) { -#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes) - __ movdqu(xmm0,STACK_ADDRESS(xmm0_off)); - __ movdqu(xmm1,STACK_ADDRESS(xmm1_off)); - __ movdqu(xmm2,STACK_ADDRESS(xmm2_off)); - __ movdqu(xmm3,STACK_ADDRESS(xmm3_off)); - __ movdqu(xmm4,STACK_ADDRESS(xmm4_off)); - __ movdqu(xmm5,STACK_ADDRESS(xmm5_off)); - __ movdqu(xmm6,STACK_ADDRESS(xmm6_off)); - __ movdqu(xmm7,STACK_ADDRESS(xmm7_off)); -#undef STACK_ADDRESS + if (VM_Version::supports_avx512novl()) { + for (int n = 0; n < num_xmm_regs; n++) { + __ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0); + off += delta; + } + } else { + for (int n = 0; n < num_xmm_regs; n++) { + __ movdqu(as_XMMRegister(n), 
Address(rsp, off*wordSize+additional_frame_bytes)); + off += delta; + } + } } if (restore_vectors) { + if (UseAVX > 2) { + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32)); + } + __ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registes + } // Restore upper half of YMM registes. assert(additional_frame_bytes == 128, ""); - __ vinsertf128h(xmm0, Address(rsp, 0)); - __ vinsertf128h(xmm1, Address(rsp, 16)); - __ vinsertf128h(xmm2, Address(rsp, 32)); - __ vinsertf128h(xmm3, Address(rsp, 48)); - __ vinsertf128h(xmm4, Address(rsp, 64)); - __ vinsertf128h(xmm5, Address(rsp, 80)); - __ vinsertf128h(xmm6, Address(rsp, 96)); - __ vinsertf128h(xmm7, Address(rsp,112)); - __ addptr(rsp, additional_frame_bytes); - if (UseAVX > 2) { - additional_frame_bytes = 256; - __ vinsertf64x4h(xmm0, Address(rsp, 0)); - __ vinsertf64x4h(xmm1, Address(rsp, 32)); - __ vinsertf64x4h(xmm2, Address(rsp, 64)); - __ vinsertf64x4h(xmm3, Address(rsp, 96)); - __ vinsertf64x4h(xmm4, Address(rsp, 128)); - __ vinsertf64x4h(xmm5, Address(rsp, 160)); - __ vinsertf64x4h(xmm6, Address(rsp, 192)); - __ vinsertf64x4h(xmm7, Address(rsp, 224)); - __ addptr(rsp, additional_frame_bytes); + off = 0; + for (int n = 0; n < num_xmm_regs; n++) { + __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16)); } + __ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registes } __ pop_FPU_state(); __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers diff --git a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp index d63f1469796..46874844473 100644 --- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp @@ -69,7 +69,9 @@ class SimpleRuntimeFrame { class RegisterSaver { // Capture info about frame layout. Layout offsets are in jint // units because compiler frame slots are jints. 
+#define HALF_ZMM_BANK_WORDS 128 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off +#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off enum layout { fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area xmm_off = fpu_state_off + 160/BytesPerInt, // offset in fxsave save area @@ -89,23 +91,24 @@ class RegisterSaver { DEF_XMM_OFFS(13), DEF_XMM_OFFS(14), DEF_XMM_OFFS(15), - DEF_XMM_OFFS(16), - DEF_XMM_OFFS(17), - DEF_XMM_OFFS(18), - DEF_XMM_OFFS(19), - DEF_XMM_OFFS(20), - DEF_XMM_OFFS(21), - DEF_XMM_OFFS(22), - DEF_XMM_OFFS(23), - DEF_XMM_OFFS(24), - DEF_XMM_OFFS(25), - DEF_XMM_OFFS(26), - DEF_XMM_OFFS(27), - DEF_XMM_OFFS(28), - DEF_XMM_OFFS(29), - DEF_XMM_OFFS(30), - DEF_XMM_OFFS(31), - fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt), + zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt), + DEF_ZMM_OFFS(16), + DEF_ZMM_OFFS(17), + DEF_ZMM_OFFS(18), + DEF_ZMM_OFFS(19), + DEF_ZMM_OFFS(20), + DEF_ZMM_OFFS(21), + DEF_ZMM_OFFS(22), + DEF_ZMM_OFFS(23), + DEF_ZMM_OFFS(24), + DEF_ZMM_OFFS(25), + DEF_ZMM_OFFS(26), + DEF_ZMM_OFFS(27), + DEF_ZMM_OFFS(28), + DEF_ZMM_OFFS(29), + DEF_ZMM_OFFS(30), + DEF_ZMM_OFFS(31), + fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt), fpu_stateH_end, r15_off, r15H_off, r14_off, r14H_off, @@ -155,9 +158,10 @@ class RegisterSaver { OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { int vect_words = 0; - int num_xmm_regs = 16; - if (UseAVX > 2) { - num_xmm_regs = 32; + int off = 0; + int num_xmm_regs = XMMRegisterImpl::number_of_registers; + if (UseAVX < 3) { + num_xmm_regs = num_xmm_regs/2; } #ifdef COMPILER2 if (save_vectors) { @@ -165,9 +169,7 @@ OopMap* 
RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); // Save upper half of YMM registers vect_words = 16 * num_xmm_regs / wordSize; - additional_frame_words += vect_words; - if (UseAVX > 2) { - // Save upper half of ZMM registers as well + if (UseAVX < 3) { additional_frame_words += vect_words; } } @@ -195,77 +197,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ __ enter(); // rsp becomes 16-byte aligned here __ push_CPU_state(); // Push a multiple of 16 bytes - if (vect_words > 0) { + // push cpu state handles this on EVEX enabled targets + if ((vect_words > 0) && (UseAVX < 3)) { assert(vect_words*wordSize >= 256, ""); - __ subptr(rsp, 256); // Save upper half of YMM registes(0..15) - __ vextractf128h(Address(rsp, 0), xmm0); - __ vextractf128h(Address(rsp, 16), xmm1); - __ vextractf128h(Address(rsp, 32), xmm2); - __ vextractf128h(Address(rsp, 48), xmm3); - __ vextractf128h(Address(rsp, 64), xmm4); - __ vextractf128h(Address(rsp, 80), xmm5); - __ vextractf128h(Address(rsp, 96), xmm6); - __ vextractf128h(Address(rsp, 112), xmm7); - __ vextractf128h(Address(rsp, 128), xmm8); - __ vextractf128h(Address(rsp, 144), xmm9); - __ vextractf128h(Address(rsp, 160), xmm10); - __ vextractf128h(Address(rsp, 176), xmm11); - __ vextractf128h(Address(rsp, 192), xmm12); - __ vextractf128h(Address(rsp, 208), xmm13); - __ vextractf128h(Address(rsp, 224), xmm14); - __ vextractf128h(Address(rsp, 240), xmm15); - if (UseAVX > 2) { - __ subptr(rsp, 256); // Save upper half of YMM registes(16..31) - __ vextractf128h(Address(rsp, 0), xmm16); - __ vextractf128h(Address(rsp, 16), xmm17); - __ vextractf128h(Address(rsp, 32), xmm18); - __ vextractf128h(Address(rsp, 48), xmm19); - __ vextractf128h(Address(rsp, 64), xmm20); - __ vextractf128h(Address(rsp, 80), xmm21); - __ vextractf128h(Address(rsp, 96), xmm22); - __ vextractf128h(Address(rsp, 112), xmm23); - __ 
vextractf128h(Address(rsp, 128), xmm24); - __ vextractf128h(Address(rsp, 144), xmm25); - __ vextractf128h(Address(rsp, 160), xmm26); - __ vextractf128h(Address(rsp, 176), xmm27); - __ vextractf128h(Address(rsp, 192), xmm28); - __ vextractf128h(Address(rsp, 208), xmm29); - __ vextractf128h(Address(rsp, 224), xmm30); - __ vextractf128h(Address(rsp, 240), xmm31); - // Now handle the ZMM registers (0..31) - __ subptr(rsp, 1024); // Save upper half of ZMM registes - __ vextractf64x4h(Address(rsp, 0), xmm0); - __ vextractf64x4h(Address(rsp, 32), xmm1); - __ vextractf64x4h(Address(rsp, 64), xmm2); - __ vextractf64x4h(Address(rsp, 96), xmm3); - __ vextractf64x4h(Address(rsp, 128), xmm4); - __ vextractf64x4h(Address(rsp, 160), xmm5); - __ vextractf64x4h(Address(rsp, 192), xmm6); - __ vextractf64x4h(Address(rsp, 224), xmm7); - __ vextractf64x4h(Address(rsp, 256), xmm8); - __ vextractf64x4h(Address(rsp, 288), xmm9); - __ vextractf64x4h(Address(rsp, 320), xmm10); - __ vextractf64x4h(Address(rsp, 352), xmm11); - __ vextractf64x4h(Address(rsp, 384), xmm12); - __ vextractf64x4h(Address(rsp, 416), xmm13); - __ vextractf64x4h(Address(rsp, 448), xmm14); - __ vextractf64x4h(Address(rsp, 480), xmm15); - __ vextractf64x4h(Address(rsp, 512), xmm16); - __ vextractf64x4h(Address(rsp, 544), xmm17); - __ vextractf64x4h(Address(rsp, 576), xmm18); - __ vextractf64x4h(Address(rsp, 608), xmm19); - __ vextractf64x4h(Address(rsp, 640), xmm20); - __ vextractf64x4h(Address(rsp, 672), xmm21); - __ vextractf64x4h(Address(rsp, 704), xmm22); - __ vextractf64x4h(Address(rsp, 736), xmm23); - __ vextractf64x4h(Address(rsp, 768), xmm24); - __ vextractf64x4h(Address(rsp, 800), xmm25); - __ vextractf64x4h(Address(rsp, 832), xmm26); - __ vextractf64x4h(Address(rsp, 864), xmm27); - __ vextractf64x4h(Address(rsp, 896), xmm28); - __ vextractf64x4h(Address(rsp, 928), xmm29); - __ vextractf64x4h(Address(rsp, 960), xmm30); - __ vextractf64x4h(Address(rsp, 992), xmm31); + // Save upper half of YMM 
registes(0..num_xmm_regs) + __ subptr(rsp, num_xmm_regs*16); + for (int n = 0; n < num_xmm_regs; n++) { + __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n)); } } if (frame::arg_reg_save_area_bytes != 0) { @@ -299,39 +237,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg()); map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg()); map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg()); - if (UseAVX > 2) { - map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg()); - 
map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg()); - map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg()); + // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, + // on EVEX enabled targets, we get it included in the xsave area + off = xmm0_off; + int delta = xmm1_off - off; + for (int n = 0; n < 16; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); + off += delta; + } + if(UseAVX > 2) { + // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets + off = zmm16_off; + delta = zmm17_off - off; + for (int n = 16; n < num_xmm_regs; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()); + off += delta; + } } // %%% These should all be a waste but we'll keep things as they were for now @@ -351,39 +274,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next()); map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next()); map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next()); - 
map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next()); + // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15, + // on EVEX enabled targets, we get it included in the xsave area + off = xmm0H_off; + delta = xmm1H_off - off; + for (int n = 0; n < 16; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); + off += delta; + } if (UseAVX > 2) { - map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm22H_off), 
xmm22->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next()); - map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next()); + // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets + off = zmm16H_off; + delta = zmm17H_off - off; + for (int n = 16; n < num_xmm_regs; n++) { + XMMRegister xmm_name = as_XMMRegister(n); + map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next()); + off += delta; + } } } @@ -391,86 +299,25 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { + int num_xmm_regs = XMMRegisterImpl::number_of_registers; + if (UseAVX < 3) { + num_xmm_regs = num_xmm_regs/2; + } if (frame::arg_reg_save_area_bytes != 0) { // Pop arg register save area __ addptr(rsp, frame::arg_reg_save_area_bytes); } #ifdef COMPILER2 - if (restore_vectors) { - // Restore upper half of YMM registes (0..15) - assert(UseAVX > 0, "512bit vectors are supported only with AVX"); - assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); - __ vinsertf128h(xmm0, Address(rsp, 0)); - __ vinsertf128h(xmm1, Address(rsp, 16)); - __ vinsertf128h(xmm2, Address(rsp, 32)); - __ vinsertf128h(xmm3, Address(rsp, 48)); - __ vinsertf128h(xmm4, Address(rsp, 64)); - __ vinsertf128h(xmm5, Address(rsp, 80)); - __ vinsertf128h(xmm6, Address(rsp, 96)); - 
__ vinsertf128h(xmm7, Address(rsp,112)); - __ vinsertf128h(xmm8, Address(rsp,128)); - __ vinsertf128h(xmm9, Address(rsp,144)); - __ vinsertf128h(xmm10, Address(rsp,160)); - __ vinsertf128h(xmm11, Address(rsp,176)); - __ vinsertf128h(xmm12, Address(rsp,192)); - __ vinsertf128h(xmm13, Address(rsp,208)); - __ vinsertf128h(xmm14, Address(rsp,224)); - __ vinsertf128h(xmm15, Address(rsp,240)); - __ addptr(rsp, 256); - if (UseAVX > 2) { - // Restore upper half of YMM registes (16..31) - __ vinsertf128h(xmm16, Address(rsp, 0)); - __ vinsertf128h(xmm17, Address(rsp, 16)); - __ vinsertf128h(xmm18, Address(rsp, 32)); - __ vinsertf128h(xmm19, Address(rsp, 48)); - __ vinsertf128h(xmm20, Address(rsp, 64)); - __ vinsertf128h(xmm21, Address(rsp, 80)); - __ vinsertf128h(xmm22, Address(rsp, 96)); - __ vinsertf128h(xmm23, Address(rsp,112)); - __ vinsertf128h(xmm24, Address(rsp,128)); - __ vinsertf128h(xmm25, Address(rsp,144)); - __ vinsertf128h(xmm26, Address(rsp,160)); - __ vinsertf128h(xmm27, Address(rsp,176)); - __ vinsertf128h(xmm28, Address(rsp,192)); - __ vinsertf128h(xmm29, Address(rsp,208)); - __ vinsertf128h(xmm30, Address(rsp,224)); - __ vinsertf128h(xmm31, Address(rsp,240)); - __ addptr(rsp, 256); - // Restore upper half of ZMM registes. 
- __ vinsertf64x4h(xmm0, Address(rsp, 0)); - __ vinsertf64x4h(xmm1, Address(rsp, 32)); - __ vinsertf64x4h(xmm2, Address(rsp, 64)); - __ vinsertf64x4h(xmm3, Address(rsp, 96)); - __ vinsertf64x4h(xmm4, Address(rsp, 128)); - __ vinsertf64x4h(xmm5, Address(rsp, 160)); - __ vinsertf64x4h(xmm6, Address(rsp, 192)); - __ vinsertf64x4h(xmm7, Address(rsp, 224)); - __ vinsertf64x4h(xmm8, Address(rsp, 256)); - __ vinsertf64x4h(xmm9, Address(rsp, 288)); - __ vinsertf64x4h(xmm10, Address(rsp, 320)); - __ vinsertf64x4h(xmm11, Address(rsp, 352)); - __ vinsertf64x4h(xmm12, Address(rsp, 384)); - __ vinsertf64x4h(xmm13, Address(rsp, 416)); - __ vinsertf64x4h(xmm14, Address(rsp, 448)); - __ vinsertf64x4h(xmm15, Address(rsp, 480)); - __ vinsertf64x4h(xmm16, Address(rsp, 512)); - __ vinsertf64x4h(xmm17, Address(rsp, 544)); - __ vinsertf64x4h(xmm18, Address(rsp, 576)); - __ vinsertf64x4h(xmm19, Address(rsp, 608)); - __ vinsertf64x4h(xmm20, Address(rsp, 640)); - __ vinsertf64x4h(xmm21, Address(rsp, 672)); - __ vinsertf64x4h(xmm22, Address(rsp, 704)); - __ vinsertf64x4h(xmm23, Address(rsp, 736)); - __ vinsertf64x4h(xmm24, Address(rsp, 768)); - __ vinsertf64x4h(xmm25, Address(rsp, 800)); - __ vinsertf64x4h(xmm26, Address(rsp, 832)); - __ vinsertf64x4h(xmm27, Address(rsp, 864)); - __ vinsertf64x4h(xmm28, Address(rsp, 896)); - __ vinsertf64x4h(xmm29, Address(rsp, 928)); - __ vinsertf64x4h(xmm30, Address(rsp, 960)); - __ vinsertf64x4h(xmm31, Address(rsp, 992)); - __ addptr(rsp, 1024); + // On EVEX enabled targets everything is handled in pop fpu state + if ((restore_vectors) && (UseAVX < 3)) { + assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX"); + assert(MaxVectorSize == 64, "up to 512bit vectors are supported now"); + int off = 0; + // Restore upper half of YMM registes (0..num_xmm_regs) + for (int n = 0; n < num_xmm_regs; n++) { + __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16)); } + __ addptr(rsp, num_xmm_regs*16); } #else assert(!restore_vectors, "vectors are 
generated only by C2"); diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp index 8b785b33011..35c8d0940b6 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -795,6 +795,12 @@ class StubGenerator: public StubCodeGenerator { void xmm_copy_forward(Register from, Register to_from, Register qword_count) { assert( UseSSE >= 2, "supported cpu only" ); Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit; + if (UseAVX > 2) { + __ push(rbx); + __ movl(rbx, 0xffff); + __ kmovdl(k1, rbx); + __ pop(rbx); + } // Copy 64-byte chunks __ jmpb(L_copy_64_bytes); __ align(OptoLoopAlignment); @@ -802,8 +808,8 @@ class StubGenerator: public StubCodeGenerator { if (UseUnalignedLoadStores) { if (UseAVX > 2) { - __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit); - __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit); + __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit); } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(from, 0)); __ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0); @@ -2217,6 +2223,15 @@ class StubGenerator: public StubCodeGenerator { const XMMRegister xmm_temp4 = xmm5; __ enter(); // required for proper stackwalking of RuntimeStub frame + + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. 
+ if (VM_Version::supports_avx512vlbw()) { + __ movl(rdx, 0xffff); + __ kmovdl(k1, rdx); + } + __ movptr(from, from_param); __ movptr(key, key_param); @@ -2315,6 +2330,15 @@ class StubGenerator: public StubCodeGenerator { const XMMRegister xmm_temp4 = xmm5; __ enter(); // required for proper stackwalking of RuntimeStub frame + + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rdx, 0xffff); + __ kmovdl(k1, rdx); + } + __ movptr(from, from_param); __ movptr(key, key_param); @@ -2441,6 +2465,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub frame handleSOERegisters(true /*saving*/); + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rdx, 0xffff); + __ kmovdl(k1, rdx); + } + // load registers from incoming parameters const Address from_param(rbp, 8+0); const Address to_param (rbp, 8+4); @@ -2602,6 +2634,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub frame handleSOERegisters(true /*saving*/); + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. 
+ if (VM_Version::supports_avx512vlbw()) { + __ movl(rdx, 0xffff); + __ kmovdl(k1, rdx); + } + // load registers from incoming parameters const Address from_param(rbp, 8+0); const Address to_param (rbp, 8+4); @@ -2782,6 +2822,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); handleSOERegisters(true); // Save registers + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rdx, 0xffff); + __ kmovdl(k1, rdx); + } + __ movptr(state, state_param); __ movptr(subkeyH, subkeyH_param); __ movptr(data, data_param); diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp index 6e9b685f801..36049bae44b 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -269,12 +269,16 @@ class StubGenerator: public StubCodeGenerator { __ kmovql(k1, rbx); } #ifdef _WIN64 + int last_reg = 15; if (UseAVX > 2) { - for (int i = 6; i <= 31; i++) { - __ movdqu(xmm_save(i), as_XMMRegister(i)); + last_reg = 31; + } + if (VM_Version::supports_avx512novl()) { + for (int i = xmm_save_first; i <= last_reg; i++) { + __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0); } } else { - for (int i = 6; i <= 15; i++) { + for (int i = xmm_save_first; i <= last_reg; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); } } @@ -386,13 +390,15 @@ class StubGenerator: public StubCodeGenerator { // restore regs belonging to calling function #ifdef _WIN64 - int xmm_ub = 15; - if (UseAVX > 2) { - xmm_ub = 31; - } // emit the restores for xmm regs - for (int i = 6; i <= xmm_ub; i++) { - __ movdqu(as_XMMRegister(i), xmm_save(i)); + if (VM_Version::supports_avx512novl()) { + for (int i = xmm_save_first; i <= last_reg; i++) { + __ 
vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0); + } + } else { + for (int i = xmm_save_first; i <= last_reg; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } } #endif __ movptr(r15, r15_save); @@ -1342,11 +1348,15 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; + if (UseAVX > 2) { + __ movl(to, 0xffff); + __ kmovql(k1, to); + } // Copy 64-bytes per iteration __ BIND(L_loop); if (UseAVX > 2) { - __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); - __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); + __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); @@ -1422,11 +1432,15 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; + if (UseAVX > 2) { + __ movl(to, 0xffff); + __ kmovql(k1, to); + } // Copy 64-bytes per iteration __ BIND(L_loop); if (UseAVX > 2) { - __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit); - __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit); + __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit); } else if (UseAVX == 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); @@ -3106,6 +3120,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub 
frame + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rax, 0xffff); + __ kmovql(k1, rax); + } + // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); @@ -3200,6 +3222,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub frame + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rax, 0xffff); + __ kmovql(k1, rax); + } + // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); @@ -3312,6 +3342,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub frame + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. 
+ if (VM_Version::supports_avx512vlbw()) { + __ movl(rax, 0xffff); + __ kmovql(k1, rax); + } + #ifdef _WIN64 // on win64, fill len_reg from stack position __ movl(len_reg, len_mem); @@ -3508,6 +3546,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); // required for proper stackwalking of RuntimeStub frame + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. + if (VM_Version::supports_avx512vlbw()) { + __ movl(rax, 0xffff); + __ kmovql(k1, rax); + } + #ifdef _WIN64 // on win64, fill len_reg from stack position __ movl(len_reg, len_mem); @@ -3746,6 +3792,14 @@ class StubGenerator: public StubCodeGenerator { __ enter(); + // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge + // context for the registers used, where all instructions below are using 128-bit mode + // On EVEX without VL and BW, these instructions will all be AVX. 
+ if (VM_Version::supports_avx512vlbw()) { + __ movl(rax, 0xffff); + __ kmovql(k1, rax); + } + #ifdef _WIN64 // save the xmm registers which must be preserved 6-10 __ subptr(rsp, -rsp_after_call_off * wordSize); diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp index bca5d493ce4..1599939ebcf 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp @@ -31,7 +31,7 @@ enum platform_dependent_constants { code_size1 = 9000, // simply increase if too small (assembler will crash if too small) - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) + code_size2 = 30000 // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp index b048fd74e0a..d8c50c82b23 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ enum platform_dependent_constants { code_size1 = 19000, // simply increase if too small (assembler will crash if too small) - code_size2 = 24000 // simply increase if too small (assembler will crash if too small) + code_size2 = 32000 // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp index 270ee46a228..f29b51469ff 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp @@ -367,16 +367,12 @@ class VM_Version_StubGenerator: public StubCodeGenerator { __ movl(rcx, VM_Version::ymm_test_value()); __ movdl(xmm0, rcx); __ movl(rcx, 0xffff); -#ifdef _LP64 - __ kmovql(k1, rcx); -#else - __ kmovdl(k1, rcx); -#endif + __ kmovwl(k1, rcx); __ evpbroadcastd(xmm0, xmm0, 
Assembler::AVX_512bit); - __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit); #ifdef _LP64 - __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit); - __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit); + __ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit); #endif VM_Version::clean_cpuFeatures(); __ jmp(save_restore_except); @@ -427,11 +423,11 @@ class VM_Version_StubGenerator: public StubCodeGenerator { UseAVX = 3; UseSSE = 2; __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset()))); - __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit); - __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit); + __ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit); + __ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit); #ifdef _LP64 - __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit); - __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit); + __ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit); + __ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit); #endif VM_Version::clean_cpuFeatures(); UseAVX = saved_useavx; diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp index c7e54a6bb7f..45abb210ffe 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp @@ -227,14 +227,15 @@ public: union XemXcr0Eax { uint32_t value; struct { - uint32_t x87 : 1, - sse : 1, - ymm : 1, - : 2, - opmask : 1, - zmm512 : 1, - zmm32 : 1, - : 24; + uint32_t x87 : 1, + sse : 1, + ymm : 1, + bndregs : 1, + bndcsr : 1, + opmask : 1, + zmm512 : 1, + zmm32 : 1, + : 24; } bits; }; @@ -703,6 +704,7 @@ public: static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; } static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; } static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); } + static bool 
supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); } // Intel features static bool is_intel_family_core() { return is_intel() && extended_cpu_family() == CPU_FAMILY_INTEL_CORE; } @@ -817,6 +819,12 @@ public: intx count = PrefetchFieldsAhead; return count >= 0 ? count : 1; } + static uint32_t get_xsave_header_lower_segment() { + return _cpuid_info.xem_xcr0_eax.value; + } + static uint32_t get_xsave_header_upper_segment() { + return _cpuid_info.xem_xcr0_edx; + } }; #endif // CPU_X86_VM_VM_VERSION_X86_HPP diff --git a/hotspot/src/cpu/x86/vm/x86.ad b/hotspot/src/cpu/x86/vm/x86.ad index 39ee7c03d63..120be4f0bd1 100644 --- a/hotspot/src/cpu/x86/vm/x86.ad +++ b/hotspot/src/cpu/x86/vm/x86.ad @@ -1661,50 +1661,55 @@ const bool Matcher::match_rule_supported(int opcode) { if (!has_match_rule(opcode)) return false; + bool ret_value = true; switch (opcode) { case Op_PopCountI: case Op_PopCountL: if (!UsePopCountInstruction) - return false; - break; + ret_value = false; + break; case Op_MulVI: if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX - return false; - break; + ret_value = false; + break; case Op_MulVL: case Op_MulReductionVL: if (VM_Version::supports_avx512dq() == false) - return false; + ret_value = false; + break; case Op_AddReductionVL: if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here - return false; + ret_value = false; + break; case Op_AddReductionVI: if (UseSSE < 3) // requires at least SSE3 - return false; + ret_value = false; + break; case Op_MulReductionVI: if (UseSSE < 4) // requires at least SSE4 - return false; + ret_value = false; + break; case Op_AddReductionVF: case Op_AddReductionVD: case Op_MulReductionVF: case Op_MulReductionVD: if (UseSSE < 1) // requires at least SSE - return false; - break; + ret_value = false; + break; case Op_SqrtVD: if (UseAVX < 1) // enabled for AVX only - return false; - break; + ret_value = false; + break; case Op_CompareAndSwapL: #ifdef _LP64 case Op_CompareAndSwapP: 
#endif if (!VM_Version::supports_cx8()) - return false; - break; + ret_value = false; + break; } - return true; // Per default match rules are supported. + return ret_value; // Per default match rules are supported. } // Max vector size in bytes. 0 if not supported. @@ -1725,14 +1730,24 @@ const int Matcher::vector_width_in_bytes(BasicType bt) { case T_DOUBLE: case T_LONG: if (size < 16) return 0; + break; case T_FLOAT: case T_INT: if (size < 8) return 0; + break; case T_BOOLEAN: - case T_BYTE: + if (size < 4) return 0; + break; case T_CHAR: + if (size < 4) return 0; + break; + case T_BYTE: + if (size < 4) return 0; + if ((size > 32) && !VM_Version::supports_avx512bw()) return 0; + break; case T_SHORT: if (size < 4) return 0; + if ((size > 16) && !VM_Version::supports_avx512bw()) return 0; break; default: ShouldNotReachHere(); @@ -1804,7 +1819,7 @@ static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo])); break; case Op_VecZ: - __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2); + __ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2); break; default: ShouldNotReachHere(); @@ -1859,7 +1874,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load, __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset)); break; case Op_VecZ: - __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2); + __ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2); break; default: ShouldNotReachHere(); @@ -1879,7 +1894,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load, __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg])); break; case Op_VecZ: - __ evmovdqu(Address(rsp, stack_offset), 
as_XMMRegister(Matcher::_regEncode[reg]), 2); + __ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2); break; default: ShouldNotReachHere(); @@ -1933,9 +1948,40 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load, } #endif } - int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4); + bool is_single_byte = false; + int vec_len = 0; + if ((UseAVX > 2) && (stack_offset != 0)) { + switch (ireg) { + case Op_VecS: + case Op_VecD: + case Op_VecX: + break; + case Op_VecY: + vec_len = 1; + break; + case Op_VecZ: + vec_len = 2; + break; + } + is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0); + } + int offset_size = 0; + int size = 5; + if (UseAVX > 2 ) { + if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) { + offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4); + size += 2; // Need an additional two bytes for EVEX encoding + } else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) { + offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4); + } else { + offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4); + size += 2; // Need an additional two bytes for EVEX encodding + } + } else { + offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4); + } // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix. 
- return 5+offset_size; + return size+offset_size; } static inline jfloat replicate4_imm(int con, int width) { @@ -2679,11 +2725,10 @@ instruct negF_reg_reg(regF dst, regF src) %{ predicate(UseAVX > 0); match(Set dst (NegF src)); ins_cost(150); - format %{ "vxorps $dst, $src, [0x80000000]\t# neg float by sign flipping" %} + format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %} ins_encode %{ - int vector_len = 0; - __ vxorps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signflip()), vector_len); + __ vnegatess($dst$$XMMRegister, $src$$XMMRegister, + ExternalAddress(float_signflip())); %} ins_pipe(pipe_slow); %} @@ -2704,12 +2749,11 @@ instruct negD_reg_reg(regD dst, regD src) %{ predicate(UseAVX > 0); match(Set dst (NegD src)); ins_cost(150); - format %{ "vxorpd $dst, $src, [0x8000000000000000]\t" + format %{ "vnegatess $dst, $src, [0x8000000000000000]\t" "# neg double by sign flipping" %} ins_encode %{ - int vector_len = 0; - __ vxorpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signflip()), vector_len); + __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister, + ExternalAddress(double_signflip())); %} ins_pipe(pipe_slow); %} @@ -2842,7 +2886,7 @@ instruct loadV64(vecZ dst, memory mem) %{ format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %} ins_encode %{ int vector_len = 2; - __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len); + __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -2899,7 +2943,7 @@ instruct storeV64(memory mem, vecZ src) %{ format %{ "vmovdqu $mem k0,$src\t! 
store vector (64 bytes)" %} ins_encode %{ int vector_len = 2; - __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len); + __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -3319,6 +3363,37 @@ instruct Repl8F_mem(vecY dst, memory mem) %{ ins_pipe( pipe_slow ); %} +instruct Repl2F_zero(vecD dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 2 && UseAVX < 3); + match(Set dst (ReplicateF zero)); + format %{ "xorps $dst,$dst\t! replicate2F zero" %} + ins_encode %{ + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4F_zero(vecX dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX < 3); + match(Set dst (ReplicateF zero)); + format %{ "xorps $dst,$dst\t! replicate4F zero" %} + ins_encode %{ + __ xorps($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl8F_zero(vecY dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX < 3); + match(Set dst (ReplicateF zero)); + format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %} + ins_encode %{ + int vector_len = 1; + __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + instruct Repl2D_mem(vecX dst, memory mem) %{ predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl()); match(Set dst (ReplicateD (LoadD mem))); @@ -3353,6 +3428,28 @@ instruct Repl4D_mem(vecY dst, memory mem) %{ ins_pipe( pipe_slow ); %} +// Replicate double (8 byte) scalar zero to be vector +instruct Repl2D_zero(vecX dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 2 && UseAVX < 3); + match(Set dst (ReplicateD zero)); + format %{ "xorpd $dst,$dst\t! 
replicate2D zero" %} + ins_encode %{ + __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4D_zero(vecY dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX < 3); + match(Set dst (ReplicateD zero)); + format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} + ins_encode %{ + int vector_len = 1; + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + // ====================GENERIC REPLICATE========================================== // Replicate byte scalar to be vector @@ -3684,38 +3781,6 @@ instruct Repl4F(vecX dst, regF src) %{ ins_pipe( pipe_slow ); %} -// Replicate float (4 byte) scalar zero to be vector -instruct Repl2F_zero(vecD dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateF zero)); - format %{ "xorps $dst,$dst\t! replicate2F zero" %} - ins_encode %{ - __ xorps($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4F_zero(vecX dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateF zero)); - format %{ "xorps $dst,$dst\t! replicate4F zero" %} - ins_encode %{ - __ xorps($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl8F_zero(vecY dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 8); - match(Set dst (ReplicateF zero)); - format %{ "vxorps $dst,$dst,$dst\t! 
replicate8F zero" %} - ins_encode %{ - int vector_len = 1; - __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); - %} - ins_pipe( fpu_reg_reg ); -%} - // Replicate double (8 bytes) scalar to be vector instruct Repl2D(vecX dst, regD src) %{ predicate(n->as_Vector()->length() == 2); @@ -3727,28 +3792,6 @@ instruct Repl2D(vecX dst, regD src) %{ ins_pipe( pipe_slow ); %} -// Replicate double (8 byte) scalar zero to be vector -instruct Repl2D_zero(vecX dst, immD0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateD zero)); - format %{ "xorpd $dst,$dst\t! replicate2D zero" %} - ins_encode %{ - __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4D_zero(vecY dst, immD0 zero) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateD zero)); - format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} - ins_encode %{ - int vector_len = 1; - __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); - %} - ins_pipe( fpu_reg_reg ); -%} - // ====================EVEX REPLICATE============================================= instruct Repl4B_mem_evex(vecS dst, memory mem) %{ @@ -3818,7 +3861,7 @@ instruct Repl32B_mem_evex(vecY dst, memory mem) %{ %} instruct Repl64B_evex(vecZ dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 64 && UseAVX > 2); + predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateB src)); format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %} ins_encode %{ @@ -3829,7 +3872,7 @@ instruct Repl64B_evex(vecZ dst, rRegI src) %{ %} instruct Repl64B_mem_evex(vecZ dst, memory mem) %{ - predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw()); + predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateB (LoadB mem))); format %{ "vpbroadcastb $dst,$mem\t! 
replicate64B" %} ins_encode %{ @@ -3866,7 +3909,7 @@ instruct Repl32B_imm_evex(vecY dst, immI con) %{ %} instruct Repl64B_imm_evex(vecZ dst, immI con) %{ - predicate(n->as_Vector()->length() == 64 && UseAVX > 2); + predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateB con)); format %{ "movq $dst,[$constantaddress]\n\t" "vpbroadcastb $dst,$dst\t! upper replicate64B" %} @@ -3957,7 +4000,7 @@ instruct Repl16S_mem_evex(vecY dst, memory mem) %{ %} instruct Repl32S_evex(vecZ dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateS src)); format %{ "vpbroadcastw $dst,$src\t! replicate32S" %} ins_encode %{ @@ -3968,7 +4011,7 @@ instruct Repl32S_evex(vecZ dst, rRegI src) %{ %} instruct Repl32S_mem_evex(vecZ dst, memory mem) %{ - predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateS (LoadS mem))); format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %} ins_encode %{ @@ -4005,7 +4048,7 @@ instruct Repl16S_imm_evex(vecY dst, immI con) %{ %} instruct Repl32S_imm_evex(vecZ dst, immI con) %{ - predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw()); match(Set dst (ReplicateS con)); format %{ "movq $dst,[$constantaddress]\n\t" "vpbroadcastw $dst,$dst\t! replicate32S" %} @@ -4322,13 +4365,50 @@ instruct Repl16F_mem_evex(vecZ dst, memory mem) %{ ins_pipe( pipe_slow ); %} +instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 2 && UseAVX > 2); + match(Set dst (ReplicateF zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! 
replicate2F zero" %} + ins_encode %{ + // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX > 2); + match(Set dst (ReplicateF zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate4F zero" %} + ins_encode %{ + // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); + match(Set dst (ReplicateF zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate8F zero" %} + ins_encode %{ + // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{ predicate(n->as_Vector()->length() == 16 && UseAVX > 2); match(Set dst (ReplicateF zero)); - format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %} + format %{ "vpxor $dst k0,$dst,$dst\t! 
replicate16F zero" %} ins_encode %{ + // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation int vector_len = 2; - __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} @@ -4377,13 +4457,38 @@ instruct Repl8D_mem_evex(vecZ dst, memory mem) %{ ins_pipe( pipe_slow ); %} +instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 2 && UseAVX > 2); + match(Set dst (ReplicateD zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate2D zero" %} + ins_encode %{ + // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX > 2); + match(Set dst (ReplicateD zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate4D zero" %} + ins_encode %{ + // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{ predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateD zero)); - format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %} + format %{ "vpxor $dst k0,$dst,$dst,vect512\t! 
replicate8D zero" %} ins_encode %{ + // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation int vector_len = 2; - __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( fpu_reg_reg ); %} diff --git a/hotspot/src/cpu/x86/vm/x86_32.ad b/hotspot/src/cpu/x86/vm/x86_32.ad index dfe10d5b02a..79b55056cba 100644 --- a/hotspot/src/cpu/x86/vm/x86_32.ad +++ b/hotspot/src/cpu/x86/vm/x86_32.ad @@ -1004,10 +1004,10 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off __ vmovdqu(Address(rsp, dst_offset), xmm0); __ vmovdqu(xmm0, Address(rsp, -32)); case Op_VecZ: - __ evmovdqu(Address(rsp, -64), xmm0, 2); - __ evmovdqu(xmm0, Address(rsp, src_offset), 2); - __ evmovdqu(Address(rsp, dst_offset), xmm0, 2); - __ evmovdqu(xmm0, Address(rsp, -64), 2); + __ evmovdqul(Address(rsp, -64), xmm0, 2); + __ evmovdqul(xmm0, Address(rsp, src_offset), 2); + __ evmovdqul(Address(rsp, dst_offset), xmm0, 2); + __ evmovdqul(xmm0, Address(rsp, -64), 2); break; default: ShouldNotReachHere(); diff --git a/hotspot/src/cpu/x86/vm/x86_64.ad b/hotspot/src/cpu/x86/vm/x86_64.ad index c04661cda7d..9fcc7033c8a 100644 --- a/hotspot/src/cpu/x86/vm/x86_64.ad +++ b/hotspot/src/cpu/x86/vm/x86_64.ad @@ -1075,10 +1075,10 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset, __ vmovdqu(Address(rsp, dst_offset), xmm0); __ vmovdqu(xmm0, Address(rsp, -32)); case Op_VecZ: - __ evmovdqu(Address(rsp, -64), xmm0, 2); - __ evmovdqu(xmm0, Address(rsp, src_offset), 2); - __ evmovdqu(Address(rsp, dst_offset), xmm0, 2); - __ evmovdqu(xmm0, Address(rsp, -64), 2); + __ evmovdqul(Address(rsp, -64), xmm0, 2); + __ evmovdqul(xmm0, Address(rsp, src_offset), 2); + __ evmovdqul(Address(rsp, dst_offset), xmm0, 2); + __ evmovdqul(xmm0, Address(rsp, -64), 2); break; default: ShouldNotReachHere(); From 
3780022a87628d35fcad00293d9a62e7ce1b9895 Mon Sep 17 00:00:00 2001 From: Tobias Hartmann Date: Mon, 14 Sep 2015 07:02:50 +0200 Subject: [PATCH 15/20] 8080999: MemoryPoolMXBean.getUsageThresholdCount() returns incorrect value Fixed race condition in the JMX code for the LowMemoryDetector. Reviewed-by: kvn --- .../share/vm/services/lowMemoryDetector.cpp | 34 +++++++++++-------- .../share/vm/services/lowMemoryDetector.hpp | 4 +-- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/hotspot/src/share/vm/services/lowMemoryDetector.cpp b/hotspot/src/share/vm/services/lowMemoryDetector.cpp index b68be23dfa4..42c500ad68e 100644 --- a/hotspot/src/share/vm/services/lowMemoryDetector.cpp +++ b/hotspot/src/share/vm/services/lowMemoryDetector.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -200,9 +200,10 @@ SensorInfo::SensorInfo() { // any clears unless the usage becomes greater than or equal // to the high threshold. // -// If the current level is between high and low threhsold, no change. +// If the current level is between high and low threshold, no change. // void SensorInfo::set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold) { + assert(Service_lock->owned_by_self(), "Must own Service_lock"); assert(high_low_threshold->is_high_threshold_supported(), "just checking"); bool is_over_high = high_low_threshold->is_high_threshold_crossed(usage); @@ -257,6 +258,7 @@ void SensorInfo::set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* hig // the sensor will be on (i.e. sensor is currently off // and has pending trigger requests). 
void SensorInfo::set_counter_sensor_level(MemoryUsage usage, ThresholdSupport* counter_threshold) { + assert(Service_lock->owned_by_self(), "Must own Service_lock"); assert(counter_threshold->is_high_threshold_supported(), "just checking"); bool is_over_high = counter_threshold->is_high_threshold_crossed(usage); @@ -278,9 +280,7 @@ void SensorInfo::oops_do(OopClosure* f) { } void SensorInfo::process_pending_requests(TRAPS) { - if (!has_pending_requests()) { - return; - } + assert(has_pending_requests(), "Must have pending request"); int pending_count = pending_trigger_count(); if (pending_clear_count() > 0) { @@ -293,7 +293,6 @@ void SensorInfo::process_pending_requests(TRAPS) { void SensorInfo::trigger(int count, TRAPS) { assert(count <= _pending_trigger_count, "just checking"); - if (_sensor_obj != NULL) { Klass* k = Management::sun_management_Sensor_klass(CHECK); instanceKlassHandle sensorKlass (THREAD, k); @@ -316,6 +315,7 @@ void SensorInfo::trigger(int count, TRAPS) { { // Holds Service_lock and update the sensor state MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag); + assert(_pending_trigger_count > 0, "Must have pending trigger"); _sensor_on = true; _sensor_count += count; _pending_trigger_count = _pending_trigger_count - count; @@ -323,6 +323,20 @@ void SensorInfo::trigger(int count, TRAPS) { } void SensorInfo::clear(int count, TRAPS) { + { + // Holds Service_lock and update the sensor state + MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag); + if (_pending_clear_count == 0) { + // Bail out if we lost a race to set_*_sensor_level() which may have + // reactivated the sensor in the meantime because it was triggered again. 
+ return; + } + _sensor_on = false; + _sensor_count += count; + _pending_clear_count = 0; + _pending_trigger_count = _pending_trigger_count - count; + } + if (_sensor_obj != NULL) { Klass* k = Management::sun_management_Sensor_klass(CHECK); instanceKlassHandle sensorKlass (THREAD, k); @@ -338,14 +352,6 @@ void SensorInfo::clear(int count, TRAPS) { &args, CHECK); } - - { - // Holds Service_lock and update the sensor state - MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag); - _sensor_on = false; - _pending_clear_count = 0; - _pending_trigger_count = _pending_trigger_count - count; - } } //-------------------------------------------------------------- diff --git a/hotspot/src/share/vm/services/lowMemoryDetector.hpp b/hotspot/src/share/vm/services/lowMemoryDetector.hpp index 16c306f5301..1e82003b432 100644 --- a/hotspot/src/share/vm/services/lowMemoryDetector.hpp +++ b/hotspot/src/share/vm/services/lowMemoryDetector.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -180,7 +180,7 @@ public: // any clears unless the usage becomes greater than or equal // to the high threshold. // - // If the current level is between high and low threhsold, no change. + // If the current level is between high and low threshold, no change. // void set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold); From 6479241f4af9c46f898048dc3d07dc938ab32dbf Mon Sep 17 00:00:00 2001 From: Tobias Hartmann Date: Mon, 14 Sep 2015 10:00:26 +0200 Subject: [PATCH 16/20] 8135252: IdealLoopTree::dump_head() prints negative trip count IdealLoopTree::dump_head() should not cast float to int. 
Reviewed-by: kvn --- hotspot/src/share/vm/opto/loopnode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hotspot/src/share/vm/opto/loopnode.cpp b/hotspot/src/share/vm/opto/loopnode.cpp index 9d099940a07..9c09c888f2c 100644 --- a/hotspot/src/share/vm/opto/loopnode.cpp +++ b/hotspot/src/share/vm/opto/loopnode.cpp @@ -1901,7 +1901,7 @@ void IdealLoopTree::dump_head( ) const { if (stride_con > 0) tty->print("+"); tty->print("%d", stride_con); - tty->print(" (%d iters) ", (int)cl->profile_trip_cnt()); + tty->print(" (%0.f iters) ", cl->profile_trip_cnt()); if (cl->is_pre_loop ()) tty->print(" pre" ); if (cl->is_main_loop()) tty->print(" main"); From 8dd169a71f921b3ff94da6da570f413d7cdd4653 Mon Sep 17 00:00:00 2001 From: Roland Westrelin Date: Fri, 11 Sep 2015 16:56:56 +0200 Subject: [PATCH 17/20] 8135069: C2 replaces range checks by unsigned comparison with -1 I < 0 || i > -1 wrongly folded as i >u -1 Reviewed-by: kvn --- hotspot/src/share/vm/opto/ifnode.cpp | 46 ++++++++++---- .../rangechecks/TestBadFoldCompare.java | 60 ++++++++++++++++++- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/hotspot/src/share/vm/opto/ifnode.cpp b/hotspot/src/share/vm/opto/ifnode.cpp index cd60f19cd67..5d22abd6729 100644 --- a/hotspot/src/share/vm/opto/ifnode.cpp +++ b/hotspot/src/share/vm/opto/ifnode.cpp @@ -858,18 +858,29 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f // this_bool = <= // dom_bool = >= (proj = True) or dom_bool = < (proj = False) // x in [a, b] on the fail (= True) projection, b+1 > a-1: - // lo = a, hi = b, adjusted_lim = b-a, cond = <=u + // lo = a, hi = b, adjusted_lim = b-a+1, cond = (proj = True) or dom_bool = <= (proj = False) // x in ]a, b] on the fail (= True) projection b+1 > a: // lo = a+1, hi = b, adjusted_lim = b-a, cond = transform(new AddINode(lo, igvn->intcon(1))); + } + } else { + assert(hi_test == BoolTest::le, "bad test"); + if (lo_test == BoolTest::ge || lo_test == BoolTest::lt) { 
adjusted_lim = igvn->transform(new SubINode(hi, lo)); + adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1))); + cond = BoolTest::lt; + } else { + assert(lo_test == BoolTest::gt || lo_test == BoolTest::le, "bad test"); + adjusted_lim = igvn->transform(new SubINode(hi, lo)); + lo = igvn->transform(new AddINode(lo, igvn->intcon(1))); cond = BoolTest::lt; } - lo = igvn->transform(new AddINode(lo, igvn->intcon(1))); } } else if (lo_type->_lo > hi_type->_hi && lo_type->_hi == max_jint && hi_type->_lo == min_jint) { @@ -879,7 +890,8 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f // lo = b, hi = a, adjusted_lim = a-b, cond = >=u // dom_bool = <= (proj = True) or dom_bool = > (proj = False) // x in [b, a] on the fail (= False) projection, a+1 > b-1: - // lo = b, hi = a, adjusted_lim = a-b, cond = >u + // lo = b, hi = a, adjusted_lim = a-b+1, cond = >=u + // lo = b, hi = a, adjusted_lim = a-b, cond = >u doesn't work because a = b - 1 is possible, then b-a = -1 // this_bool = <= // dom_bool = < (proj = True) or dom_bool = >= (proj = False) // x in ]b, a[ on the fail (= False) projection, a > b: @@ -887,7 +899,7 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f // dom_bool = <= (proj = True) or dom_bool = > (proj = False) // x in ]b, a] on the fail (= False) projection, a+1 > b: // lo = b+1, hi = a, adjusted_lim = a-b, cond = >=u - // lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then hi-lo = -1 + // lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then b-a-1 = -1 swap(lo, hi); swap(lo_type, hi_type); @@ -900,14 +912,26 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f cond = (hi_test == BoolTest::le || hi_test == BoolTest::gt) ? 
BoolTest::gt : BoolTest::ge; - if (lo_test == BoolTest::le) { - if (cond == BoolTest::gt) { + if (lo_test == BoolTest::lt) { + if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) { + cond = BoolTest::ge; + } else { + assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test"); adjusted_lim = igvn->transform(new SubINode(hi, lo)); + adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1))); + cond = BoolTest::ge; + } + } else if (lo_test == BoolTest::le) { + if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) { + lo = igvn->transform(new AddINode(lo, igvn->intcon(1))); + cond = BoolTest::ge; + } else { + assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test"); + adjusted_lim = igvn->transform(new SubINode(hi, lo)); + lo = igvn->transform(new AddINode(lo, igvn->intcon(1))); cond = BoolTest::ge; } - lo = igvn->transform(new AddINode(lo, igvn->intcon(1))); } - } else { const TypeInt* failtype = filtered_int_type(igvn, n, proj); if (failtype != NULL) { diff --git a/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java b/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java index 328807a6560..74e53a0e826 100644 --- a/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java +++ b/hotspot/test/compiler/rangechecks/TestBadFoldCompare.java @@ -24,7 +24,8 @@ /* * @test * @bug 8085832 - * @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1 + * @bug 8135069 + * @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1 and x < 0 || x > -1 wrongly folded as x >u -1 * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestBadFoldCompare */ @@ -58,6 +59,34 @@ public class TestBadFoldCompare { helper2(i, 0, 0, flag); } + static boolean test3_taken; + + static void helper3(int i, int a, int b, boolean flag) { + if (flag) { + if (i < a || i > b - 1) { + test3_taken = true; + } + } + } + + static void test3(int i, boolean flag) { + helper3(i, 0, 0, flag); + } + + static boolean test4_taken; + + static 
void helper4(int i, int a, int b, boolean flag) { + if (flag) { + if (i > b - 1 || i < a) { + test4_taken = true; + } + } + } + + static void test4(int i, boolean flag) { + helper4(i, 0, 0, flag); + } + static public void main(String[] args) { boolean success = true; @@ -87,6 +116,35 @@ public class TestBadFoldCompare { System.out.println("Test2 failed"); success = false; } + + for (int i = 0; i < 20000; i++) { + helper3(5, 0, 10, (i%2)==0); + helper3(-1, 0, 10, (i%2)==0); + helper3(15, 0, 10, (i%2)==0); + test3(0, false); + } + test3_taken = false; + test3(0, true); + + if (!test3_taken) { + System.out.println("Test3 failed"); + success = false; + } + + for (int i = 0; i < 20000; i++) { + helper4(5, 0, 10, (i%2)==0); + helper4(-1, 0, 10, (i%2)==0); + helper4(15, 0, 10, (i%2)==0); + test4(0, false); + } + test4_taken = false; + test4(0, true); + + if (!test4_taken) { + System.out.println("Test4 failed"); + success = false; + } + if (!success) { throw new RuntimeException("Some tests failed"); } From 404fc5caa85c3216e5537d41b71d0080fc2afd29 Mon Sep 17 00:00:00 2001 From: Roland Westrelin Date: Tue, 8 Sep 2015 19:19:08 +0200 Subject: [PATCH 18/20] 8134974: 8130847 broken with loop predicates Pinned eliminated arraycopy loads in uncommon trap path for loop predicates may need to be moved Reviewed-by: kvn --- hotspot/src/share/vm/opto/loopPredicate.cpp | 7 +++ ...EliminatedArrayLoopPredicateCopyDeopt.java | 53 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 hotspot/test/compiler/arraycopy/TestEliminatedArrayLoopPredicateCopyDeopt.java diff --git a/hotspot/src/share/vm/opto/loopPredicate.cpp b/hotspot/src/share/vm/opto/loopPredicate.cpp index 7d25d965d91..8d202294807 100644 --- a/hotspot/src/share/vm/opto/loopPredicate.cpp +++ b/hotspot/src/share/vm/opto/loopPredicate.cpp @@ -112,6 +112,13 @@ ProjNode* PhaseIdealLoop::create_new_if_for_predicate(ProjNode* cont_proj, Node* if (_idom != NULL) { set_idom(call, rgn, dom_depth(rgn)); } + for 
(DUIterator_Fast imax, i = uncommon_proj->fast_outs(imax); i < imax; i++) { + Node* n = uncommon_proj->fast_out(i); + if (n->is_Load() || n->is_Store()) { + _igvn.replace_input_of(n, 0, rgn); + --i; --imax; + } + } } else { // Find region's edge corresponding to uncommon_proj for (; proj_index < rgn->req(); proj_index++) diff --git a/hotspot/test/compiler/arraycopy/TestEliminatedArrayLoopPredicateCopyDeopt.java b/hotspot/test/compiler/arraycopy/TestEliminatedArrayLoopPredicateCopyDeopt.java new file mode 100644 index 00000000000..d0ce772564d --- /dev/null +++ b/hotspot/test/compiler/arraycopy/TestEliminatedArrayLoopPredicateCopyDeopt.java @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/* + * @test + * @bug 8134974 + * @summary Cannot pin eliminated arraycopy loads for deopt state in uncommon trap path if it is a loop predicate unc + * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestEliminatedArrayLoopPredicateCopyDeopt + * + */ + +public class TestEliminatedArrayLoopPredicateCopyDeopt { + + static boolean test(int[] array_src) { + int[] array_dst = new int[10]; + System.arraycopy(array_src, 0, array_dst, 0, 10); + + for (int i = 0; i < 100; i++) { + array_src[i] = i; + } + if (array_dst[0] == 0) { + return true; + } + return false; + } + + static public void main(String[] args) { + int[] array_src = new int[100]; + for (int i = 0; i < 20000; i++) { + test(array_src); + } + } +} From fb955a7eff1c49982d165d22f552519f9b6c7cae Mon Sep 17 00:00:00 2001 From: Alexander Kulyakhtin Date: Mon, 14 Sep 2015 14:26:29 +0300 Subject: [PATCH 19/20] 8134641: CodelistTest.java fails with sun.misc.Unsafe.getUnsafe Excluding lines containing sun.misc.Unsafe.getUnsafe from the test input as getUnsafe is hidden from reflection Reviewed-by: sla --- hotspot/test/serviceability/dcmd/compiler/CodelistTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hotspot/test/serviceability/dcmd/compiler/CodelistTest.java b/hotspot/test/serviceability/dcmd/compiler/CodelistTest.java index a7e23bb9de4..95307f682f7 100644 --- a/hotspot/test/serviceability/dcmd/compiler/CodelistTest.java +++ b/hotspot/test/serviceability/dcmd/compiler/CodelistTest.java @@ -90,6 +90,9 @@ public class CodelistTest { if (methodPrintedInLogFormat.contains("MethodHandle")) { continue; } + if (methodPrintedInLogFormat.contains("sun.misc.Unsafe.getUnsafe")) { + continue; + } MethodIdentifierParser mf = new MethodIdentifierParser(methodPrintedInLogFormat); Method m = null; From ea76ede582beafa11402744c73acb3aff5d57422 Mon Sep 17 00:00:00 2001 From: Jiangli Zhou Date: Mon, 14 Sep 2015 14:55:01 -0400 Subject: [PATCH 20/20] 8135097: Unmap failure for executable 
memory on windows Use 'pd_release_memory' for executable memory in os::pd_unmap_memory(). Reviewed-by: iklam, coleenp --- hotspot/src/os/windows/vm/os_windows.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hotspot/src/os/windows/vm/os_windows.cpp b/hotspot/src/os/windows/vm/os_windows.cpp index 4ba6d704a9d..b193f3c3eee 100644 --- a/hotspot/src/os/windows/vm/os_windows.cpp +++ b/hotspot/src/os/windows/vm/os_windows.cpp @@ -4877,6 +4877,26 @@ char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset, // Returns true=success, otherwise false. bool os::pd_unmap_memory(char* addr, size_t bytes) { + MEMORY_BASIC_INFORMATION mem_info; + if (VirtualQuery(addr, &mem_info, sizeof(mem_info)) == 0) { + if (PrintMiscellaneous && Verbose) { + DWORD err = GetLastError(); + tty->print_cr("VirtualQuery() failed: GetLastError->%ld.", err); + } + return false; + } + + // Executable memory was not mapped using CreateFileMapping/MapViewOfFileEx. + // Instead, executable region was allocated using VirtualAlloc(). See + // pd_map_memory() above. + // + // The following flags should match the 'exec_access' flags used for + // VirtualProtect() in pd_map_memory(). + if (mem_info.Protect == PAGE_EXECUTE_READ || + mem_info.Protect == PAGE_EXECUTE_READWRITE) { + return pd_release_memory(addr, bytes); + } + BOOL result = UnmapViewOfFile(addr); if (result == 0) { if (PrintMiscellaneous && Verbose) {