From 4946a162aa6119416cfdb94b8e3d09200f28d837 Mon Sep 17 00:00:00 2001 From: Naoto Sato Date: Wed, 29 Jul 2020 09:49:43 -0700 Subject: [PATCH] 8247546: Pattern matching does not skip correctly over supplementary characters Reviewed-by: joehw --- .../classes/java/util/regex/Pattern.java | 15 ++++++----- test/jdk/java/util/regex/RegExTest.java | 2 +- .../util/regex/SupplementaryTestCases.txt | 27 ++++++++++++++++++- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index c0ccc531dbf..8304f05b1d7 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -1049,9 +1049,10 @@ public final class Pattern private transient int patternLength; /** - * If the Start node might possibly match supplementary characters. + * If the Start node might possibly match supplementary or surrogate + * code points. * It is set to true during compiling if - * (1) There is supplementary char in pattern, or + * (1) There is supplementary or surrogate code point in pattern, or * (2) There is complement node of a "family" CharProperty */ private transient boolean hasSupplementary; @@ -2948,8 +2949,10 @@ loop: for(int x=0, offset=0; x Character.MAX_HIGH_SURROGATE && + lower > Character.MAX_LOW_SURROGATE && upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); return ch -> inRange(lower, ch, upper); } /** - * Charactrs within a explicit value range in a case insensitive manner. + * Characters within a explicit value range in a case insensitive manner. */ static CharPredicate CIRange(int lower, int upper) { return ch -> inRange(lower, ch, upper) || diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java index acac49bd4e9..387b01e5f62 100644 --- a/test/jdk/java/util/regex/RegExTest.java +++ b/test/jdk/java/util/regex/RegExTest.java @@ -36,7 +36,7 @@ * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 - * 8216332 8214245 8237599 8241055 + * 8216332 8214245 8237599 8241055 8247546 * * @library /test/lib * @library /lib/testlibrary/java/lang diff --git a/test/jdk/java/util/regex/SupplementaryTestCases.txt b/test/jdk/java/util/regex/SupplementaryTestCases.txt index 644a91b6be7..8cd1b91b3fc 100644 --- a/test/jdk/java/util/regex/SupplementaryTestCases.txt +++ b/test/jdk/java/util/regex/SupplementaryTestCases.txt @@ -1,5 +1,5 @@ // -// Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -129,6 +129,31 @@ true \ud800\udc00pqr 0 ///\ud800\udc00 ///false 0 +// unpaired surrogate should match +[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}] +xxx\udca9\ud83dyyy +true \udca9 0 + +// surrogates in a supplementary character should not match +[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}] +\ud83d\udca9 +false 0 + +// unpaired surrogate should match +[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}] +xxx\udca9\ud83dyyy +true \udca9 0 + +// surrogates part of a supplementary character should not match +[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}] +\ud83d\udca9 +false 0 + +// low surrogate part of a supplementary character should not match +[\x{dc00}-\x{dfff}] +\ud83d\udca9 +false 0 + // use of x modifier \ud800\udc61bc(?x)bl\ud800\udc61h \ud800\udc61bcbl\ud800\udc61h