8247546: Pattern matching does not skip correctly over supplementary characters

Reviewed-by: joehw
This commit is contained in:
Naoto Sato 2020-07-29 09:49:43 -07:00
parent 6e32338330
commit 4946a162aa
3 changed files with 36 additions and 8 deletions

View File

@ -1049,9 +1049,10 @@ public final class Pattern
private transient int patternLength; private transient int patternLength;
/** /**
* If the Start node might possibly match supplementary characters. * If the Start node might possibly match supplementary or surrogate
* code points.
* It is set to true during compiling if * It is set to true during compiling if
* (1) There is supplementary char in pattern, or * (1) There is supplementary or surrogate code point in pattern, or
* (2) There is complement node of a "family" CharProperty * (2) There is complement node of a "family" CharProperty
*/ */
private transient boolean hasSupplementary; private transient boolean hasSupplementary;
@ -2948,8 +2949,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return null; return null;
if (p instanceof BmpCharPredicate) if (p instanceof BmpCharPredicate)
return new BmpCharProperty((BmpCharPredicate)p); return new BmpCharProperty((BmpCharPredicate)p);
else else {
hasSupplementary = true;
return new CharProperty(p); return new CharProperty(p);
}
} }
/** /**
@ -5785,18 +5788,18 @@ NEXT: while (i <= last) {
} }
/** /**
* Charactrs within a explicit value range * Characters within a explicit value range
*/ */
static CharPredicate Range(int lower, int upper) { static CharPredicate Range(int lower, int upper) {
if (upper < Character.MIN_HIGH_SURROGATE || if (upper < Character.MIN_HIGH_SURROGATE ||
lower > Character.MAX_HIGH_SURROGATE && lower > Character.MAX_LOW_SURROGATE &&
upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
return ch -> inRange(lower, ch, upper); return ch -> inRange(lower, ch, upper);
} }
/** /**
* Charactrs within a explicit value range in a case insensitive manner. * Characters within a explicit value range in a case insensitive manner.
*/ */
static CharPredicate CIRange(int lower, int upper) { static CharPredicate CIRange(int lower, int upper) {
return ch -> inRange(lower, ch, upper) || return ch -> inRange(lower, ch, upper) ||

View File

@ -36,7 +36,7 @@
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
* 8216332 8214245 8237599 8241055 * 8216332 8214245 8237599 8241055 8247546
* *
* @library /test/lib * @library /test/lib
* @library /lib/testlibrary/java/lang * @library /lib/testlibrary/java/lang

View File

@ -1,5 +1,5 @@
// //
// Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
// //
// This code is free software; you can redistribute it and/or modify it // This code is free software; you can redistribute it and/or modify it
@ -129,6 +129,31 @@ true \ud800\udc00pqr 0
///\ud800\udc00 ///\ud800\udc00
///false 0 ///false 0
// unpaired surrogate should match
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
xxx\udca9\ud83dyyy
true \udca9 0
// surrogates in a supplementary character should not match
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
\ud83d\udca9
false 0
// unpaired surrogate should match
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
xxx\udca9\ud83dyyy
true \udca9 0
// surrogates part of a supplementary character should not match
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
\ud83d\udca9
false 0
// low surrogate part of a supplementary character should not match
[\x{dc00}-\x{dfff}]
\ud83d\udca9
false 0
// use of x modifier // use of x modifier
\ud800\udc61bc(?x)bl\ud800\udc61h \ud800\udc61bc(?x)bl\ud800\udc61h
\ud800\udc61bcbl\ud800\udc61h \ud800\udc61bcbl\ud800\udc61h