8305486: Add split() variants that keep the delimiters to String and j.u.r.Pattern
Reviewed-by: jpai, rriggs
This commit is contained in:
parent
ad90fb6da3
commit
93ee19f58a
src/java.base/share/classes/java
test/jdk/java/util/regex
@ -3222,6 +3222,109 @@ public final class String
|
||||
* @since 1.4
|
||||
*/
|
||||
public String[] split(String regex, int limit) {
|
||||
// Plain split: hand off to the shared private helper with delimiter
// retention switched off (the trailing 'false').
return split(regex, limit, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits this string around matches of the given regular expression and
|
||||
* returns both the strings and the matching delimiters.
|
||||
*
|
||||
* <p> The array returned by this method contains each substring of this
|
||||
* string that is terminated by another substring that matches the given
|
||||
* expression or is terminated by the end of the string.
|
||||
* Each substring is immediately followed by the subsequence (the delimiter)
|
||||
* that matches the given expression, <em>except</em> for the last
|
||||
* substring, which is not followed by anything.
|
||||
* The substrings in the array and the delimiters are in the order in which
|
||||
* they occur in the input.
|
||||
* If the expression does not match any part of the input then the resulting
|
||||
* array has just one element, namely this string.
|
||||
*
|
||||
* <p> When there is a positive-width match at the beginning of this
|
||||
* string then an empty leading substring is included at the beginning
|
||||
* of the resulting array. A zero-width match at the beginning, however,
|
||||
* never produces such an empty leading substring, nor an empty delimiter.
|
||||
*
|
||||
* <p> The {@code limit} parameter controls the number of times the
|
||||
* pattern is applied and therefore affects the length of the resulting
|
||||
* array.
|
||||
* <ul>
|
||||
* <li> If the <i>limit</i> is positive then the pattern will be applied
|
||||
* at most <i>limit</i> - 1 times, the array's length will be
|
||||
* no greater than 2 × <i>limit</i> - 1, and the array's last
|
||||
* entry will contain all input beyond the last matched delimiter.</li>
|
||||
*
|
||||
* <li> If the <i>limit</i> is zero then the pattern will be applied as
|
||||
* many times as possible, the array can have any length, and trailing
|
||||
* empty strings, whether substrings or delimiters, will be discarded.</li>
|
||||
*
|
||||
* <li> If the <i>limit</i> is negative then the pattern will be applied
|
||||
* as many times as possible and the array can have any length.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p> The input {@code "boo:::and::foo"}, for example, yields the following
|
||||
* results with these parameters:
|
||||
*
|
||||
* <table class="plain" style="margin-left:2em;">
|
||||
* <caption style="display:none">Split example showing regex, limit, and result</caption>
|
||||
* <thead>
|
||||
* <tr>
|
||||
* <th scope="col">Regex</th>
|
||||
* <th scope="col">Limit</th>
|
||||
* <th scope="col">Result</th>
|
||||
* </tr>
|
||||
* </thead>
|
||||
* <tbody>
|
||||
* <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th>
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th>
|
||||
* <td>{@code { "boo", ":::", "and::foo" }}</td></tr>
|
||||
* <tr><!-- : -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
|
||||
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
|
||||
* <tr><!-- : -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
|
||||
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
|
||||
* <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th>
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
|
||||
* <tr><!-- o -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
|
||||
* <tr><!-- o -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
*
|
||||
* @apiNote An invocation of this method of the form
|
||||
* <i>str.</i>{@code splitWithDelimiters(}<i>regex</i>{@code ,} <i>n</i>{@code )}
|
||||
* yields the same result as the expression
|
||||
*
|
||||
* <blockquote>
|
||||
* <code>
|
||||
* {@link java.util.regex.Pattern}.{@link
|
||||
* java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link
|
||||
* java.util.regex.Pattern#splitWithDelimiters(CharSequence,int) splitWithDelimiters}(<i>str</i>, <i>n</i>)
|
||||
* </code>
|
||||
* </blockquote>
|
||||
*
|
||||
* @param regex
|
||||
* the delimiting regular expression
|
||||
*
|
||||
* @param limit
|
||||
* the result threshold, as described above
|
||||
*
|
||||
* @return the array of strings computed by splitting this string
|
||||
* around matches of the given regular expression, alternating
|
||||
* substrings and matching delimiters
|
||||
*
|
||||
* @since 21
|
||||
*/
|
||||
public String[] splitWithDelimiters(String regex, int limit) {
|
||||
// Same shared private helper that split(String, int) uses; the trailing
// 'true' asks it to also emit each matched delimiter.
return split(regex, limit, true);
|
||||
}
|
||||
|
||||
private String[] split(String regex, int limit, boolean withDelimiters) {
|
||||
/* fastpath if the regex is a
|
||||
* (1) one-char String and this character is not one of the
|
||||
* RegEx's meta characters ".$|()[{^?*+\\", or
|
||||
@ -3230,48 +3333,57 @@ public final class String
|
||||
*/
|
||||
char ch = 0;
|
||||
if (((regex.length() == 1 &&
|
||||
".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
|
||||
(regex.length() == 2 &&
|
||||
regex.charAt(0) == '\\' &&
|
||||
(((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
|
||||
((ch-'a')|('z'-ch)) < 0 &&
|
||||
((ch-'A')|('Z'-ch)) < 0)) &&
|
||||
(ch < Character.MIN_HIGH_SURROGATE ||
|
||||
ch > Character.MAX_LOW_SURROGATE))
|
||||
".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
|
||||
(regex.length() == 2 &&
|
||||
regex.charAt(0) == '\\' &&
|
||||
(((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
|
||||
((ch-'a')|('z'-ch)) < 0 &&
|
||||
((ch-'A')|('Z'-ch)) < 0)) &&
|
||||
(ch < Character.MIN_HIGH_SURROGATE ||
|
||||
ch > Character.MAX_LOW_SURROGATE))
|
||||
{
|
||||
// All the checks above can potentially be constant folded by
|
||||
// a JIT/AOT compiler when the regex is a constant string.
|
||||
// That requires method inlining of the checks, which is only
|
||||
// possible when the actual split logic is in a separate method
|
||||
// because the large split loop can usually not be inlined.
|
||||
return split(ch, limit);
|
||||
return split(ch, limit, withDelimiters);
|
||||
}
|
||||
return Pattern.compile(regex).split(this, limit);
|
||||
Pattern pattern = Pattern.compile(regex);
|
||||
return withDelimiters
|
||||
? pattern.splitWithDelimiters(this, limit)
|
||||
: pattern.split(this, limit);
|
||||
}
|
||||
|
||||
private String[] split(char ch, int limit) {
|
||||
private String[] split(char ch, int limit, boolean withDelimiters) {
|
||||
int matchCount = 0;
|
||||
int off = 0;
|
||||
int next = 0;
|
||||
int next;
|
||||
boolean limited = limit > 0;
|
||||
ArrayList<String> list = new ArrayList<>();
|
||||
String del = withDelimiters ? String.valueOf(ch) : null;
|
||||
while ((next = indexOf(ch, off)) != -1) {
|
||||
if (!limited || list.size() < limit - 1) {
|
||||
if (!limited || matchCount < limit - 1) {
|
||||
list.add(substring(off, next));
|
||||
if (withDelimiters) {
|
||||
list.add(del);
|
||||
}
|
||||
off = next + 1;
|
||||
++matchCount;
|
||||
} else { // last one
|
||||
//assert (list.size() == limit - 1);
|
||||
int last = length();
|
||||
list.add(substring(off, last));
|
||||
off = last;
|
||||
++matchCount;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// If no match was found, return this
|
||||
if (off == 0)
|
||||
return new String[]{this};
|
||||
return new String[] {this};
|
||||
|
||||
// Add remaining segment
|
||||
if (!limited || list.size() < limit)
|
||||
if (!limited || matchCount < limit)
|
||||
list.add(substring(off, length()));
|
||||
|
||||
// Construct result
|
||||
@ -3328,7 +3440,7 @@ public final class String
|
||||
* @since 1.4
|
||||
*/
|
||||
public String[] split(String regex) {
|
||||
return split(regex, 0);
|
||||
return split(regex, 0, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1302,6 +1302,100 @@ public final class Pattern
|
||||
* around matches of this pattern
|
||||
*/
|
||||
public String[] split(CharSequence input, int limit) {
|
||||
// Plain split: hand off to the shared private helper with delimiter
// retention switched off (the trailing 'false').
return split(input, limit, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits the given input sequence around matches of this pattern and
|
||||
* returns both the strings and the matching delimiters.
|
||||
*
|
||||
* <p> The array returned by this method contains each substring of the
|
||||
* input sequence that is terminated by another subsequence that matches
|
||||
* this pattern or is terminated by the end of the input sequence.
|
||||
* Each substring is immediately followed by the subsequence (the delimiter)
|
||||
* that matches this pattern, <em>except</em> for the last substring, which
|
||||
* is not followed by anything.
|
||||
* The substrings in the array and the delimiters are in the order in which
|
||||
* they occur in the input.
|
||||
* If this pattern does not match any subsequence of the input then the
|
||||
* resulting array has just one element, namely the input sequence in string
|
||||
* form.
|
||||
*
|
||||
* <p> When there is a positive-width match at the beginning of the input
|
||||
* sequence then an empty leading substring is included at the beginning
|
||||
* of the resulting array.
|
||||
* A zero-width match at the beginning, however, never produces such an empty
|
||||
* leading substring, nor an empty delimiter.
|
||||
*
|
||||
* <p> The {@code limit} parameter controls the number of times the
|
||||
* pattern is applied and therefore affects the length of the resulting
|
||||
* array.
|
||||
* <ul>
|
||||
* <li> If the <i>limit</i> is positive then the pattern will be applied
|
||||
* at most <i>limit</i> - 1 times, the array's length will be
|
||||
* no greater than 2 × <i>limit</i> - 1, and the array's last
|
||||
* entry will contain all input beyond the last matched delimiter.</li>
|
||||
*
|
||||
* <li> If the <i>limit</i> is zero then the pattern will be applied as
|
||||
* many times as possible, the array can have any length, and trailing
|
||||
* empty strings, whether substrings or delimiters, will be discarded.</li>
|
||||
*
|
||||
* <li> If the <i>limit</i> is negative then the pattern will be applied
|
||||
* as many times as possible and the array can have any length.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p> The input {@code "boo:::and::foo"}, for example, yields the following
|
||||
* results with these parameters:
|
||||
*
|
||||
* <table class="plain" style="margin-left:2em;">
|
||||
* <caption style="display:none">Split example showing regex, limit, and result</caption>
|
||||
* <thead>
|
||||
* <tr>
|
||||
* <th scope="col">Regex</th>
|
||||
* <th scope="col">Limit</th>
|
||||
* <th scope="col">Result</th>
|
||||
* </tr>
|
||||
* </thead>
|
||||
* <tbody>
|
||||
* <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th>
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th>
|
||||
* <td>{@code { "boo", ":::", "and::foo" }}</td></tr>
|
||||
* <tr><!-- : -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
|
||||
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
|
||||
* <tr><!-- : -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
|
||||
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
|
||||
* <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th>
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
|
||||
* <tr><!-- o -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
|
||||
* <tr><!-- o -->
|
||||
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>
|
||||
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr>
|
||||
* </tbody>
|
||||
* </table>
|
||||
*
|
||||
* @param input
|
||||
* The character sequence to be split
|
||||
*
|
||||
* @param limit
|
||||
* The result threshold, as described above
|
||||
*
|
||||
* @return The array of strings computed by splitting the input
|
||||
* around matches of this pattern, alternating
|
||||
* substrings and matching delimiters
|
||||
*
|
||||
* @since 21
|
||||
*/
|
||||
public String[] splitWithDelimiters(CharSequence input, int limit) {
|
||||
// Same shared private helper that split(CharSequence, int) uses; the
// trailing 'true' asks it to also emit each matched delimiter.
return split(input, limit, true);
|
||||
}
|
||||
|
||||
private String[] split(CharSequence input, int limit, boolean withDelimiters) {
|
||||
int matchCount = 0;
|
||||
int index = 0;
|
||||
boolean matchLimited = limit > 0;
|
||||
ArrayList<String> matchList = new ArrayList<>();
|
||||
@ -1309,7 +1403,7 @@ public final class Pattern
|
||||
|
||||
// Add segments before each match found
|
||||
while(m.find()) {
|
||||
if (!matchLimited || matchList.size() < limit - 1) {
|
||||
if (!matchLimited || matchCount < limit - 1) {
|
||||
if (index == 0 && index == m.start() && m.start() == m.end()) {
|
||||
// no empty leading substring included for zero-width match
|
||||
// at the beginning of the input char sequence.
|
||||
@ -1318,11 +1412,15 @@ public final class Pattern
|
||||
String match = input.subSequence(index, m.start()).toString();
|
||||
matchList.add(match);
|
||||
index = m.end();
|
||||
} else if (matchList.size() == limit - 1) { // last one
|
||||
String match = input.subSequence(index,
|
||||
input.length()).toString();
|
||||
if (withDelimiters) {
|
||||
matchList.add(input.subSequence(m.start(), index).toString());
|
||||
}
|
||||
++matchCount;
|
||||
} else if (matchCount == limit - 1) { // last one
|
||||
String match = input.subSequence(index, input.length()).toString();
|
||||
matchList.add(match);
|
||||
index = m.end();
|
||||
++matchCount;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1331,14 +1429,16 @@ public final class Pattern
|
||||
return new String[] {input.toString()};
|
||||
|
||||
// Add remaining segment
|
||||
if (!matchLimited || matchList.size() < limit)
|
||||
if (!matchLimited || matchCount < limit)
|
||||
matchList.add(input.subSequence(index, input.length()).toString());
|
||||
|
||||
// Construct result
|
||||
int resultSize = matchList.size();
|
||||
if (limit == 0)
|
||||
while (resultSize > 0 && matchList.get(resultSize-1).isEmpty())
|
||||
if (limit == 0) {
|
||||
while (resultSize > 0 && matchList.get(resultSize-1).isEmpty()) {
|
||||
resultSize--;
|
||||
}
|
||||
}
|
||||
String[] result = new String[resultSize];
|
||||
return matchList.subList(0, resultSize).toArray(result);
|
||||
}
|
||||
@ -1378,7 +1478,7 @@ public final class Pattern
|
||||
* around matches of this pattern
|
||||
*/
|
||||
public String[] split(CharSequence input) {
|
||||
return split(input, 0);
|
||||
return split(input, 0, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
122
test/jdk/java/util/regex/SplitWithDelimitersTest.java
Normal file
122
test/jdk/java/util/regex/SplitWithDelimitersTest.java
Normal file
@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 8305486
|
||||
* @summary Tests to exercise the split functionality added in the issue.
|
||||
* @run junit SplitWithDelimitersTest
|
||||
*/
|
||||
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.params.provider.Arguments.arguments;
|
||||
|
||||
public class SplitWithDelimitersTest {
|
||||
|
||||
private static String[] dropOddIndexed(String[] a, int limit) {
|
||||
String[] r = new String[(a.length + 1) / 2];
|
||||
for (int i = 0; i < a.length; i += 2) {
|
||||
r[i / 2] = a[i];
|
||||
}
|
||||
int len = r.length;
|
||||
if (limit == 0) {
|
||||
/* Also drop trailing empty strings */
|
||||
for (; len > 0 && r[len - 1].isEmpty(); --len); // empty body
|
||||
}
|
||||
return len < r.length ? Arrays.copyOf(r, len) : r;
|
||||
}
|
||||
|
||||
static Arguments[] testSplit() {
|
||||
return new Arguments[] {
|
||||
arguments(new String[] {"b", "o", "", "o", ":::and::f", "o", "", "o", ""},
|
||||
"boo:::and::foo", "o", 5),
|
||||
arguments(new String[] {"b", "o", "", "o", ":::and::f", "o", "o"},
|
||||
"boo:::and::foo", "o", 4),
|
||||
arguments(new String[] {"b", "o", "", "o", ":::and::foo"},
|
||||
"boo:::and::foo", "o", 3),
|
||||
arguments(new String[] {"b", "o", "o:::and::foo"},
|
||||
"boo:::and::foo", "o", 2),
|
||||
arguments(new String[] {"boo:::and::foo"},
|
||||
"boo:::and::foo", "o", 1),
|
||||
arguments(new String[] {"b", "o", "", "o", ":::and::f", "o", "", "o"},
|
||||
"boo:::and::foo", "o", 0),
|
||||
arguments(new String[] {"b", "o", "", "o", ":::and::f", "o", "", "o", ""},
|
||||
"boo:::and::foo", "o", -1),
|
||||
|
||||
arguments(new String[] {"boo", ":::", "and", "::", "foo"},
|
||||
"boo:::and::foo", ":+", 3),
|
||||
arguments(new String[] {"boo", ":::", "and::foo"},
|
||||
"boo:::and::foo", ":+", 2),
|
||||
arguments(new String[] {"boo:::and::foo"},
|
||||
"boo:::and::foo", ":+", 1),
|
||||
arguments(new String[] {"boo", ":::", "and", "::", "foo"},
|
||||
"boo:::and::foo", ":+", 0),
|
||||
arguments(new String[] {"boo", ":::", "and", "::", "foo"},
|
||||
"boo:::and::foo", ":+", -1),
|
||||
|
||||
arguments(new String[] {"b", "", "b", "", ""},
|
||||
"bb", "a*|b*", 3),
|
||||
arguments(new String[] {"b", "", "b"},
|
||||
"bb", "a*|b*", 2),
|
||||
arguments(new String[] {"bb"},
|
||||
"bb", "a*|b*", 1),
|
||||
arguments(new String[] {"b", "", "b"},
|
||||
"bb", "a*|b*", 0),
|
||||
arguments(new String[] {"b", "", "b", "", ""},
|
||||
"bb", "a*|b*", -1),
|
||||
|
||||
arguments(new String[] {"", "bb", "", "", ""},
|
||||
"bb", "b*|a*", 3),
|
||||
arguments(new String[] {"", "bb", ""},
|
||||
"bb", "b*|a*", 2),
|
||||
arguments(new String[] {"bb"},
|
||||
"bb", "b*|a*", 1),
|
||||
arguments(new String[] {"", "bb"},
|
||||
"bb", "b*|a*", 0),
|
||||
arguments(new String[] {"", "bb", "", "", ""},
|
||||
"bb", "b*|a*", -1),
|
||||
};
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource
|
||||
void testSplit(String[] expected, String target, String regex, int limit) {
|
||||
String[] computedWith = target.splitWithDelimiters(regex, limit);
|
||||
assertArrayEquals(expected, computedWith);
|
||||
String[] patComputedWith = Pattern.compile(regex).splitWithDelimiters(target, limit);
|
||||
assertArrayEquals(computedWith, patComputedWith);
|
||||
|
||||
String[] computedWithout = target.split(regex, limit);
|
||||
assertArrayEquals(dropOddIndexed(expected, limit), computedWithout);
|
||||
String[] patComputedWithout = Pattern.compile(regex).split(target, limit);
|
||||
assertArrayEquals(computedWithout, patComputedWithout);
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user