diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java
index b23af8e5f14..bec7d5704df 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -25,7 +25,9 @@
package com.sun.tools.javac.util;
+import java.util.HashMap;
import java.util.Locale;
+import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -67,4 +69,186 @@ public class StringUtils {
return m.find(startIndex) ? m.start() : -1;
}
+    /**Call {@link #of(String, String)} to calculate the distance.
+     *
+     * <h2>Usage Examples</h2>
+     *
+     * Pick top three vocabulary words whose normalized distance from
+     * the misspelled word is no greater than one-third.
+     *
+     * {@snippet :
+     * record Pair(String word, int distance) { }
+     *
+     * var suggestions = vocabulary.stream()
+     *         .map(v -> new Pair(v, DamerauLevenshteinDistance.of(v, misspelledWord)))
+     *         .filter(p -> Double.compare(1.0 / 3, ((double) p.distance()) / p.word().length()) >= 0)
+     *         .sorted(Comparator.comparingInt(Pair::distance))
+     *         .limit(3)
+     *         .toList();
+     * }
+     */
+    public static final class DamerauLevenshteinDistance {
+
+        /*
+         * This is a Java implementation of the algorithm from "An Extension of
+         * the String-to-String Correction Problem" by R. Lowrance and
+         * R. A. Wagner (https://dl.acm.org/doi/10.1145/321879.321880).
+         * That algorithm is O(|a|*|b|) in both space and time.
+         *
+         * This implementation encapsulates arrays and (most of) strings behind
+         * methods to accommodate for algorithm indexing schemes which are -1,
+         * 0, and 1 based and to offset memory and performance overhead if any
+         * strings in the pair contain non-ASCII symbols.
+         */
+
+        // Exclusive upper bound of the character values stored in smallDA;
+        // smallDA must have exactly this many slots so that every value
+        // accepted by the "i < ASCII_LIMIT" guards is a valid index.
+        private static final int ASCII_LIMIT = 0x80;
+
+        private static final int Wi = 1; // insert
+        private static final int Wd = 1; // delete
+        private static final int Wc = 1; // change
+        private static final int Ws = 1; // interchange
+
+        static {
+            assert 2L * Ws >= Wi + Wd; // algorithm requirement
+        }
+
+        // Sentinel strictly greater than any distance between a and b.
+        private final int INF;
+        // Distance matrix, shifted by one in both dimensions so that the
+        // algorithm's index -1 maps to array index 0 (see h(int, int)).
+        private final int[][] h;
+        private final String a;
+        private final String b;
+
+        // DA of the algorithm: for a character, the last (1-based) row of
+        // "a" in which it occurred, or 0 if it has not occurred yet.
+        // Exactly one of the two storages is in use once a value is stored.
+        private int[] smallDA;
+        private Map<Character, Integer> bigDA;
+
+        /** {@return the edit distance between two strings}
+         * The distance returned from this method has the following properties:
+         * <ul>
+         *    <li> {@code (a.equals(b) && of(a, b) == 0) || (!a.equals(b) && of(a, b) > 0)}
+         *    <li> {@code of(a, b) == of(b, a)}
+         *    <li> {@code of(a, b) + of(b, c) >= of(a, c)}
+         * </ul>
+         *
+         * @implSpec
+         * This method is safe to be called by multiple threads.
+         * @throws NullPointerException if any of the two strings are null
+         * @throws ArithmeticException if any step of the calculation
+         *                             overflows an int
+         */
+        public static int of(String a, String b) {
+            // all mutable state lives in a fresh, unshared instance
+            return new DamerauLevenshteinDistance(a, b).calculate();
+        }
+
+        private int calculate() {
+            // Boundaries: column 0 / row 0 hold the cost of deleting or
+            // inserting a whole prefix; index -1 holds INF so that an
+            // interchange with a character not seen yet can never win min().
+            h(-1, -1, INF);
+            for (int i = 0; i <= a.length(); i++) {
+                h(i, 0, i * Wd);
+                h(i, -1, INF);
+            }
+            for (int j = 0; j <= b.length(); j++) {
+                h(0, j, j * Wi);
+                h(-1, j, INF);
+            }
+            // algorithm's line #8 that initializes DA is not needed here
+            // because this class encapsulates DA and initializes it
+            // separately
+            for (int i = 1; i <= a.length(); i++) {
+                int db = 0; // last column in this row where a[i] == b[j]
+                for (int j = 1; j <= b.length(); j++) {
+                    int i1 = da(characterAt(b, j)); // last row where b[j] occurred in a
+                    int j1 = db;
+                    boolean eq = characterAt(a, i) == characterAt(b, j);
+                    int d = eq ? 0 : Wc;
+                    if (eq) {
+                        db = j;
+                    }
+                    int m = min(h(i - 1, j - 1) + d,   // keep or change
+                                h(i, j - 1) + Wi,      // insert
+                                h(i - 1, j) + Wd,      // delete
+                                // interchange a[i1] with b[j1], deleting and
+                                // inserting whatever lies in between
+                                h(i1 - 1, j1 - 1) + (i - i1 - 1) * Wd + Ws + (j - j1 - 1) * Wi);
+                    h(i, j, m);
+                }
+                da(characterAt(a, i), i);
+            }
+            return h(a.length(), b.length());
+        }
+
+        // Returns the character at the algorithm's 1-based position i.
+        private int characterAt(String s, int i) {
+            return s.charAt(i - 1);
+        }
+
+        // Stores a matrix cell; i and j may be as low as -1.
+        private void h(int i, int j, int value) {
+            h[i + 1][j + 1] = value;
+        }
+
+        // Reads a matrix cell; i and j may be as low as -1.
+        private int h(int i, int j) {
+            return h[i + 1][j + 1];
+        }
+
+        /*
+         * This implementation works with UTF-16 strings, but favours strings
+         * that comprise ASCII characters. Measuring distance between a pair
+         * of ASCII strings is likely to be a typical use case for this
+         * implementation.
+         *
+         * If a character for which the value is to be stored does not fit into
+         * the ASCII range, this implementation switches to a different storage
+         * dynamically. Since neither string lengths nor character values
+         * change, any state accumulated so far, including any loops and local
+         * variables, remains valid.
+         *
+         * Note, that if the provided character were a surrogate and this
+         * implementation dealt with code points, which it does not, dynamic
+         * switching of the storage would not be enough. The complete
+         * representation would need to be changed. That would entail
+         * discarding any accumulated state and repeating the computation.
+         */
+
+        // Returns the value stored for character i, or 0 if none was stored.
+        private int da(int i) {
+            if (smallDA != null && i < ASCII_LIMIT) {
+                return smallDA[i];
+            }
+            // if a character cannot be found, it means that the character
+            // hasn't been updated, which means that the associated value
+            // is the default value, which is 0
+            if (bigDA != null) {
+                Integer v = bigDA.get((char) i);
+                return v == null ? 0 : v;
+            } else {
+                return 0;
+            }
+        }
+
+        // Stores value for character i, switching from the array to the map
+        // the first time a non-ASCII character has to be stored.
+        private void da(int i, int value) {
+            if (bigDA == null && i < ASCII_LIMIT) {
+                if (smallDA == null) {
+                    smallDA = new int[ASCII_LIMIT];
+                }
+                smallDA[i] = value;
+            } else {
+                if (bigDA == null) {
+                    bigDA = new HashMap<>();
+                    if (smallDA != null) { // rebuild DA accumulated so far
+                        for (int j = 0; j < smallDA.length; j++) {
+                            int v = smallDA[j];
+                            if (v != 0)
+                                bigDA.put((char) j, v);
+                        }
+                        smallDA = null; // no longer needed
+                    }
+                }
+                bigDA.put((char) i, value);
+            }
+            assert (smallDA == null) ^ (bigDA == null); // exactly one in use
+        }
+
+        private static int min(int a, int b, int c, int d) {
+            return Math.min(a, Math.min(b, Math.min(c, d)));
+        }
+
+        private DamerauLevenshteinDistance(String a, String b) {
+            this.a = a;
+            this.b = b;
+            // Compute the sentinel in long and validate it before allocating
+            // the matrix, so that an overflow is always reported as an
+            // ArithmeticException; a.length()/b.length() also provide the
+            // documented NullPointerException.
+            long inf = (long) a.length() * Wd + (long) b.length() * Wi + 1;
+            if (inf > Integer.MAX_VALUE)
+                throw new ArithmeticException("Overflow");
+            INF = (int) inf;
+            this.h = new int[a.length() + 2][b.length() + 2];
+        }
+    }
}
diff --git a/test/langtools/tools/javac/util/StringUtilsTest.java b/test/langtools/tools/javac/util/StringUtilsTest.java
index 4c6478b12f1..2f456f9c97c 100644
--- a/test/langtools/tools/javac/util/StringUtilsTest.java
+++ b/test/langtools/tools/javac/util/StringUtilsTest.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -23,15 +23,17 @@
/**
* @test
- * @bug 8029800 8043186
+ * @bug 8029800 8043186 8313693
* @summary Unit test StringUtils
* @modules jdk.compiler/com.sun.tools.javac.util
* @run main StringUtilsTest
*/
+import java.util.List;
import java.util.Locale;
import java.util.Objects;
import com.sun.tools.javac.util.StringUtils;
+import com.sun.tools.javac.util.StringUtils.DamerauLevenshteinDistance;
public class StringUtilsTest {
public static void main(String... args) throws Exception {
@@ -53,6 +55,57 @@ public class StringUtilsTest {
assertEquals(2, StringUtils.indexOfIgnoreCase(" lookFor", "lookfor"));
assertEquals(11, StringUtils.indexOfIgnoreCase(" lookFor LOOKfor", "lookfor", 11));
assertEquals(2, StringUtils.indexOfIgnoreCase("\u0130\u0130lookFor", "lookfor"));
+
+ //verify Damerau-Levenshtein
+
+ assertEquals(3, DamerauLevenshteinDistance.of("kitten", "sitting"));
+ // note that the restricted Damerau-Levenshtein distance would be 3, not 2:
+ assertEquals(2, DamerauLevenshteinDistance.of("ca", "abc"));
+ //verify strings comprising only non-LATIN1 characters
+ assertEquals(1, DamerauLevenshteinDistance.of("\u0438\u044e\u043d\u044c",
+ "\u0438\u044e\u043b\u044c"));
+ //verify strings comprising mixed characters: non-LATIN1 and ASCII
+ // it's important to start with ASCII characters, so that we
+ // test switching a storage (see current implementation)
+ assertEquals(2, DamerauLevenshteinDistance.of("c\u043ede", "cod\u0435"));
+
+ //verify metric properties
+ for (String a : List.of("", "a", "b", "abc")) {
+ for (String b : List.of("", "a", "b", "abc")) {
+ assertNonNegativity(a, b);
+ assertSymmetry(a, b);
+ }
+ }
+
+ for (String a : List.of("", "a", "b", "c")) {
+ for (String b : List.of("ab", "ac", "bc")) {
+ for (String c : List.of("abc", "bca", "cab")) {
+ assertTriangleInequality(a, b, c);
+ assertTriangleInequality(b, c, a);
+ assertTriangleInequality(c, a, b);
+ }
+ }
+ }
+ }
+
+    // Verifies that the distance is zero for equal strings and strictly
+    // positive for strings that differ.
+    private void assertNonNegativity(String a, String b) {
+        boolean same = a.equals(b);
+        int distance = DamerauLevenshteinDistance.of(a, b);
+        if (!same) {
+            assertTrue(distance > 0);
+            return;
+        }
+        assertEquals(0, distance);
+    }
+
+    // Verifies that the distance does not depend on the argument order.
+    private void assertSymmetry(String a, String b) {
+        int forward = DamerauLevenshteinDistance.of(a, b);
+        int backward = DamerauLevenshteinDistance.of(b, a);
+        assertEquals(forward, backward);
+    }
+
+    // Verifies that going from a to c through b is never cheaper than
+    // going from a to c directly.
+    private void assertTriangleInequality(String a, String b, String c) {
+        int viaFirstLeg = DamerauLevenshteinDistance.of(a, b);
+        int viaSecondLeg = DamerauLevenshteinDistance.of(b, c);
+        int direct = DamerauLevenshteinDistance.of(a, c);
+        assertTrue(direct <= viaFirstLeg + viaSecondLeg);
     }
void assertEquals(String expected, String actual) {
@@ -66,4 +119,10 @@ public class StringUtilsTest {
throw new IllegalStateException("expected=" + expected + "; actual=" + actual);
}
}
+
+    // Fails the test with an IllegalStateException unless cond holds.
+    void assertTrue(boolean cond) {
+        if (cond) {
+            return;
+        }
+        throw new IllegalStateException();
+    }
}