diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java index b23af8e5f14..bec7d5704df 100644 --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/util/StringUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,7 +25,9 @@ package com.sun.tools.javac.util; +import java.util.HashMap; import java.util.Locale; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -67,4 +69,186 @@ public class StringUtils { return m.find(startIndex) ? m.start() : -1; } + /**Call {@link #of(String, String)} to calculate the distance. + * + *

Usage Examples

+ * + * Pick top three vocabulary words whose normalized distance from + * the misspelled word is no greater than one-third. + * + * {@snippet : + * record Pair(String word, int distance) { } + * + * var suggestions = vocabulary.stream() + * .map(v -> new Pair(v, DamerauLevenshteinDistance.of(v, misspelledWord))) + * .filter(p -> Double.compare(1.0 / 3, ((double) p.distance()) / p.word().length()) >= 0) + * .sorted(Comparator.comparingDouble(Pair::distance)) + * .limit(3) + * .toList(); + * } + */ + public static final class DamerauLevenshteinDistance { + + /* + * This is a Java implementation of the algorithm from "An Extension of + * the String-to-String Correction Problem" by R. Lowrance and + * R. A. Wagner (https://dl.acm.org/doi/10.1145/321879.321880). + * That algorithm is O(|a|*|b|) in both space and time. + * + * This implementation encapsulates arrays and (most of) strings behind + * methods to accommodate for algorithm indexing schemes which are -1, + * 0, and 1 based and to offset memory and performance overhead if any + * strings in the pair contain non-ASCII symbols. + */ + + private final int INF; + private final int[][] h; + private final String a; + private final String b; + + private static final int Wi = 1; // insert + private static final int Wd = 1; // delete + private static final int Wc = 1; // change + private static final int Ws = 1; // interchange + + static { + assert 2L * Ws >= Wi + Wd; // algorithm requirement + } + + private int[] smallDA; + private Map<Character, Integer> bigDA; + + /** {@return the edit distance between two strings} + * The distance returned from this method has the following properties: + *
    + *
  1. {@code (a.equals(b) && of(a, b) == 0) || (!a.equals(b) && of(a, b) > 0)} + *
  2. {@code of(a, b) == of(b, a)} + *
  3. {@code of(a, b) + of(b, c) >= of(a, c)} + *
+ * + * @implSpec + * This method is safe to be called by multiple threads. + * @throws NullPointerException if any of the two strings are null + * @throws ArithmeticException if any step of the calculation + * overflows an int + */ + public static int of(String a, String b) { + return new DamerauLevenshteinDistance(a, b).calculate(); + } + + private int calculate() { + for (int i = 0; i <= a.length(); i++) { + h(i, 0, i * Wd); + h(i, -1, INF); + } + for (int j = 0; j <= b.length(); j++) { + h(0, j, j * Wi); + h(-1, j, INF); + } + // algorithm's line #8 that initializes DA is not needed here + // because this class encapsulates DA and initializes it + // separately + for (int i = 1; i <= a.length(); i++) { + int db = 0; + for (int j = 1; j <= b.length(); j++) { + int i1 = da(characterAt(b, j)); + int j1 = db; + boolean eq = characterAt(a, i) == characterAt(b, j); + int d = eq ? 0 : Wc; + if (eq) { + db = j; + } + int m = min(h(i - 1, j - 1) + d, + h(i, j - 1) + Wi, + h(i - 1, j) + Wd, + h(i1 - 1, j1 - 1) + (i - i1 - 1) * Wd + Ws + (j - j1 - 1) * Wi); + h(i, j, m); + } + da(characterAt(a, i), i); + } + return h(a.length(), b.length()); + } + + private int characterAt(String s, int i) { + return s.charAt(i - 1); + } + + private void h(int i, int j, int value) { + h[i + 1][j + 1] = value; + } + + private int h(int i, int j) { + return h[i + 1][j + 1]; + } + + /* + * This implementation works with UTF-16 strings, but favours strings + * that comprise ASCII characters. Measuring distance between a pair + * of ASCII strings is likely to be a typical use case for this + * implementation. + * + * If a character for which the value is to be stored does not fit into + * the ASCII range, this implementation switches to a different storage + * dynamically. Since neither string lengths nor character values + * change, any state accumulated so far, including any loops and local + * variables, remains valid. 
+ * + * Note, that if the provided character were a surrogate and this + * implementation dealt with code points, which it does not, dynamic + * switching of the storage would not be enough. The complete + * representation would need to be changed. That would entail + * discarding any accumulated state and repeating the computation. + */ + + private int da(int i) { + if (smallDA != null && i < '\u0080') { + return smallDA[i]; + } + // if a character cannot be found, it means that the character + // hasn't been updated, which means that the associated value + // is the default value, which is 0 + if (bigDA != null) { + Integer v = bigDA.get((char) i); + return v == null ? 0 : v; + } else { + return 0; + } + } + + private void da(int i, int value) { + if (bigDA == null && i < '\u0080') { + if (smallDA == null) { + smallDA = new int[128]; // one slot per ASCII value 0x00..0x7F; 127 slots overflowed on '\u007f' + } + smallDA[i] = value; + } else { + if (bigDA == null) { + bigDA = new HashMap<>(); + if (smallDA != null) { // rebuild DA accumulated so far + for (int j = 0; j < smallDA.length; j++) { + int v = smallDA[j]; + if (v != 0) + bigDA.put((char) j, v); + } + smallDA = null; // no longer needed + } + } + bigDA.put((char) i, value); + } + assert smallDA == null ^ bigDA == null; // exactly one in use + } + + private static int min(int a, int b, int c, int d) { + return Math.min(a, Math.min(b, Math.min(c, d))); + } + + private DamerauLevenshteinDistance(String a, String b) { + this.a = a; + this.b = b; + this.h = new int[this.a.length() + 2][this.b.length() + 2]; + INF = this.a.length() * Wd + this.b.length() * Wi + 1; + if (INF < 0) + throw new ArithmeticException("Overflow"); + } + } } diff --git a/test/langtools/tools/javac/util/StringUtilsTest.java b/test/langtools/tools/javac/util/StringUtilsTest.java index 4c6478b12f1..2f456f9c97c 100644 --- a/test/langtools/tools/javac/util/StringUtilsTest.java +++ b/test/langtools/tools/javac/util/StringUtilsTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2022, Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,15 +23,17 @@ /** * @test - * @bug 8029800 8043186 + * @bug 8029800 8043186 8313693 * @summary Unit test StringUtils * @modules jdk.compiler/com.sun.tools.javac.util * @run main StringUtilsTest */ +import java.util.List; import java.util.Locale; import java.util.Objects; import com.sun.tools.javac.util.StringUtils; +import com.sun.tools.javac.util.StringUtils.DamerauLevenshteinDistance; public class StringUtilsTest { public static void main(String... args) throws Exception { @@ -53,6 +55,57 @@ public class StringUtilsTest { assertEquals(2, StringUtils.indexOfIgnoreCase(" lookFor", "lookfor")); assertEquals(11, StringUtils.indexOfIgnoreCase(" lookFor LOOKfor", "lookfor", 11)); assertEquals(2, StringUtils.indexOfIgnoreCase("\u0130\u0130lookFor", "lookfor")); + + //verify Damerau-Levenshtein + + assertEquals(3, DamerauLevenshteinDistance.of("kitten", "sitting")); + // note that the restricted Damerau-Levenshtein distance would be 3, not 2: + assertEquals(2, DamerauLevenshteinDistance.of("ca", "abc")); + //verify strings comprising only non-LATIN1 characters + assertEquals(1, DamerauLevenshteinDistance.of("\u0438\u044e\u043d\u044c", + "\u0438\u044e\u043b\u044c")); + //verify strings comprising mixed characters: non-LATIN1 and ASCII + // it's important to start with ASCII characters, so that we + // test switching a storage (see current implementation) + assertEquals(2, DamerauLevenshteinDistance.of("c\u043ede", "cod\u0435")); + + //verify metric properties + for (String a : List.of("", "a", "b", "abc")) { + for (String b : List.of("", "a", "b", "abc")) { + assertNonNegativity(a, b); + assertSymmetry(a, b); + } + } + + for (String a : List.of("", "a", "b", "c")) { + for (String b : List.of("ab", "ac", "bc")) { + for (String c : 
List.of("abc", "bca", "cab")) { + assertTriangleInequality(a, b, c); + assertTriangleInequality(b, c, a); + assertTriangleInequality(c, a, b); + } + } + } + } + + private void assertNonNegativity(String a, String b) { + if (a.equals(b)) { + assertEquals(0, DamerauLevenshteinDistance.of(a, b)); + } else { + assertTrue(DamerauLevenshteinDistance.of(a, b) > 0); + } + } + + private void assertSymmetry(String a, String b) { + assertEquals(DamerauLevenshteinDistance.of(a, b), + DamerauLevenshteinDistance.of(b, a)); + } + + private void assertTriangleInequality(String a, String b, String c) { + int ab = DamerauLevenshteinDistance.of(a, b); + int bc = DamerauLevenshteinDistance.of(b, c); + int ac = DamerauLevenshteinDistance.of(a, c); + assertTrue(ab + bc >= ac); } void assertEquals(String expected, String actual) { @@ -66,4 +119,10 @@ public class StringUtilsTest { throw new IllegalStateException("expected=" + expected + "; actual=" + actual); } } + + void assertTrue(boolean cond) { + if (!cond) { + throw new IllegalStateException(); + } + } }