8313693: Introduce an internal utility for the Damerau–Levenshtein distance calculation

Reviewed-by: jlahoda, jjg
This commit is contained in:
Pavel Rappo 2023-08-09 16:08:23 +00:00
parent 360f65d7b1
commit 593ba2fe47
2 changed files with 246 additions and 3 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,7 +25,9 @@
package com.sun.tools.javac.util;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -67,4 +69,186 @@ public class StringUtils {
return m.find(startIndex) ? m.start() : -1;
}
/**
 * Computes the Damerau-Levenshtein edit distance between two strings.
 * Call {@link #of(String, String)} to calculate the distance.
 *
 * <h2>Usage Examples</h2>
 *
 * Pick top three vocabulary words whose normalized distance from
 * the misspelled word is no greater than one-third.
 *
 * {@snippet :
 * record Pair(String word, int distance) { }
 *
 * var suggestions = vocabulary.stream()
 *         .map(v -> new Pair(v, DamerauLevenshteinDistance.of(v, misspelledWord)))
 *         .filter(p -> Double.compare(1.0 / 3, ((double) p.distance()) / p.word().length()) >= 0)
 *         .sorted(Comparator.comparingDouble(Pair::distance))
 *         .limit(3)
 *         .toList();
 * }
 */
public static final class DamerauLevenshteinDistance {

    /*
     * This is a Java implementation of the algorithm from "An Extension of
     * the String-to-String Correction Problem" by R. Lowrance and
     * R. A. Wagner (https://dl.acm.org/doi/10.1145/321879.321880).
     * That algorithm is O(|a|*|b|) in both space and time.
     *
     * This implementation encapsulates arrays and (most of) strings behind
     * methods to accommodate for algorithm indexing schemes which are -1,
     * 0, and 1 based and to offset memory and performance overhead if any
     * strings in the pair contain non-ASCII symbols.
     */

    private static final int Wi = 1; // insert
    private static final int Wd = 1; // delete
    private static final int Wc = 1; // change
    private static final int Ws = 1; // interchange

    static {
        assert 2L * Ws >= Wi + Wd; // algorithm requirement
    }

    // Number of ASCII characters, U+0000 through U+007F inclusive. It is
    // both the size of smallDA and the exclusive upper bound of the
    // character values that smallDA can hold; the two must always agree.
    private static final int ASCII_SIZE = 128;

    // A value greater than any distance this instance can produce; stored
    // in the cells of h that are addressed with index -1.
    private final int INF;
    // The matrix of the algorithm, addressed through h(int, int) and
    // h(int, int, int) with indexes starting from -1.
    private final int[][] h;
    private final String a;
    private final String b;

    // DA of the algorithm: maps a character to the value most recently
    // stored for it through da(int, int), 0 if none. While every stored
    // character is ASCII, the flat array is used; after that, the map.
    private int[] smallDA;
    private Map<Character, Integer> bigDA;

    /** {@return the edit distance between two strings}
     * The distance returned from this method has the following properties:
     * <ol>
     *    <li> {@code (a.equals(b) && of(a, b) == 0) || (!a.equals(b) && of(a, b) > 0)}
     *    <li> {@code of(a, b) == of(b, a)}
     *    <li> {@code of(a, b) + of(b, c) >= of(a, c)}
     * </ol>
     *
     * @implSpec
     * This method is safe to be called by multiple threads.
     *
     * @param a the first string
     * @param b the second string
     * @throws NullPointerException if any of the two strings are null
     * @throws ArithmeticException if any step of the calculation
     *                             overflows an int
     */
    public static int of(String a, String b) {
        // all mutable state lives in a fresh instance created per call
        return new DamerauLevenshteinDistance(a, b).calculate();
    }

    /*
     * Fills the matrix row by row and returns the value of the cell that
     * corresponds to the complete strings.
     */
    private int calculate() {
        // borders: row -1 and column -1 hold INF, row 0 and column 0 hold
        // the cost of inserting or deleting the respective prefix
        for (int i = 0; i <= a.length(); i++) {
            h(i, 0, i * Wd);
            h(i, -1, INF);
        }
        for (int j = 0; j <= b.length(); j++) {
            h(0, j, j * Wi);
            h(-1, j, INF);
        }
        // algorithm's line #8 that initializes DA is not needed here
        // because this class encapsulates DA and initializes it
        // separately
        for (int i = 1; i <= a.length(); i++) {
            // the most recent position in b, within this row, at which
            // the characters of a and b were equal; 0 if none so far
            int db = 0;
            for (int j = 1; j <= b.length(); j++) {
                // the value stored for b's current character by an
                // earlier row (see the da call after this loop); 0 if none
                int i1 = da(characterAt(b, j));
                int j1 = db;
                boolean eq = characterAt(a, i) == characterAt(b, j);
                int d = eq ? 0 : Wc;
                if (eq) {
                    db = j;
                }
                // if i1 or j1 is 0, the last candidate reads an INF cell
                // and so cannot be the minimum
                int m = min(h(i - 1, j - 1) + d,
                            h(i, j - 1) + Wi,
                            h(i - 1, j) + Wd,
                            h(i1 - 1, j1 - 1) + (i - i1 - 1) * Wd + Ws + (j - j1 - 1) * Wi);
                h(i, j, m);
            }
            // record this row as the latest one for a's current character
            da(characterAt(a, i), i);
        }
        return h(a.length(), b.length());
    }

    /* Returns the character of s at the 1-based position i. */
    private int characterAt(String s, int i) {
        return s.charAt(i - 1);
    }

    /* Stores value in the matrix cell (i, j); indexes start from -1. */
    private void h(int i, int j, int value) {
        h[i + 1][j + 1] = value;
    }

    /* Returns the value of the matrix cell (i, j); indexes start from -1. */
    private int h(int i, int j) {
        return h[i + 1][j + 1];
    }

    /*
     * This implementation works with UTF-16 strings, but favours strings
     * that comprise ASCII characters. Measuring distance between a pair
     * of ASCII strings is likely to be a typical use case for this
     * implementation.
     *
     * If a character for which the value is to be stored does not fit into
     * the ASCII range, this implementation switches to a different storage
     * dynamically. Since neither string lengths nor character values
     * change, any state accumulated so far, including any loops and local
     * variables, remains valid.
     *
     * Note, that if the provided character were a surrogate and this
     * implementation dealt with code points, which it does not, dynamic
     * switching of the storage would not be enough. The complete
     * representation would need to be changed. That would entail
     * discarding any accumulated state and repeating the computation.
     */

    /* Returns the value stored for the character i, or 0 if there is none. */
    private int da(int i) {
        if (smallDA != null && i < ASCII_SIZE) {
            return smallDA[i];
        }
        // if a character cannot be found, it means that the character
        // hasn't been updated, which means that the associated value
        // is the default value, which is 0
        if (bigDA != null) {
            Integer v = bigDA.get((char) i);
            return v == null ? 0 : v;
        } else {
            return 0;
        }
    }

    /* Stores value for the character i, switching the storage if needed. */
    private void da(int i, int value) {
        if (bigDA == null && i < ASCII_SIZE) {
            if (smallDA == null) {
                // one slot per ASCII character, U+007F included
                smallDA = new int[ASCII_SIZE];
            }
            smallDA[i] = value;
        } else {
            if (bigDA == null) {
                bigDA = new HashMap<>();
                if (smallDA != null) { // rebuild DA accumulated so far
                    for (int j = 0; j < smallDA.length; j++) {
                        int v = smallDA[j];
                        if (v != 0)
                            bigDA.put((char) j, v);
                    }
                    smallDA = null; // no longer needed
                }
            }
            bigDA.put((char) i, value);
        }
        assert smallDA == null ^ bigDA == null; // exactly one in use
    }

    /* Returns the smallest of the four values. */
    private static int min(int a, int b, int c, int d) {
        return Math.min(a, Math.min(b, Math.min(c, d)));
    }

    /*
     * Prepares the state for a single calculation. Calling length() on the
     * strings doubles as the null check that of(String, String) documents.
     */
    private DamerauLevenshteinDistance(String a, String b) {
        this.a = a;
        this.b = b;
        // check for overflow before attempting to allocate the matrix
        INF = this.a.length() * Wd + this.b.length() * Wi + 1;
        if (INF < 0)
            throw new ArithmeticException("Overflow");
        this.h = new int[this.a.length() + 2][this.b.length() + 2];
    }
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,15 +23,17 @@
/**
* @test
* @bug 8029800 8043186
* @bug 8029800 8043186 8313693
* @summary Unit test StringUtils
* @modules jdk.compiler/com.sun.tools.javac.util
* @run main StringUtilsTest
*/
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import com.sun.tools.javac.util.StringUtils;
import com.sun.tools.javac.util.StringUtils.DamerauLevenshteinDistance;
public class StringUtilsTest {
public static void main(String... args) throws Exception {
@ -53,6 +55,57 @@ public class StringUtilsTest {
assertEquals(2, StringUtils.indexOfIgnoreCase(" lookFor", "lookfor"));
assertEquals(11, StringUtils.indexOfIgnoreCase(" lookFor LOOKfor", "lookfor", 11));
assertEquals(2, StringUtils.indexOfIgnoreCase("\u0130\u0130lookFor", "lookfor"));
//verify Damerau-Levenshtein
assertEquals(3, DamerauLevenshteinDistance.of("kitten", "sitting"));
// note that the restricted Damerau-Levenshtein distance would be 3, not 2:
assertEquals(2, DamerauLevenshteinDistance.of("ca", "abc"));
//verify strings comprising only non-LATIN1 characters
assertEquals(1, DamerauLevenshteinDistance.of("\u0438\u044e\u043d\u044c",
"\u0438\u044e\u043b\u044c"));
//verify strings comprising mixed characters: non-LATIN1 and ASCII
// it's important to start with ASCII characters, so that we
// test switching a storage (see current implementation)
assertEquals(2, DamerauLevenshteinDistance.of("c\u043ede", "cod\u0435"));
//verify metric properties
for (String a : List.of("", "a", "b", "abc")) {
for (String b : List.of("", "a", "b", "abc")) {
assertNonNegativity(a, b);
assertSymmetry(a, b);
}
}
for (String a : List.of("", "a", "b", "c")) {
for (String b : List.of("ab", "ac", "bc")) {
for (String c : List.of("abc", "bca", "cab")) {
assertTriangleInequality(a, b, c);
assertTriangleInequality(b, c, a);
assertTriangleInequality(c, a, b);
}
}
}
}
/*
 * Checks that the distance between equal strings is zero and that
 * the distance between unequal strings is strictly positive.
 */
private void assertNonNegativity(String a, String b) {
    boolean same = a.equals(b);
    int distance = DamerauLevenshteinDistance.of(a, b);
    if (!same) {
        assertTrue(distance > 0);
        return;
    }
    assertEquals(0, distance);
}
/*
 * Checks that the distance does not depend on the order of the strings.
 */
private void assertSymmetry(String a, String b) {
    int forward = DamerauLevenshteinDistance.of(a, b);
    int backward = DamerauLevenshteinDistance.of(b, a);
    assertEquals(forward, backward);
}
/*
 * Checks that going from a to c through b is never cheaper than
 * going from a to c directly.
 */
private void assertTriangleInequality(String a, String b, String c) {
    int firstLeg = DamerauLevenshteinDistance.of(a, b);
    int secondLeg = DamerauLevenshteinDistance.of(b, c);
    int direct = DamerauLevenshteinDistance.of(a, c);
    int detour = firstLeg + secondLeg;
    assertTrue(direct <= detour);
}
void assertEquals(String expected, String actual) {
@ -66,4 +119,10 @@ public class StringUtilsTest {
throw new IllegalStateException("expected=" + expected + "; actual=" + actual);
}
}
/*
 * Fails with an IllegalStateException unless the condition holds.
 */
void assertTrue(boolean cond) {
    if (cond) {
        return;
    }
    throw new IllegalStateException();
}
}