8303056: Improve support for Unicode characters and digits in JavaDoc search

Reviewed-by: jjg
This commit is contained in:
Hannes Wallnöfer 2023-08-09 09:50:21 +00:00
parent 9cf12bb977
commit 52ec4bcb1b
5 changed files with 121 additions and 19 deletions

View File

@ -44,6 +44,9 @@ const categories = {
// Replacement template that wraps the matched text ($&) in a highlighting span.
const highlight = "<span class='result-highlight'>$&</span>";
// Sentinel object returned by findMatch when a candidate is rejected.
const NO_MATCH = {};
// NOTE(review): presumably the cap on the number of search results - usage is not visible here; confirm.
const MAX_RESULTS = 300;
// Character categories returned by getCharType: letter, decimal digit, or anything else.
const UNICODE_LETTER = 0;
const UNICODE_DIGIT = 1;
const UNICODE_OTHER = 2;
/**
 * Builds the prefix for a possibly unnamed container.
 *
 * @param name the container name; may be empty, missing, or the literal "<Unnamed>"
 * @param separator the text appended after a real name
 * @returns the empty string for a missing or unnamed container, otherwise name + separator
 */
function checkUnnamed(name, separator) {
    if (!name || name === "<Unnamed>") {
        return "";
    }
    return name + separator;
}
@ -127,13 +130,13 @@ function createMatcher(term, camelCase) {
var pattern = "";
var upperCase = [];
term.trim().split(/\s+/).forEach(function(w, index, array) {
var tokens = w.split(/(?=[A-Z,.()<>?[\/])/);
var tokens = w.split(/(?=[\p{Lu},.()<>?[\/])/u);
for (var i = 0; i < tokens.length; i++) {
var s = tokens[i];
// ',' and '?' are the only delimiters commonly followed by space in java signatures
pattern += "(" + $.ui.autocomplete.escapeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")";
pattern += "(" + escapeUnicodeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")";
upperCase.push(false);
var isWordToken = /\w$/.test(s);
var isWordToken = /[\p{L}\p{Nd}_]$/u.test(s);
if (isWordToken) {
if (i === tokens.length - 1 && index < array.length - 1) {
// space in query string matches all delimiters
@ -143,7 +146,7 @@ function createMatcher(term, camelCase) {
if (!camelCase && isUpperCase(s) && s.length === 1) {
pattern += "()";
} else {
pattern += "([a-z0-9$<>?[\\]]*?)";
pattern += "([\\p{L}\\p{Nd}\\p{Sc}<>?[\\]]*?)";
}
upperCase.push(isUpperCase(s[0]));
}
@ -153,10 +156,14 @@ function createMatcher(term, camelCase) {
}
}
});
var re = new RegExp(pattern, "gi");
var re = new RegExp(pattern, "gui");
re.upperCase = upperCase;
return re;
}
/**
 * Escapes the regular-expression syntax characters in a literal string so that
 * it can be embedded in a pattern compiled with the "u" flag.
 *
 * Unicode-mode regular expressions only accept a backslash in front of syntax
 * characters, so characters such as '-', ',' or '#' must be left alone.
 * Whitespace is deliberately not escaped either: an escaped space is an
 * invalid escape in Unicode mode and would make the RegExp constructor throw,
 * while unescaped whitespace already matches itself literally.
 *
 * @param pattern the literal text to escape
 * @returns the text with every regex syntax character preceded by a backslash
 */
function escapeUnicodeRegex(pattern) {
    return pattern.replace(/[\[\]{}()*+?.\\^$|]/g, '\\$&');
}
function findMatch(matcher, input, startOfName, endOfName) {
var from = startOfName;
matcher.lastIndex = from;
@ -176,20 +183,25 @@ function findMatch(matcher, input, startOfName, endOfName) {
var start = match.index;
var prevEnd = -1;
for (var i = 1; i < match.length; i += 2) {
var isUpper = isUpperCase(input[start]);
var charType = getCharType(input[start]);
var isMatcherUpper = matcher.upperCase[i];
// capturing groups come in pairs, match and non-match
boundaries.push(start, start + match[i].length);
// make sure groups are anchored on a left word boundary
var prevChar = input[start - 1] || "";
var nextChar = input[start + 1] || "";
if (start !== 0 && !/[\W_]/.test(prevChar) && !/[\W_]/.test(input[start])) {
if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) {
score -= 0.1;
} else if (isMatcherUpper && start === prevEnd) {
score -= isUpper ? 0.1 : 1.0;
} else {
if (start !== 0) {
if (charType === UNICODE_DIGIT && getCharType(prevChar) === UNICODE_DIGIT) {
return NO_MATCH;
} else if (charType === UNICODE_LETTER && getCharType(prevChar) === UNICODE_LETTER) {
var isUpper = isUpperCase(input[start]);
if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) {
score -= 0.1;
} else if (isMatcherUpper && start === prevEnd) {
score -= isUpper ? 0.1 : 1.0;
} else {
return NO_MATCH;
}
}
}
prevEnd = start + match[i].length;
@ -214,15 +226,30 @@ function findMatch(matcher, input, startOfName, endOfName) {
boundaries: boundaries
};
}
/**
 * Tells whether the given string contains a Unicode letter (general category L).
 *
 * @param s the string to inspect, usually a single character
 * @returns true if a letter occurs anywhere in s
 */
function isLetter(s) {
    const letterPattern = /\p{L}/u;
    return letterPattern.test(s);
}
/**
 * Tells whether the given string contains a Unicode uppercase letter
 * (general category Lu).
 *
 * The check is based on the character category rather than on case
 * conversion, so characters that merely have a lowercase mapping without
 * being uppercase letters (for example title-case letters) are not reported.
 *
 * @param s the string to inspect, usually a single character
 * @returns true if an uppercase letter occurs anywhere in s
 */
function isUpperCase(s) {
    return /\p{Lu}/u.test(s);
}
/**
 * Tells whether the given string contains a Unicode lowercase letter
 * (general category Ll).
 *
 * The check is based on the character category rather than on case
 * conversion, so characters that merely have an uppercase mapping without
 * being lowercase letters are not reported.
 *
 * @param s the string to inspect, usually a single character
 * @returns true if a lowercase letter occurs anywhere in s
 */
function isLowerCase(s) {
    return /\p{Ll}/u.test(s);
}
/**
 * Tells whether the given string contains a Unicode decimal digit
 * (general category Nd).
 *
 * @param s the string to inspect, usually a single character
 * @returns true if a decimal digit occurs anywhere in s
 */
function isDigit(s) {
    const digitPattern = /\p{Nd}/u;
    return digitPattern.test(s);
}
/**
 * Classifies a string as letter, digit, or other.
 *
 * The letter check runs first, so a string for which both isLetter and
 * isDigit hold is reported as a letter.
 *
 * @param s the string to classify, usually a single character
 * @returns UNICODE_LETTER, UNICODE_DIGIT or UNICODE_OTHER
 */
function getCharType(s) {
    if (isLetter(s)) {
        return UNICODE_LETTER;
    }
    return isDigit(s) ? UNICODE_DIGIT : UNICODE_OTHER;
}
/**
 * Computes a penalty for the given string; larger values mean more "noise".
 *
 * The penalty is the sum of:
 *   - 1/5 for every '.' or '(' character,
 *   - 1/10 for every run of consecutive Unicode uppercase letters,
 *   - 1/20 for every character of the string.
 *
 * Uppercase runs are counted once, using the Unicode category Lu, so that
 * ASCII and non-ASCII names are rated by the same rule.
 *
 * @param str the string to rate
 * @returns the accumulated penalty, 0 for an empty string
 */
function rateNoise(str) {
    var delimiters = (str.match(/([.(])/g) || []).length;
    var upperCaseRuns = (str.match(/(\p{Lu}+)/gu) || []).length;
    return delimiters / 5
        + upperCaseRuns / 10
        + str.length / 20;
}
function doSearch(request, response) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,7 +23,7 @@
/*
* @test
* @bug 8178982 8220497 8210683 8241982 8297216
* @bug 8178982 8220497 8210683 8241982 8297216 8303056
* @summary Test the search feature of javadoc.
* @library ../../lib
* @library /test/lib
@ -335,6 +335,46 @@ public class TestSearchScript extends JavadocTester {
checkSearch(inv, "with map", List.of(
"listpkg.Nolist.withTypeParams(Map<String, ? extends Collection>)"));
// search for numeric strings
checkSearch(inv, "1", List.of("listpkg.MyList.abc123xyz()"));
checkSearch(inv, "12", List.of("listpkg.MyList.abc123xyz()"));
checkSearch(inv, "12 x", List.of("listpkg.MyList.abc123xyz()"));
checkSearch(inv, "123 x", List.of("listpkg.MyList.abc123xyz()"));
checkSearch(inv, "1 x", List.of("listpkg.MyList.abc123xyz()"));
checkSearch(inv, "2 x", List.of());
checkSearch(inv, "3", List.of("listpkg.MyList.M_3X"));
checkSearch(inv, "3x", List.of("listpkg.MyList.M_3X"));
checkSearch(inv, "_3", List.of("listpkg.MyList.M_3X"));
checkSearch(inv, "3 x", List.of("listpkg.MyList.M_3X"));
// Unicode camel-case tests
checkSearch(inv, "νέα λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δημ νέα λίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δ ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δημιουργήστεΝέαΛίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δηΝέΛίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "ΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "δημ λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()"));
checkSearch(inv, "сделать новый список", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сде нов спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "с н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сделатьНовыйСписок", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сдеНовСпис", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сНС", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сН", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
checkSearch(inv, "сдеН Спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()"));
// Negative Unicode camel-case tests
checkSearch(inv, "Νέα ίστα", List.of());
checkSearch(inv, "α λίστα", List.of());
checkSearch(inv, "ηΝΛ", List.of());
checkSearch(inv, "овый", List.of());
checkSearch(inv, "д н с", List.of());
checkSearch(inv, "пи", List.of());
checkSearch(inv, "НОВЫЙС ПИСОК", List.of());
}
@Test
@ -364,7 +404,7 @@ public class TestSearchScript extends JavadocTester {
}
void checkList(String query, List<?> result, List<?> expected) {
checking("Checking resut for query \"" + query + "\"");
checking("Checking result for query \"" + query + "\"");
if (!expected.equals(result)) {
failed("Expected: " + expected + ", got: " + result);
} else {

View File

@ -26,6 +26,7 @@ package listpkg;
/**
* Example class containing "list" matching full name.
* @param <E> type parameter
*/
public interface List<E> {

View File

@ -27,9 +27,30 @@ package listpkg;
* Example class containing "list" matching at beginning of name.
*/
public class ListProvider {
/**
 * Constructor.
 */
public ListProvider() {}
/**
 * Method with an English camel-case name.
 *
 * @return always {@code null}
 */
public List makeNewList() {
return null;
}
/**
 * Method with a Greek camel-case name.
 *
 * @return always {@code null}
 */
public List δημιουργήστεΝέαΛίστα() {
return null;
}
/**
 * Method with a Russian camel-case name.
 *
 * @return always {@code null}
 */
public List сделатьНовыйСписок() {
return null;
}
}

View File

@ -23,5 +23,18 @@
package listpkg;
public class MyList implements List {
/**
* A class.
*/
public abstract class MyList implements List {
/**
* Field name containing a digit.
*/
public static final int M_3X = 2;
/**
* Method name containing digits.
*/
public void abc123xyz() {}
}