From 52ec4bcb1bab15dbf0a9b2488d33a23cdc1cb0e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20Walln=C3=B6fer?= Date: Wed, 9 Aug 2023 09:50:21 +0000 Subject: [PATCH] 8303056: Improve support for Unicode characters and digits in JavaDoc search Reviewed-by: jjg --- .../formats/html/resources/search.js.template | 57 ++++++++++++++----- .../testSearchScript/TestSearchScript.java | 46 ++++++++++++++- .../doclet/testSearchScript/listpkg/List.java | 1 + .../listpkg/ListProvider.java | 21 +++++++ .../testSearchScript/listpkg/MyList.java | 15 ++++- 5 files changed, 121 insertions(+), 19 deletions(-) diff --git a/src/jdk.javadoc/share/classes/jdk/javadoc/internal/doclets/formats/html/resources/search.js.template b/src/jdk.javadoc/share/classes/jdk/javadoc/internal/doclets/formats/html/resources/search.js.template index c44c4d50306..f98a2faf035 100644 --- a/src/jdk.javadoc/share/classes/jdk/javadoc/internal/doclets/formats/html/resources/search.js.template +++ b/src/jdk.javadoc/share/classes/jdk/javadoc/internal/doclets/formats/html/resources/search.js.template @@ -44,6 +44,9 @@ const categories = { const highlight = "$&"; const NO_MATCH = {}; const MAX_RESULTS = 300; +const UNICODE_LETTER = 0; +const UNICODE_DIGIT = 1; +const UNICODE_OTHER = 2; function checkUnnamed(name, separator) { return name === "" || !name ? "" : name + separator; } @@ -127,13 +130,13 @@ function createMatcher(term, camelCase) { var pattern = ""; var upperCase = []; term.trim().split(/\s+/).forEach(function(w, index, array) { - var tokens = w.split(/(?=[A-Z,.()<>?[\/])/); + var tokens = w.split(/(?=[\p{Lu},.()<>?[\/])/u); for (var i = 0; i < tokens.length; i++) { var s = tokens[i]; // ',' and '?' 
are the only delimiters commonly followed by space in java signatures - pattern += "(" + $.ui.autocomplete.escapeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")"; + pattern += "(" + escapeUnicodeRegex(s).replace(/[,?]/g, "$&\\s*?") + ")"; upperCase.push(false); - var isWordToken = /\w$/.test(s); + var isWordToken = /[\p{L}\p{Nd}_]$/u.test(s); if (isWordToken) { if (i === tokens.length - 1 && index < array.length - 1) { // space in query string matches all delimiters @@ -143,7 +146,7 @@ function createMatcher(term, camelCase) { if (!camelCase && isUpperCase(s) && s.length === 1) { pattern += "()"; } else { - pattern += "([a-z0-9$<>?[\\]]*?)"; + pattern += "([\\p{L}\\p{Nd}\\p{Sc}<>?[\\]]*?)"; } upperCase.push(isUpperCase(s[0])); } @@ -153,10 +156,14 @@ function createMatcher(term, camelCase) { } } }); - var re = new RegExp(pattern, "gi"); + var re = new RegExp(pattern, "gui"); re.upperCase = upperCase; return re; } +// Unicode regular expressions do not allow certain characters to be escaped +function escapeUnicodeRegex(pattern) { + return pattern.replace(/[\[\]{}()*+?.\\^$|\s]/g, '\\$&'); +} function findMatch(matcher, input, startOfName, endOfName) { var from = startOfName; matcher.lastIndex = from; @@ -176,20 +183,25 @@ function findMatch(matcher, input, startOfName, endOfName) { var start = match.index; var prevEnd = -1; for (var i = 1; i < match.length; i += 2) { - var isUpper = isUpperCase(input[start]); + var charType = getCharType(input[start]); var isMatcherUpper = matcher.upperCase[i]; // capturing groups come in pairs, match and non-match boundaries.push(start, start + match[i].length); // make sure groups are anchored on a left word boundary var prevChar = input[start - 1] || ""; var nextChar = input[start + 1] || ""; - if (start !== 0 && !/[\W_]/.test(prevChar) && !/[\W_]/.test(input[start])) { - if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) { - score -= 0.1; - } else if (isMatcherUpper && start === prevEnd) { - score -= isUpper ? 
0.1 : 1.0; - } else { + if (start !== 0) { + if (charType === UNICODE_DIGIT && getCharType(prevChar) === UNICODE_DIGIT) { return NO_MATCH; + } else if (charType === UNICODE_LETTER && getCharType(prevChar) === UNICODE_LETTER) { + var isUpper = isUpperCase(input[start]); + if (isUpper && (isLowerCase(prevChar) || isLowerCase(nextChar))) { + score -= 0.1; + } else if (isMatcherUpper && start === prevEnd) { + score -= isUpper ? 0.1 : 1.0; + } else { + return NO_MATCH; + } } } prevEnd = start + match[i].length; @@ -214,15 +226,30 @@ function findMatch(matcher, input, startOfName, endOfName) { boundaries: boundaries }; } +function isLetter(s) { + return /\p{L}/u.test(s); +} function isUpperCase(s) { - return s !== s.toLowerCase(); + return /\p{Lu}/u.test(s); } function isLowerCase(s) { - return s !== s.toUpperCase(); + return /\p{Ll}/u.test(s); +} +function isDigit(s) { + return /\p{Nd}/u.test(s); +} +function getCharType(s) { + if (isLetter(s)) { + return UNICODE_LETTER; + } else if (isDigit(s)) { + return UNICODE_DIGIT; + } else { + return UNICODE_OTHER; + } } function rateNoise(str) { return (str.match(/([.(])/g) || []).length / 5 - + (str.match(/([A-Z]+)/g) || []).length / 10 + + (str.match(/(\p{Lu}+)/gu) || []).length / 10 + str.length / 20; } function doSearch(request, response) { diff --git a/test/langtools/jdk/javadoc/doclet/testSearchScript/TestSearchScript.java b/test/langtools/jdk/javadoc/doclet/testSearchScript/TestSearchScript.java index 658d555cf66..cb480901885 100644 --- a/test/langtools/jdk/javadoc/doclet/testSearchScript/TestSearchScript.java +++ b/test/langtools/jdk/javadoc/doclet/testSearchScript/TestSearchScript.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /* * @test - * @bug 8178982 8220497 8210683 8241982 8297216 + * @bug 8178982 8220497 8210683 8241982 8297216 8303056 * @summary Test the search feature of javadoc. * @library ../../lib * @library /test/lib @@ -335,6 +335,46 @@ public class TestSearchScript extends JavadocTester { checkSearch(inv, "with map", List.of( "listpkg.Nolist.withTypeParams(Map)")); + // search for numeric strings + checkSearch(inv, "1", List.of("listpkg.MyList.abc123xyz()")); + checkSearch(inv, "12", List.of("listpkg.MyList.abc123xyz()")); + checkSearch(inv, "12 x", List.of("listpkg.MyList.abc123xyz()")); + checkSearch(inv, "123 x", List.of("listpkg.MyList.abc123xyz()")); + checkSearch(inv, "1 x", List.of("listpkg.MyList.abc123xyz()")); + checkSearch(inv, "2 x", List.of()); + checkSearch(inv, "3", List.of("listpkg.MyList.M_3X")); + checkSearch(inv, "3x", List.of("listpkg.MyList.M_3X")); + checkSearch(inv, "_3", List.of("listpkg.MyList.M_3X")); + checkSearch(inv, "3 x", List.of("listpkg.MyList.M_3X")); + + // Unicode camel-case tests + checkSearch(inv, "νέα λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δημ νέα λίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δ ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "ν λ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δημιουργήστεΝέαΛίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δηΝέΛίσ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "ΝΛ", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "δημ λίστα", List.of("listpkg.ListProvider.δημιουργήστεΝέαΛίστα()")); + checkSearch(inv, "сделать новый список", 
List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сде нов спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "с н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "н с", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сделатьНовыйСписок", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сдеНовСпис", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сНС", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сН", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + checkSearch(inv, "сдеН Спи", List.of("listpkg.ListProvider.сделатьНовыйСписок()")); + + // Negative Unicode camel-case tests + checkSearch(inv, "Νέα ίστα", List.of()); + checkSearch(inv, "α λίστα", List.of()); + checkSearch(inv, "ηΝΛ", List.of()); + checkSearch(inv, "овый", List.of()); + checkSearch(inv, "д н с", List.of()); + checkSearch(inv, "пи", List.of()); + checkSearch(inv, "НОВЫЙС ПИСОК", List.of()); } @Test @@ -364,7 +404,7 @@ public class TestSearchScript extends JavadocTester { } void checkList(String query, List result, List expected) { - checking("Checking resut for query \"" + query + "\""); + checking("Checking result for query \"" + query + "\""); if (!expected.equals(result)) { failed("Expected: " + expected + ", got: " + result); } else { diff --git a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/List.java b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/List.java index 0f1bc4d7de3..1929a95d7d6 100644 --- a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/List.java +++ b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/List.java @@ -26,6 +26,7 @@ package listpkg; /** * Example class containing "list" matching full name. 
+ * @param type parameter */ public interface List { diff --git a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/ListProvider.java b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/ListProvider.java index f61286516ac..eda67037e0f 100644 --- a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/ListProvider.java +++ b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/ListProvider.java @@ -27,9 +27,30 @@ package listpkg; * Example class containing "list" matching at beginning of name. */ public class ListProvider { + /** + * Constructor. + */ public ListProvider() {} + /** + * English camel-case name + */ public List makeNewList() { return null; } + + /** + * Greek camel-case name + */ + public List δημιουργήστεΝέαΛίστα() { + return null; + } + + /** + * Russian camel-case name + */ + public List сделатьНовыйСписок() { + return null; + } + } diff --git a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/MyList.java b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/MyList.java index 815961dee72..67ddff835e4 100644 --- a/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/MyList.java +++ b/test/langtools/jdk/javadoc/doclet/testSearchScript/listpkg/MyList.java @@ -23,5 +23,18 @@ package listpkg; -public class MyList implements List { +/** + * A class. + */ +public abstract class MyList implements List { + + /** + * Field name containing a digit. + */ + public static final int M_3X = 2; + + /** + * Method name containing digits. + */ + public void abc123xyz() {} }