diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/tree/DocTreeMaker.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/tree/DocTreeMaker.java index e45b84dbd68..a4884165f6f 100644 --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/tree/DocTreeMaker.java +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/tree/DocTreeMaker.java @@ -27,10 +27,8 @@ package com.sun.tools.javac.tree; import java.text.BreakIterator; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.List; -import java.util.ListIterator; import java.util.Set; import javax.lang.model.element.Name; @@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory { /** The context key for the tree factory. */ protected static final Context.Key treeMakerKey = new Context.Key<>(); - // A subset of block tags, which acts as sentence breakers, appearing - // anywhere but the zero'th position in the first sentence. - final Set sentenceBreakTags; - /** Get the TreeMaker instance. */ public static DocTreeMaker instance(Context context) { DocTreeMaker instance = context.get(treeMakerKey); @@ -127,6 +121,7 @@ public class DocTreeMaker implements DocTreeFactory { public int pos; private final JavacTrees trees; + private final SentenceBreaker breaker; /** Utility class to parse reference signatures. */ private final ReferenceParser referenceParser; @@ -139,7 +134,7 @@ public class DocTreeMaker implements DocTreeFactory { this.pos = Position.NOPOS; trees = JavacTrees.instance(context); referenceParser = new ReferenceParser(ParserFactory.instance(context)); - sentenceBreakTags = Set.of("H1", "H2", "H3", "H4", "H5", "H6", "PRE", "P"); + breaker = new SentenceBreaker(this); } /** Reassign current position. @@ -518,224 +513,245 @@ public class DocTreeMaker implements DocTreeFactory { return new ArrayList<>(pair.fst); } - /* - * Breaks up the body tags into the first sentence and its successors. - * The first sentence is determined with the presence of a period, - * block tag, or a sentence break, as returned by the BreakIterator. - * Trailing whitespaces are trimmed. - */ - private Pair, List> splitBody(Collection list) { - // pos is modified as we create trees, therefore - // we save the pos and restore it later. - final int savedpos = this.pos; - try { - ListBuffer body = new ListBuffer<>(); - // split body into first sentence and body - ListBuffer fs = new ListBuffer<>(); - if (list.isEmpty()) { - return new Pair<>(fs.toList(), body.toList()); - } - boolean foundFirstSentence = false; - ArrayList alist = new ArrayList<>(list); - ListIterator itr = alist.listIterator(); - while (itr.hasNext()) { - boolean isFirst = !itr.hasPrevious(); - DocTree dt = itr.next(); - int spos = ((DCTree) dt).pos; - if (foundFirstSentence) { - body.add((DCTree) dt); - continue; - } - switch (dt.getKind()) { - case RETURN: - case SUMMARY: - foundFirstSentence = true; - break; - case TEXT: - DCText tt = (DCText) dt; - String s = tt.getBody(); - DocTree peekedNext = itr.hasNext() - ? alist.get(itr.nextIndex()) - : null; - int sbreak = getSentenceBreak(s, peekedNext); - if (sbreak > 0) { - s = s.substring(0, sbreak).stripTrailing(); - DCText text = this.at(spos).newTextTree(s); - fs.add(text); - foundFirstSentence = true; - int nwPos = skipWhiteSpace(tt.getBody(), sbreak); - if (nwPos > 0) { - DCText text2 = this.at(spos + nwPos).newTextTree(tt.getBody().substring(nwPos)); - body.add(text2); - } - continue; - } else if (itr.hasNext()) { - // if the next doctree is a break, remove trailing spaces - peekedNext = alist.get(itr.nextIndex()); - boolean sbrk = isSentenceBreak(peekedNext, false); - if (sbrk) { - DocTree next = itr.next(); - s = s.stripTrailing(); - DCText text = this.at(spos).newTextTree(s); - fs.add(text); - body.add((DCTree) next); - foundFirstSentence = true; - continue; - } - } - break; - default: - if (isSentenceBreak(dt, isFirst)) { - body.add((DCTree) dt); - foundFirstSentence = true; - continue; - } - break; - } - fs.add((DCTree) dt); - } - return new Pair<>(fs.toList(), body.toList()); - } finally { - this.pos = savedpos; - } - } - - private boolean isTextTree(DocTree tree) { - return tree.getKind() == Kind.TEXT; - } - - /* - * Computes the first sentence break, a simple dot-space algorithm. - */ - private int defaultSentenceBreak(String s) { - // scan for period followed by whitespace - int period = -1; - for (int i = 0; i < s.length(); i++) { - switch (s.charAt(i)) { - case '.': - period = i; - break; - - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - if (period >= 0) { - return i; - } - break; - - default: - period = -1; - break; - } - } - return -1; - } - - /* - * Computes the first sentence, if using a default breaker, - * the break is returned, if not then a -1, indicating that - * more doctree elements are required to be examined. - * - * BreakIterator.next points to the start of the following sentence, - * and does not provide an easy way to disambiguate between "sentence break", - * "possible sentence break" and "not a sentence break" at the end of the input. - * For example, BreakIterator.next returns the index for the end - * of the string for all of these examples, - * using vertical bars to delimit the bounds of the example text - * |Abc| (not a valid end of sentence break, if followed by more text) - * |Abc.| (maybe a valid end of sentence break, depending on the following text) - * |Abc. | (maybe a valid end of sentence break, depending on the following text) - * |"Abc." | (maybe a valid end of sentence break, depending on the following text) - * |Abc. | (definitely a valid end of sentence break) - * |"Abc." | (definitely a valid end of sentence break) - * Therefore, we have to probe further to determine whether - * there really is a sentence break or not at the end of this run of text. - */ - private int getSentenceBreak(String s, DocTree dt) { - BreakIterator breakIterator = trees.getBreakIterator(); - if (breakIterator == null) { - return defaultSentenceBreak(s); - } - breakIterator.setText(s); - final int sbrk = breakIterator.next(); - // This is the last doctree, found the droid we are looking for - if (dt == null) { - return sbrk; - } - - // If the break is well within the span of the string ie. not - // at EOL, then we have a clear break. - if (sbrk < s.length() - 1) { - return sbrk; - } - - if (isTextTree(dt)) { - // Two adjacent text trees, a corner case, perhaps - // produced by a tool synthesizing a doctree. In - // this case, does the break lie within the first span, - // then we have the droid, otherwise allow the callers - // logic to handle the break in the adjacent doctree. - TextTree ttnext = (TextTree) dt; - String combined = s + ttnext.getBody(); - breakIterator.setText(combined); - int sbrk2 = breakIterator.next(); - if (sbrk < sbrk2) { - return sbrk; - } - } - - // Is the adjacent tree a sentence breaker ? - if (isSentenceBreak(dt, false)) { - return sbrk; - } - - // At this point the adjacent tree is either a javadoc tag ({@..), - // html tag (<..) or an entity (&..). Perform a litmus test, by - // concatenating a sentence, to validate the break earlier identified. - String combined = s + "Dummy Sentence."; - breakIterator.setText(combined); - int sbrk2 = breakIterator.next(); - if (sbrk2 <= sbrk) { - return sbrk2; - } - return -1; // indeterminate at this time - } - - private boolean isSentenceBreak(Name tagName) { - return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString())); - } - - private boolean isSentenceBreak(DocTree dt, boolean isFirstDocTree) { - switch (dt.getKind()) { - case START_ELEMENT: - StartElementTree set = (StartElementTree)dt; - return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(set.getName()); - case END_ELEMENT: - EndElementTree eet = (EndElementTree)dt; - return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(eet.getName()); - default: - return false; - } - } - - /* - * Returns the position of the first non-whitespace character. - */ - private int skipWhiteSpace(String s, int start) { - for (int i = start; i < s.length(); i++) { - char c = s.charAt(i); - if (!Character.isWhitespace(c)) { - return i; - } - } - return -1; - } - @SuppressWarnings("unchecked") - private List cast(List list) { + private static List cast(List list) { return (List) list; } + + Pair, List> splitBody(List list) { + return breaker.splitBody(list); + } + + static class SentenceBreaker { + final DocTreeMaker m; + + // A subset of block tags, which acts as sentence breakers, appearing + // anywhere but the zero'th position in the first sentence. + static final Set sentenceBreakTags = Set.of( + "H1", "H2", "H3", "H4", "H5", "H6", + "PRE", "P"); + + SentenceBreaker(DocTreeMaker m) { + this.m = m; + } + + /* + * Breaks up the body tags into the first sentence and its successors. + * The first sentence is determined with the presence of a period, + * block tag, or a sentence break, as returned by the BreakIterator. + * Trailing whitespaces are trimmed. + */ + Pair, List> splitBody(List list) { + if (list.isEmpty()) { + return new Pair<>(List.of(), List.of()); + } + // pos is modified as we create trees, therefore + // we save the pos and restore it later. + final var savedPos = m.pos; + try { + // split list into first sentence and body + var fs = new ListBuffer(); + var body = new ListBuffer(); + var alist = new ArrayList<>(cast(list)); // copy to allow indexed access for peeking + var iter = alist.listIterator(); + var foundFirstSentence = false; + while (iter.hasNext() && !foundFirstSentence) { + boolean isFirst = !iter.hasPrevious(); + DCTree dt = iter.next(); + switch (dt.getKind()) { + case RETURN, SUMMARY -> { + fs.add(dt); + foundFirstSentence = true; + } + + case TEXT -> { + var dtPos = dt.pos; + var s = ((DCText) dt).getBody(); + var peekedNext = iter.hasNext() + ? alist.get(iter.nextIndex()) + : null; + int sbreak = getSentenceBreak(s, peekedNext); + if (sbreak > 0) { + var fsPart = m.at(dtPos).newTextTree(s.substring(0, sbreak).stripTrailing()); + fs.add(fsPart); + int offsetPos = skipWhiteSpace(s, sbreak); + if (offsetPos > 0) { + DCText bodyPart = m.at(dtPos + offsetPos).newTextTree(s.substring(offsetPos)); + body.add(bodyPart); + } + foundFirstSentence = true; + } else if (peekedNext != null) { + // if the next doctree is a break, remove trailing spaces + if (isSentenceBreak(peekedNext, false)) { + DCTree next = iter.next(); + DCText fsPart = m.at(dtPos).newTextTree(s.stripTrailing()); + fs.add(fsPart); + body.add(next); + foundFirstSentence = true; + } else { + fs.add(dt); + } + } else { + fs.add(dt); + } + } + + default -> { + // This ignores certain block tags if they appear first in the list, + // allowing the content of that tag to provide the first sentence. + // It would be better if other block tags always terminated the + // first sentence as well, like lists and tables. + if (isSentenceBreak(dt, isFirst)) { + body.add(dt); + foundFirstSentence = true; + } else { + fs.add(dt); + } + } + } + } + + // if there are remaining elements, then we have found the first + // sentence, and remaining elements are for the body. + while (iter.hasNext()) { + body.add(iter.next()); + } + + return new Pair<>(fs.toList(), body.toList()); + } finally { + m.pos = savedPos; + } + } + + /* + * Computes the first sentence break, a simple dot-space algorithm. + */ + private int defaultSentenceBreak(String s) { + // scan for period followed by whitespace + int period = -1; + for (int i = 0; i < s.length(); i++) { + switch (s.charAt(i)) { + case '.': + period = i; + break; + + case ' ': + case '\f': + case '\n': + case '\r': + case '\t': + if (period >= 0) { + return i; + } + break; + + default: + period = -1; + break; + } + } + return -1; + } + + /* + * Computes the first sentence, if using a default breaker, + * the break is returned, if not then a -1, indicating that + * more doctree elements are required to be examined. + * + * BreakIterator.next points to the start of the following sentence, + * and does not provide an easy way to disambiguate between "sentence break", + * "possible sentence break" and "not a sentence break" at the end of the input. + * For example, BreakIterator.next returns the index for the end + * of the string for all of these examples, + * using vertical bars to delimit the bounds of the example text + * |Abc| (not a valid end of sentence break, if followed by more text) + * |Abc.| (maybe a valid end of sentence break, depending on the following text) + * |Abc. | (maybe a valid end of sentence break, depending on the following text) + * |"Abc." | (maybe a valid end of sentence break, depending on the following text) + * |Abc. | (definitely a valid end of sentence break) + * |"Abc." | (definitely a valid end of sentence break) + * Therefore, we have to probe further to determine whether + * there really is a sentence break or not at the end of this run of text. + */ + private int getSentenceBreak(String s, DCTree nextTree) { + BreakIterator breakIterator = m.trees.getBreakIterator(); + if (breakIterator == null) { + return defaultSentenceBreak(s); + } + breakIterator.setText(s); + final int sbrk = breakIterator.next(); + // This is the last doctree, found the droid we are looking for + if (nextTree == null) { + return sbrk; + } + + // If the break is well within the span of the string ie. not + // at EOL, then we have a clear break. + if (sbrk < s.length() - 1) { + return sbrk; + } + + if (nextTree.getKind() == Kind.TEXT) { + // Two adjacent text trees, a corner case, perhaps + // produced by a tool synthesizing a doctree. In + // this case, does the break lie within the first span, + // then we have the droid, otherwise allow the callers + // logic to handle the break in the adjacent doctree. + TextTree ttnext = (TextTree) nextTree; + String combined = s + ttnext.getBody(); + breakIterator.setText(combined); + int sbrk2 = breakIterator.next(); + if (sbrk < sbrk2) { + return sbrk; + } + } + + // Is the adjacent tree a sentence breaker ? + if (isSentenceBreak(nextTree, false)) { + return sbrk; + } + + // At this point the adjacent tree is either a javadoc tag ({@..), + // html tag (<..) or an entity (&..). Perform a litmus test, by + // concatenating a sentence, to validate the break earlier identified. + String combined = s + "Dummy Sentence."; + breakIterator.setText(combined); + int sbrk2 = breakIterator.next(); + if (sbrk2 <= sbrk) { + return sbrk2; + } + return -1; // indeterminate at this time + } + + private boolean isSentenceBreak(DCTree dt, boolean isFirstDocTree) { + switch (dt.getKind()) { + case START_ELEMENT: + StartElementTree set = (StartElementTree) dt; + return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(set.getName()); + case END_ELEMENT: + EndElementTree eet = (EndElementTree) dt; + return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(eet.getName()); + default: + return false; + } + } + + private boolean isSentenceBreak(Name tagName) { + return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString())); + } + + /* + * Returns the position of the first non-whitespace character. + */ + private int skipWhiteSpace(String s, int start) { + for (int i = start; i < s.length(); i++) { + char c = s.charAt(i); + if (!Character.isWhitespace(c)) { + return i; + } + } + return -1; + } + } + } diff --git a/test/langtools/tools/javac/tree/SourceDocTreeScannerTest.java b/test/langtools/tools/javac/tree/SourceDocTreeScannerTest.java index 5f453c3fa02..72cc2d2e9d1 100644 --- a/test/langtools/tools/javac/tree/SourceDocTreeScannerTest.java +++ b/test/langtools/tools/javac/tree/SourceDocTreeScannerTest.java @@ -61,7 +61,6 @@ import com.sun.tools.javac.tree.DCTree; import com.sun.tools.javac.tree.DCTree.DCDocComment; import com.sun.tools.javac.tree.DCTree.DCReference; import com.sun.tools.javac.tree.JCTree.JCCompilationUnit; -import com.sun.tools.javac.util.List; import com.sun.tools.javac.util.Pair; public class SourceDocTreeScannerTest extends AbstractTreeScannerTest {