8304433: cleanup sentence breaker code in DocTreeMaker

Reviewed-by: hannesw
This commit is contained in:
Jonathan Gibbons 2023-03-20 15:14:25 +00:00
parent c396f1ed8b
commit 80e979720a
2 changed files with 240 additions and 225 deletions

View File

@ -27,10 +27,8 @@ package com.sun.tools.javac.tree;
import java.text.BreakIterator; import java.text.BreakIterator;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.ListIterator;
import java.util.Set; import java.util.Set;
import javax.lang.model.element.Name; import javax.lang.model.element.Name;
@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory {
/** The context key for the tree factory. */ /** The context key for the tree factory. */
protected static final Context.Key<DocTreeMaker> treeMakerKey = new Context.Key<>(); protected static final Context.Key<DocTreeMaker> treeMakerKey = new Context.Key<>();
// Upper-case names of a subset of block tags which act as sentence breakers
// when they appear anywhere but the zeroth position in the first sentence.
final Set<String> sentenceBreakTags;
/** Get the TreeMaker instance. */ /** Get the TreeMaker instance. */
public static DocTreeMaker instance(Context context) { public static DocTreeMaker instance(Context context) {
DocTreeMaker instance = context.get(treeMakerKey); DocTreeMaker instance = context.get(treeMakerKey);
@ -127,6 +121,7 @@ public class DocTreeMaker implements DocTreeFactory {
public int pos; public int pos;
private final JavacTrees trees; private final JavacTrees trees;
private final SentenceBreaker breaker;
/** Utility class to parse reference signatures. */ /** Utility class to parse reference signatures. */
private final ReferenceParser referenceParser; private final ReferenceParser referenceParser;
@ -139,7 +134,7 @@ public class DocTreeMaker implements DocTreeFactory {
this.pos = Position.NOPOS; this.pos = Position.NOPOS;
trees = JavacTrees.instance(context); trees = JavacTrees.instance(context);
referenceParser = new ReferenceParser(ParserFactory.instance(context)); referenceParser = new ReferenceParser(ParserFactory.instance(context));
sentenceBreakTags = Set.of("H1", "H2", "H3", "H4", "H5", "H6", "PRE", "P"); breaker = new SentenceBreaker(this);
} }
/** Reassign current position. /** Reassign current position.
@ -518,224 +513,245 @@ public class DocTreeMaker implements DocTreeFactory {
return new ArrayList<>(pair.fst); return new ArrayList<>(pair.fst);
} }
/*
 * Breaks up the body tags into the first sentence and its successors.
 * The first sentence is determined by the presence of a period,
 * block tag, or a sentence break, as returned by the BreakIterator.
 * Trailing whitespace is trimmed from the text that ends the first sentence.
 *
 * Returns a pair whose first component holds the trees of the first
 * sentence and whose second component holds the remaining body trees.
 * Both components are empty when the given collection is empty.
 */
private Pair<List<DCTree>, List<DCTree>> splitBody(Collection<? extends DocTree> list) {
// pos is modified as we create trees, therefore
// we save the pos and restore it later.
final int savedpos = this.pos;
try {
ListBuffer<DCTree> body = new ListBuffer<>();
// split body into first sentence and body
ListBuffer<DCTree> fs = new ListBuffer<>();
if (list.isEmpty()) {
return new Pair<>(fs.toList(), body.toList());
}
boolean foundFirstSentence = false;
// copy into an ArrayList so the element after the cursor can be peeked by index
ArrayList<DocTree> alist = new ArrayList<>(list);
ListIterator<DocTree> itr = alist.listIterator();
while (itr.hasNext()) {
// true only for the very first element of the list
boolean isFirst = !itr.hasPrevious();
DocTree dt = itr.next();
int spos = ((DCTree) dt).pos;
// once the first sentence is complete, everything else goes to the body
if (foundFirstSentence) {
body.add((DCTree) dt);
continue;
}
switch (dt.getKind()) {
case RETURN:
case SUMMARY:
// the tag ends the first sentence; it is itself added to fs after the switch
foundFirstSentence = true;
break;
case TEXT:
DCText tt = (DCText) dt;
String s = tt.getBody();
// look at the following tree without consuming it (null at end of list)
DocTree peekedNext = itr.hasNext()
? alist.get(itr.nextIndex())
: null;
int sbreak = getSentenceBreak(s, peekedNext);
if (sbreak > 0) {
// break inside this text: the part before it ends the first sentence
s = s.substring(0, sbreak).stripTrailing();
DCText text = this.at(spos).newTextTree(s);
fs.add(text);
foundFirstSentence = true;
// the remainder, minus leading whitespace, starts the body;
// nothing is added when only whitespace follows the break
int nwPos = skipWhiteSpace(tt.getBody(), sbreak);
if (nwPos > 0) {
DCText text2 = this.at(spos + nwPos).newTextTree(tt.getBody().substring(nwPos));
body.add(text2);
}
continue;
} else if (itr.hasNext()) {
// if the next doctree is a break, remove trailing spaces
peekedNext = alist.get(itr.nextIndex());
boolean sbrk = isSentenceBreak(peekedNext, false);
if (sbrk) {
// consume the breaking tree as well: it becomes the first body element
DocTree next = itr.next();
s = s.stripTrailing();
DCText text = this.at(spos).newTextTree(s);
fs.add(text);
body.add((DCTree) next);
foundFirstSentence = true;
continue;
}
}
break;
default:
// a sentence-breaking tree that is not first in the list starts the body
if (isSentenceBreak(dt, isFirst)) {
body.add((DCTree) dt);
foundFirstSentence = true;
continue;
}
break;
}
// reached only via 'break' above: the tree belongs to the first sentence
fs.add((DCTree) dt);
}
return new Pair<>(fs.toList(), body.toList());
} finally {
this.pos = savedpos;
}
}
/*
 * Tells whether the given tree is a plain text node.
 */
private boolean isTextTree(DocTree tree) {
    final boolean isText = Kind.TEXT == tree.getKind();
    return isText;
}
/*
 * Locates the first sentence break with a simple dot-space rule:
 * the result is the index of the first whitespace character
 * (space, form feed, newline, carriage return or tab) that directly
 * follows a run of one or more periods, or -1 if there is none.
 */
private int defaultSentenceBreak(String s) {
    final int length = s.length();
    // true while the most recent non-whitespace character was a period
    boolean periodPending = false;
    int index = 0;
    while (index < length) {
        final char ch = s.charAt(index);
        if (ch == '.') {
            periodPending = true;
        } else if (ch == ' ' || ch == '\f' || ch == '\n' || ch == '\r' || ch == '\t') {
            if (periodPending) {
                return index;
            }
        } else {
            // any other character cancels a previously seen period
            periodPending = false;
        }
        index++;
    }
    return -1;
}
/*
 * Computes the first sentence break in the text s, given the tree dt that
 * follows the text (or null if the text is the last tree).
 * If no BreakIterator is available, the default dot-space breaker is used
 * and its result is returned. Otherwise the break reported by the
 * BreakIterator is returned when it can be confirmed, or -1, indicating
 * that more doctree elements are required to be examined.
 *
 * BreakIterator.next points to the start of the following sentence,
 * and does not provide an easy way to disambiguate between "sentence break",
 * "possible sentence break" and "not a sentence break" at the end of the input.
 * For example, BreakIterator.next returns the index for the end
 * of the string for all of these examples,
 * using vertical bars to delimit the bounds of the example text
 * |Abc| (not a valid end of sentence break, if followed by more text)
 * |Abc.| (maybe a valid end of sentence break, depending on the following text)
 * |Abc. | (maybe a valid end of sentence break, depending on the following text)
 * |"Abc." | (maybe a valid end of sentence break, depending on the following text)
 * |Abc.  | (definitely a valid end of sentence break)
 * |"Abc."  | (definitely a valid end of sentence break)
 * Therefore, we have to probe further to determine whether
 * there really is a sentence break or not at the end of this run of text.
 */
private int getSentenceBreak(String s, DocTree dt) {
BreakIterator breakIterator = trees.getBreakIterator();
if (breakIterator == null) {
return defaultSentenceBreak(s);
}
// first boundary reported for this text on its own
breakIterator.setText(s);
final int sbrk = breakIterator.next();
// This is the last doctree, found the droid we are looking for
if (dt == null) {
return sbrk;
}
// If the break is well within the span of the string ie. not
// at EOL, then we have a clear break.
if (sbrk < s.length() - 1) {
return sbrk;
}
if (isTextTree(dt)) {
// Two adjacent text trees, a corner case, perhaps
// produced by a tool synthesizing a doctree. In
// this case, does the break lie within the first span,
// then we have the droid, otherwise allow the callers
// logic to handle the break in the adjacent doctree.
TextTree ttnext = (TextTree) dt;
String combined = s + ttnext.getBody();
breakIterator.setText(combined);
int sbrk2 = breakIterator.next();
// accept the original break if the combined text breaks later than it
if (sbrk < sbrk2) {
return sbrk;
}
}
// Is the adjacent tree a sentence breaker ?
if (isSentenceBreak(dt, false)) {
return sbrk;
}
// At this point the adjacent tree is either a javadoc tag ({@..),
// html tag (<..) or an entity (&..). Perform a litmus test, by
// concatenating a sentence, to validate the break earlier identified.
String combined = s + "Dummy Sentence.";
breakIterator.setText(combined);
int sbrk2 = breakIterator.next();
// the break is confirmed if it does not move past the original position
// once more text follows
if (sbrk2 <= sbrk) {
return sbrk2;
}
return -1; // indeterminate at this time
}
/*
 * Tells whether the element with the given name is one of the
 * sentence-breaking tags; the name is upper-cased before the lookup.
 */
private boolean isSentenceBreak(Name tagName) {
    final String upperCaseName = StringUtils.toUpperCase(tagName.toString());
    return sentenceBreakTags.contains(upperCaseName);
}
/*
 * Tells whether the given tree ends the first sentence. Only start and
 * end elements qualify, and only when the tree is not the first doctree,
 * its position is greater than 1, and its name is a sentence-breaking tag.
 */
private boolean isSentenceBreak(DocTree dt, boolean isFirstDocTree) {
    final Kind kind = dt.getKind();
    switch (kind) {
        case START_ELEMENT:
        case END_ELEMENT:
            break;
        default:
            // no other kind of tree can break a sentence
            return false;
    }
    if (isFirstDocTree || ((DCTree) dt).pos <= 1) {
        return false;
    }
    final Name elementName = (kind == Kind.START_ELEMENT)
            ? ((StartElementTree) dt).getName()
            : ((EndElementTree) dt).getName();
    return isSentenceBreak(elementName);
}
/*
 * Returns the index of the first non-whitespace character at or after
 * the given start index, or -1 if only whitespace (or nothing) remains.
 */
private int skipWhiteSpace(String s, int start) {
    final int end = s.length();
    int index = start;
    while (index < end && Character.isWhitespace(s.charAt(index))) {
        index++;
    }
    return (index < end) ? index : -1;
}
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private List<DCTree> cast(List<? extends DocTree> list) { private static List<DCTree> cast(List<? extends DocTree> list) {
return (List<DCTree>) list; return (List<DCTree>) list;
} }
/*
 * Splits the given list of trees into a first sentence and a body,
 * delegating the work to the sentence breaker of this tree maker.
 */
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
    final Pair<List<DCTree>, List<DCTree>> sentenceAndBody = breaker.splitBody(list);
    return sentenceAndBody;
}
static class SentenceBreaker {
    // The tree maker used to create replacement text trees;
    // its current position is saved and restored by splitBody.
    final DocTreeMaker m;

    // Upper-case names of the block tags that act as sentence breakers
    // when they appear anywhere but the zeroth position in the first sentence.
    static final Set<String> sentenceBreakTags = Set.of(
            "H1", "H2", "H3", "H4", "H5", "H6",
            "PRE", "P");

    SentenceBreaker(DocTreeMaker m) {
        this.m = m;
    }

    /*
     * Divides a list of body trees into the first sentence and the rest.
     * The first sentence ends at a period, a block tag, or a sentence
     * break reported by the BreakIterator; trailing whitespace is removed
     * from the text that ends it.
     *
     * Returns a pair whose first component is the first sentence and whose
     * second component is the remaining body; both are empty for an empty list.
     */
    Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
        if (list.isEmpty()) {
            return new Pair<>(List.of(), List.of());
        }

        // Creating trees moves the maker's position, so remember it here
        // and put it back on the way out.
        final int posOnEntry = m.pos;
        try {
            ListBuffer<DCTree> firstSentence = new ListBuffer<>();
            ListBuffer<DCTree> rest = new ListBuffer<>();
            // private copy giving cheap indexed access for peeking ahead
            List<DCTree> nodes = new ArrayList<>(cast(list));
            final int count = nodes.size();

            int cursor = 0;         // index of the next tree to examine
            boolean done = false;   // set once the first sentence is complete
            while (!done && cursor < count) {
                final boolean atStart = (cursor == 0);
                final DCTree node = nodes.get(cursor++);
                switch (node.getKind()) {
                    case RETURN:
                    case SUMMARY:
                        firstSentence.add(node);
                        done = true;
                        break;

                    case TEXT: {
                        final int nodePos = node.pos;
                        final String text = ((DCText) node).getBody();
                        // the tree after this one, or null at the end of the list
                        final DCTree following = (cursor < count) ? nodes.get(cursor) : null;
                        final int breakAt = getSentenceBreak(text, following);
                        if (breakAt > 0) {
                            // the break is inside this text: split it in two
                            String head = text.substring(0, breakAt).stripTrailing();
                            firstSentence.add(m.at(nodePos).newTextTree(head));
                            int tailStart = skipWhiteSpace(text, breakAt);
                            if (tailStart > 0) {
                                String tail = text.substring(tailStart);
                                rest.add(m.at(nodePos + tailStart).newTextTree(tail));
                            }
                            done = true;
                        } else if (following != null && isSentenceBreak(following, false)) {
                            // the next tree is a break: consume it, and drop the
                            // trailing spaces of this text
                            cursor++;
                            firstSentence.add(m.at(nodePos).newTextTree(text.stripTrailing()));
                            rest.add(following);
                            done = true;
                        } else {
                            firstSentence.add(node);
                        }
                        break;
                    }

                    default:
                        // Certain block tags are ignored when they come first in the
                        // list, so that their content can supply the first sentence.
                        // It would be better if other block tags, such as lists and
                        // tables, always terminated the first sentence as well.
                        if (isSentenceBreak(node, atStart)) {
                            rest.add(node);
                            done = true;
                        } else {
                            firstSentence.add(node);
                        }
                        break;
                }
            }

            // Anything not yet examined comes after the first sentence.
            for (; cursor < count; cursor++) {
                rest.add(nodes.get(cursor));
            }
            return new Pair<>(firstSentence.toList(), rest.toList());
        } finally {
            m.pos = posOnEntry;
        }
    }

    /*
     * Locates the first sentence break with a simple dot-space rule:
     * the index of the first whitespace character that directly follows
     * a run of one or more periods, or -1 if there is none.
     */
    private int defaultSentenceBreak(String s) {
        // true while the most recent non-whitespace character was a period
        boolean afterPeriod = false;
        for (int i = 0, n = s.length(); i < n; i++) {
            switch (s.charAt(i)) {
                case '.' -> afterPeriod = true;
                case ' ', '\f', '\n', '\r', '\t' -> {
                    if (afterPeriod) {
                        return i;
                    }
                }
                default -> afterPeriod = false;
            }
        }
        return -1;
    }

    /*
     * Sets the given text on the iterator and returns the first boundary
     * that the iterator reports for it.
     */
    private static int firstBoundary(BreakIterator iterator, String text) {
        iterator.setText(text);
        return iterator.next();
    }

    /*
     * Computes the first sentence break in s, given the tree that follows
     * the text (null if there is none). Without a BreakIterator, the result
     * of the default breaker is returned. Otherwise the result is the break
     * found by the BreakIterator, or -1 when the break cannot be confirmed
     * yet and more doctree elements need to be examined.
     *
     * BreakIterator.next points to the start of the following sentence,
     * and gives no easy way to tell apart "sentence break",
     * "possible sentence break" and "not a sentence break" at the end of
     * the input. For example, it returns the index of the end of the
     * string for all of the following (vertical bars delimit the text):
     *   |Abc|        (not a valid break, if followed by more text)
     *   |Abc.|       (maybe a valid break, depending on the following text)
     *   |Abc. |      (maybe a valid break, depending on the following text)
     *   |"Abc." |    (maybe a valid break, depending on the following text)
     *   |Abc.  |     (definitely a valid break)
     *   |"Abc."  |   (definitely a valid break)
     * So further probing is needed to decide whether there really is a
     * sentence break at the end of this run of text.
     */
    private int getSentenceBreak(String s, DCTree nextTree) {
        final BreakIterator iterator = m.trees.getBreakIterator();
        if (iterator == null) {
            return defaultSentenceBreak(s);
        }

        final int candidate = firstBoundary(iterator, s);

        // Either this is the last doctree, or the break lies well within
        // the string rather than at its end: a clear break in both cases.
        if (nextTree == null || candidate < s.length() - 1) {
            return candidate;
        }

        if (nextTree.getKind() == Kind.TEXT) {
            // Two adjacent text trees: a corner case, perhaps produced by a
            // tool synthesizing a doctree. Accept the break if the combined
            // text breaks later than it; otherwise leave it to the caller's
            // handling of the adjacent doctree.
            String joined = s + ((TextTree) nextTree).getBody();
            if (candidate < firstBoundary(iterator, joined)) {
                return candidate;
            }
        }

        // A sentence-breaking neighbor confirms the break.
        if (isSentenceBreak(nextTree, false)) {
            return candidate;
        }

        // The neighbor is now a javadoc tag ({@..), an html tag (<..) or an
        // entity (&..). As a litmus test, append a sentence and see whether
        // the break identified earlier is still reported.
        final int probe = firstBoundary(iterator, s + "Dummy Sentence.");
        return (probe <= candidate) ? probe : -1; // -1: indeterminate at this time
    }

    /*
     * Tells whether the given tree ends the first sentence. Only start and
     * end elements qualify, and only when the tree is not the first doctree,
     * its position is greater than 1, and its name is a sentence-breaking tag.
     */
    private boolean isSentenceBreak(DCTree dt, boolean isFirstDocTree) {
        return switch (dt.getKind()) {
            case START_ELEMENT ->
                    !isFirstDocTree && dt.pos > 1
                            && isSentenceBreak(((StartElementTree) dt).getName());
            case END_ELEMENT ->
                    !isFirstDocTree && dt.pos > 1
                            && isSentenceBreak(((EndElementTree) dt).getName());
            default -> false;
        };
    }

    /*
     * Tells whether the element with the given name is one of the
     * sentence-breaking tags; the name is upper-cased before the lookup.
     */
    private boolean isSentenceBreak(Name tagName) {
        final String upperCaseName = StringUtils.toUpperCase(tagName.toString());
        return sentenceBreakTags.contains(upperCaseName);
    }

    /*
     * Returns the index of the first non-whitespace character at or after
     * the given start index, or -1 if only whitespace (or nothing) remains.
     */
    private int skipWhiteSpace(String s, int start) {
        final int end = s.length();
        int index = start;
        while (index < end && Character.isWhitespace(s.charAt(index))) {
            index++;
        }
        return (index < end) ? index : -1;
    }
}
} }

View File

@ -61,7 +61,6 @@ import com.sun.tools.javac.tree.DCTree;
import com.sun.tools.javac.tree.DCTree.DCDocComment; import com.sun.tools.javac.tree.DCTree.DCDocComment;
import com.sun.tools.javac.tree.DCTree.DCReference; import com.sun.tools.javac.tree.DCTree.DCReference;
import com.sun.tools.javac.tree.JCTree.JCCompilationUnit; import com.sun.tools.javac.tree.JCTree.JCCompilationUnit;
import com.sun.tools.javac.util.List;
import com.sun.tools.javac.util.Pair; import com.sun.tools.javac.util.Pair;
public class SourceDocTreeScannerTest extends AbstractTreeScannerTest { public class SourceDocTreeScannerTest extends AbstractTreeScannerTest {