8304433: cleanup sentence breaker code in DocTreeMaker

Reviewed-by: hannesw
This commit is contained in:
Jonathan Gibbons 2023-03-20 15:14:25 +00:00
parent c396f1ed8b
commit 80e979720a
2 changed files with 240 additions and 225 deletions

View File

@ -27,10 +27,8 @@ package com.sun.tools.javac.tree;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import javax.lang.model.element.Name;
@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory {
/** The context key for the tree factory. */
protected static final Context.Key<DocTreeMaker> treeMakerKey = new Context.Key<>();
// A subset of block tags that act as sentence breakers when they appear
// anywhere but the zeroth position in the first sentence.
final Set<String> sentenceBreakTags;
/** Get the TreeMaker instance. */
public static DocTreeMaker instance(Context context) {
DocTreeMaker instance = context.get(treeMakerKey);
@ -127,6 +121,7 @@ public class DocTreeMaker implements DocTreeFactory {
public int pos;
private final JavacTrees trees;
private final SentenceBreaker breaker;
/** Utility class to parse reference signatures. */
private final ReferenceParser referenceParser;
@ -139,7 +134,7 @@ public class DocTreeMaker implements DocTreeFactory {
this.pos = Position.NOPOS;
trees = JavacTrees.instance(context);
referenceParser = new ReferenceParser(ParserFactory.instance(context));
sentenceBreakTags = Set.of("H1", "H2", "H3", "H4", "H5", "H6", "PRE", "P");
breaker = new SentenceBreaker(this);
}
/** Reassign current position.
@ -518,90 +513,114 @@ public class DocTreeMaker implements DocTreeFactory {
return new ArrayList<>(pair.fst);
}
/**
 * Views a list of doc trees as a list of {@code DCTree} without copying it.
 * The cast is unchecked: nothing here verifies the element types, so the
 * caller is presumably expected to pass only lists whose elements are
 * {@code DCTree} instances -- verify against the call sites.
 *
 * @param list the list to reinterpret
 * @return the same list object, typed as {@code List<DCTree>}
 */
@SuppressWarnings("unchecked")
private static List<DCTree> cast(List<? extends DocTree> list) {
    List<DCTree> trees = (List<DCTree>) list;
    return trees;
}
/**
 * Splits the given list of doc trees by handing it to the sentence breaker
 * held in {@code breaker}, and returns that result unchanged.
 *
 * @param list the trees to split
 * @return the pair produced by {@code breaker.splitBody(list)}
 */
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
    Pair<List<DCTree>, List<DCTree>> firstSentenceAndBody = breaker.splitBody(list);
    return firstSentenceAndBody;
}
static class SentenceBreaker {
final DocTreeMaker m;
// A subset of block tags that act as sentence breakers when they appear
// anywhere but the zeroth position in the first sentence.
static final Set<String> sentenceBreakTags = Set.of(
"H1", "H2", "H3", "H4", "H5", "H6",
"PRE", "P");
// Creates a breaker bound to the given DocTreeMaker, which is kept in the field m.
SentenceBreaker(DocTreeMaker m) {
this.m = m;
}
/*
* Breaks up the body tags into the first sentence and its successors.
* The first sentence is determined by the presence of a period,
* block tag, or a sentence break, as returned by the BreakIterator.
* Trailing whitespace is trimmed.
*
* Returns a pair whose first component is the list of trees that form the
* first sentence and whose second component is the list of remaining trees.
*
* NOTE(review): two signatures and two loop bodies appear below; this span
* looks like the before and after versions of the method shown together
* with the diff markers stripped -- confirm against the real file before
* treating it as a single compilable method.
*/
private Pair<List<DCTree>, List<DCTree>> splitBody(Collection<? extends DocTree> list) {
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
// nothing to split: both parts of the result are empty
if (list.isEmpty()) {
return new Pair<>(List.of(), List.of());
}
// pos is modified as we create trees, therefore
// we save the pos and restore it later.
final int savedpos = this.pos;
final var savedPos = m.pos;
try {
ListBuffer<DCTree> body = new ListBuffer<>();
// split body into first sentence and body
ListBuffer<DCTree> fs = new ListBuffer<>();
if (list.isEmpty()) {
return new Pair<>(fs.toList(), body.toList());
}
boolean foundFirstSentence = false;
ArrayList<DocTree> alist = new ArrayList<>(list);
ListIterator<DocTree> itr = alist.listIterator();
while (itr.hasNext()) {
boolean isFirst = !itr.hasPrevious();
DocTree dt = itr.next();
int spos = ((DCTree) dt).pos;
if (foundFirstSentence) {
body.add((DCTree) dt);
continue;
}
// split list into first sentence and body
var fs = new ListBuffer<DCTree>();
var body = new ListBuffer<DCTree>();
var alist = new ArrayList<>(cast(list)); // copy to allow indexed access for peeking
var iter = alist.listIterator();
var foundFirstSentence = false;
while (iter.hasNext() && !foundFirstSentence) {
boolean isFirst = !iter.hasPrevious();
DCTree dt = iter.next();
switch (dt.getKind()) {
case RETURN:
case SUMMARY:
case RETURN, SUMMARY -> {
fs.add(dt);
foundFirstSentence = true;
break;
case TEXT:
DCText tt = (DCText) dt;
String s = tt.getBody();
DocTree peekedNext = itr.hasNext()
? alist.get(itr.nextIndex())
}
case TEXT -> {
var dtPos = dt.pos;
var s = ((DCText) dt).getBody();
var peekedNext = iter.hasNext()
? alist.get(iter.nextIndex())
: null;
// a positive result is used below as the end offset of the first sentence within s
int sbreak = getSentenceBreak(s, peekedNext);
if (sbreak > 0) {
s = s.substring(0, sbreak).stripTrailing();
DCText text = this.at(spos).newTextTree(s);
fs.add(text);
foundFirstSentence = true;
int nwPos = skipWhiteSpace(tt.getBody(), sbreak);
if (nwPos > 0) {
DCText text2 = this.at(spos + nwPos).newTextTree(tt.getBody().substring(nwPos));
body.add(text2);
var fsPart = m.at(dtPos).newTextTree(s.substring(0, sbreak).stripTrailing());
fs.add(fsPart);
// the rest of the text, starting at the offset returned by skipWhiteSpace, goes to the body
int offsetPos = skipWhiteSpace(s, sbreak);
if (offsetPos > 0) {
DCText bodyPart = m.at(dtPos + offsetPos).newTextTree(s.substring(offsetPos));
body.add(bodyPart);
}
continue;
} else if (itr.hasNext()) {
foundFirstSentence = true;
} else if (peekedNext != null) {
// if the next doctree is a break, remove trailing spaces
peekedNext = alist.get(itr.nextIndex());
boolean sbrk = isSentenceBreak(peekedNext, false);
if (sbrk) {
DocTree next = itr.next();
s = s.stripTrailing();
DCText text = this.at(spos).newTextTree(s);
fs.add(text);
body.add((DCTree) next);
if (isSentenceBreak(peekedNext, false)) {
DCTree next = iter.next();
DCText fsPart = m.at(dtPos).newTextTree(s.stripTrailing());
fs.add(fsPart);
body.add(next);
foundFirstSentence = true;
continue;
} else {
fs.add(dt);
}
}
break;
default:
if (isSentenceBreak(dt, isFirst)) {
body.add((DCTree) dt);
foundFirstSentence = true;
continue;
}
break;
}
fs.add((DCTree) dt);
}
return new Pair<>(fs.toList(), body.toList());
} finally {
this.pos = savedpos;
} else {
fs.add(dt);
}
}
private boolean isTextTree(DocTree tree) {
return tree.getKind() == Kind.TEXT;
default -> {
// This ignores certain block tags if they appear first in the list,
// allowing the content of that tag to provide the first sentence.
// It would be better if other block tags always terminated the
// first sentence as well, like lists and tables.
if (isSentenceBreak(dt, isFirst)) {
body.add(dt);
foundFirstSentence = true;
} else {
fs.add(dt);
}
}
}
}
// if there are remaining elements, then we have found the first
// sentence, and remaining elements are for the body.
while (iter.hasNext()) {
body.add(iter.next());
}
return new Pair<>(fs.toList(), body.toList());
} finally {
m.pos = savedPos;
}
}
/*
@ -654,15 +673,15 @@ public class DocTreeMaker implements DocTreeFactory {
* Therefore, we have to probe further to determine whether
* there really is a sentence break or not at the end of this run of text.
*/
private int getSentenceBreak(String s, DocTree dt) {
BreakIterator breakIterator = trees.getBreakIterator();
private int getSentenceBreak(String s, DCTree nextTree) {
BreakIterator breakIterator = m.trees.getBreakIterator();
if (breakIterator == null) {
return defaultSentenceBreak(s);
}
breakIterator.setText(s);
final int sbrk = breakIterator.next();
// This is the last doctree, found the droid we are looking for
if (dt == null) {
if (nextTree == null) {
return sbrk;
}
@ -672,13 +691,13 @@ public class DocTreeMaker implements DocTreeFactory {
return sbrk;
}
if (isTextTree(dt)) {
if (nextTree.getKind() == Kind.TEXT) {
// Two adjacent text trees, a corner case, perhaps
// produced by a tool synthesizing a doctree. In
// this case, if the break lies within the first span,
// then we have the droid; otherwise allow the caller's
// logic to handle the break in the adjacent doctree.
TextTree ttnext = (TextTree) dt;
TextTree ttnext = (TextTree) nextTree;
String combined = s + ttnext.getBody();
breakIterator.setText(combined);
int sbrk2 = breakIterator.next();
@ -688,7 +707,7 @@ public class DocTreeMaker implements DocTreeFactory {
}
// Is the adjacent tree a sentence breaker ?
if (isSentenceBreak(dt, false)) {
if (isSentenceBreak(nextTree, false)) {
return sbrk;
}
@ -704,23 +723,23 @@ public class DocTreeMaker implements DocTreeFactory {
return -1; // indeterminate at this time
}
/*
 * Returns whether the given tag name, converted to upper case, is one of
 * the names in sentenceBreakTags.
 */
private boolean isSentenceBreak(Name tagName) {
    String upperCaseName = StringUtils.toUpperCase(tagName.toString());
    return sentenceBreakTags.contains(upperCaseName);
}
/*
* Returns whether the given tree acts as a sentence break. Only trees of
* kind START_ELEMENT or END_ELEMENT can qualify: the tree must not be the
* first one (isFirstDocTree is false), its pos must be greater than 1, and
* its element name must be accepted by isSentenceBreak(Name). Every other
* kind of tree yields false.
*
* NOTE(review): two signatures and paired return statements appear below;
* this looks like the before and after versions shown together with the
* diff markers stripped -- confirm against the real file.
*/
private boolean isSentenceBreak(DocTree dt, boolean isFirstDocTree) {
private boolean isSentenceBreak(DCTree dt, boolean isFirstDocTree) {
switch (dt.getKind()) {
case START_ELEMENT:
StartElementTree set = (StartElementTree) dt;
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(set.getName());
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(set.getName());
case END_ELEMENT:
EndElementTree eet = (EndElementTree) dt;
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(eet.getName());
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(eet.getName());
default:
return false;
}
}
/*
 * Returns whether the given tag name, converted to upper case, is one of
 * the names in sentenceBreakTags.
 */
private boolean isSentenceBreak(Name tagName) {
    String upperCaseName = StringUtils.toUpperCase(tagName.toString());
    return sentenceBreakTags.contains(upperCaseName);
}
/*
* Returns the position of the first non-whitespace character.
*/
@ -733,9 +752,6 @@ public class DocTreeMaker implements DocTreeFactory {
}
return -1;
}
}
/**
 * Views a list of doc trees as a list of {@code DCTree} without copying it.
 * The cast is unchecked: nothing here verifies the element types, so the
 * caller is presumably expected to pass only lists whose elements are
 * {@code DCTree} instances -- verify against the call sites.
 *
 * @param list the list to reinterpret
 * @return the same list object, typed as {@code List<DCTree>}
 */
@SuppressWarnings("unchecked")
private List<DCTree> cast(List<? extends DocTree> list) {
    List<DCTree> trees = (List<DCTree>) list;
    return trees;
}
}

View File

@ -61,7 +61,6 @@ import com.sun.tools.javac.tree.DCTree;
import com.sun.tools.javac.tree.DCTree.DCDocComment;
import com.sun.tools.javac.tree.DCTree.DCReference;
import com.sun.tools.javac.tree.JCTree.JCCompilationUnit;
import com.sun.tools.javac.util.List;
import com.sun.tools.javac.util.Pair;
public class SourceDocTreeScannerTest extends AbstractTreeScannerTest {