8304433: cleanup sentence breaker code in DocTreeMaker
Reviewed-by: hannesw
This commit is contained in:
parent
c396f1ed8b
commit
80e979720a
@ -27,10 +27,8 @@ package com.sun.tools.javac.tree;
|
||||
|
||||
import java.text.BreakIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.lang.model.element.Name;
|
||||
@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
/** The context key for the tree factory. */
|
||||
protected static final Context.Key<DocTreeMaker> treeMakerKey = new Context.Key<>();
|
||||
|
||||
// A subset of block tags that act as sentence breakers when appearing
|
||||
// anywhere but the zero'th position in the first sentence.
|
||||
final Set<String> sentenceBreakTags;
|
||||
|
||||
/** Get the TreeMaker instance. */
|
||||
public static DocTreeMaker instance(Context context) {
|
||||
DocTreeMaker instance = context.get(treeMakerKey);
|
||||
@ -127,6 +121,7 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
public int pos;
|
||||
|
||||
private final JavacTrees trees;
|
||||
private final SentenceBreaker breaker;
|
||||
|
||||
/** Utility class to parse reference signatures. */
|
||||
private final ReferenceParser referenceParser;
|
||||
@ -139,7 +134,7 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
this.pos = Position.NOPOS;
|
||||
trees = JavacTrees.instance(context);
|
||||
referenceParser = new ReferenceParser(ParserFactory.instance(context));
|
||||
sentenceBreakTags = Set.of("H1", "H2", "H3", "H4", "H5", "H6", "PRE", "P");
|
||||
breaker = new SentenceBreaker(this);
|
||||
}
|
||||
|
||||
/** Reassign current position.
|
||||
@ -518,90 +513,114 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
return new ArrayList<>(pair.fst);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static List<DCTree> cast(List<? extends DocTree> list) {
|
||||
return (List<DCTree>) list;
|
||||
}
|
||||
|
||||
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
|
||||
return breaker.splitBody(list);
|
||||
}
|
||||
|
||||
static class SentenceBreaker {
|
||||
final DocTreeMaker m;
|
||||
|
||||
// A subset of block tags that act as sentence breakers when appearing
|
||||
// anywhere but the zero'th position in the first sentence.
|
||||
static final Set<String> sentenceBreakTags = Set.of(
|
||||
"H1", "H2", "H3", "H4", "H5", "H6",
|
||||
"PRE", "P");
|
||||
|
||||
SentenceBreaker(DocTreeMaker m) {
|
||||
this.m = m;
|
||||
}
|
||||
|
||||
/*
|
||||
* Breaks up the body tags into the first sentence and its successors.
|
||||
* The first sentence is determined by the presence of a period,
|
||||
* block tag, or a sentence break, as returned by the BreakIterator.
|
||||
* Trailing whitespace is trimmed.
|
||||
*/
|
||||
private Pair<List<DCTree>, List<DCTree>> splitBody(Collection<? extends DocTree> list) {
|
||||
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
|
||||
if (list.isEmpty()) {
|
||||
return new Pair<>(List.of(), List.of());
|
||||
}
|
||||
// pos is modified as we create trees, therefore
|
||||
// we save the pos and restore it later.
|
||||
final int savedpos = this.pos;
|
||||
final var savedPos = m.pos;
|
||||
try {
|
||||
ListBuffer<DCTree> body = new ListBuffer<>();
|
||||
// split body into first sentence and body
|
||||
ListBuffer<DCTree> fs = new ListBuffer<>();
|
||||
if (list.isEmpty()) {
|
||||
return new Pair<>(fs.toList(), body.toList());
|
||||
}
|
||||
boolean foundFirstSentence = false;
|
||||
ArrayList<DocTree> alist = new ArrayList<>(list);
|
||||
ListIterator<DocTree> itr = alist.listIterator();
|
||||
while (itr.hasNext()) {
|
||||
boolean isFirst = !itr.hasPrevious();
|
||||
DocTree dt = itr.next();
|
||||
int spos = ((DCTree) dt).pos;
|
||||
if (foundFirstSentence) {
|
||||
body.add((DCTree) dt);
|
||||
continue;
|
||||
}
|
||||
// split list into first sentence and body
|
||||
var fs = new ListBuffer<DCTree>();
|
||||
var body = new ListBuffer<DCTree>();
|
||||
var alist = new ArrayList<>(cast(list)); // copy to allow indexed access for peeking
|
||||
var iter = alist.listIterator();
|
||||
var foundFirstSentence = false;
|
||||
while (iter.hasNext() && !foundFirstSentence) {
|
||||
boolean isFirst = !iter.hasPrevious();
|
||||
DCTree dt = iter.next();
|
||||
switch (dt.getKind()) {
|
||||
case RETURN:
|
||||
case SUMMARY:
|
||||
case RETURN, SUMMARY -> {
|
||||
fs.add(dt);
|
||||
foundFirstSentence = true;
|
||||
break;
|
||||
case TEXT:
|
||||
DCText tt = (DCText) dt;
|
||||
String s = tt.getBody();
|
||||
DocTree peekedNext = itr.hasNext()
|
||||
? alist.get(itr.nextIndex())
|
||||
}
|
||||
|
||||
case TEXT -> {
|
||||
var dtPos = dt.pos;
|
||||
var s = ((DCText) dt).getBody();
|
||||
var peekedNext = iter.hasNext()
|
||||
? alist.get(iter.nextIndex())
|
||||
: null;
|
||||
int sbreak = getSentenceBreak(s, peekedNext);
|
||||
if (sbreak > 0) {
|
||||
s = s.substring(0, sbreak).stripTrailing();
|
||||
DCText text = this.at(spos).newTextTree(s);
|
||||
fs.add(text);
|
||||
foundFirstSentence = true;
|
||||
int nwPos = skipWhiteSpace(tt.getBody(), sbreak);
|
||||
if (nwPos > 0) {
|
||||
DCText text2 = this.at(spos + nwPos).newTextTree(tt.getBody().substring(nwPos));
|
||||
body.add(text2);
|
||||
var fsPart = m.at(dtPos).newTextTree(s.substring(0, sbreak).stripTrailing());
|
||||
fs.add(fsPart);
|
||||
int offsetPos = skipWhiteSpace(s, sbreak);
|
||||
if (offsetPos > 0) {
|
||||
DCText bodyPart = m.at(dtPos + offsetPos).newTextTree(s.substring(offsetPos));
|
||||
body.add(bodyPart);
|
||||
}
|
||||
continue;
|
||||
} else if (itr.hasNext()) {
|
||||
foundFirstSentence = true;
|
||||
} else if (peekedNext != null) {
|
||||
// if the next doctree is a break, remove trailing spaces
|
||||
peekedNext = alist.get(itr.nextIndex());
|
||||
boolean sbrk = isSentenceBreak(peekedNext, false);
|
||||
if (sbrk) {
|
||||
DocTree next = itr.next();
|
||||
s = s.stripTrailing();
|
||||
DCText text = this.at(spos).newTextTree(s);
|
||||
fs.add(text);
|
||||
body.add((DCTree) next);
|
||||
if (isSentenceBreak(peekedNext, false)) {
|
||||
DCTree next = iter.next();
|
||||
DCText fsPart = m.at(dtPos).newTextTree(s.stripTrailing());
|
||||
fs.add(fsPart);
|
||||
body.add(next);
|
||||
foundFirstSentence = true;
|
||||
continue;
|
||||
} else {
|
||||
fs.add(dt);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if (isSentenceBreak(dt, isFirst)) {
|
||||
body.add((DCTree) dt);
|
||||
foundFirstSentence = true;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
fs.add((DCTree) dt);
|
||||
}
|
||||
return new Pair<>(fs.toList(), body.toList());
|
||||
} finally {
|
||||
this.pos = savedpos;
|
||||
} else {
|
||||
fs.add(dt);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isTextTree(DocTree tree) {
|
||||
return tree.getKind() == Kind.TEXT;
|
||||
default -> {
|
||||
// This ignores certain block tags if they appear first in the list,
|
||||
// allowing the content of that tag to provide the first sentence.
|
||||
// It would be better if other block tags always terminated the
|
||||
// first sentence as well, like lists and tables.
|
||||
if (isSentenceBreak(dt, isFirst)) {
|
||||
body.add(dt);
|
||||
foundFirstSentence = true;
|
||||
} else {
|
||||
fs.add(dt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if there are remaining elements, then we have found the first
|
||||
// sentence, and remaining elements are for the body.
|
||||
while (iter.hasNext()) {
|
||||
body.add(iter.next());
|
||||
}
|
||||
|
||||
return new Pair<>(fs.toList(), body.toList());
|
||||
} finally {
|
||||
m.pos = savedPos;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -654,15 +673,15 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
* Therefore, we have to probe further to determine whether
|
||||
* there really is a sentence break or not at the end of this run of text.
|
||||
*/
|
||||
private int getSentenceBreak(String s, DocTree dt) {
|
||||
BreakIterator breakIterator = trees.getBreakIterator();
|
||||
private int getSentenceBreak(String s, DCTree nextTree) {
|
||||
BreakIterator breakIterator = m.trees.getBreakIterator();
|
||||
if (breakIterator == null) {
|
||||
return defaultSentenceBreak(s);
|
||||
}
|
||||
breakIterator.setText(s);
|
||||
final int sbrk = breakIterator.next();
|
||||
// This is the last doctree, found the droid we are looking for
|
||||
if (dt == null) {
|
||||
if (nextTree == null) {
|
||||
return sbrk;
|
||||
}
|
||||
|
||||
@ -672,13 +691,13 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
return sbrk;
|
||||
}
|
||||
|
||||
if (isTextTree(dt)) {
|
||||
if (nextTree.getKind() == Kind.TEXT) {
|
||||
// Two adjacent text trees, a corner case, perhaps
|
||||
// produced by a tool synthesizing a doctree. In
|
||||
// this case, if the break lies within the first span,
|
||||
// then we have the droid; otherwise allow the caller's
|
||||
// logic to handle the break in the adjacent doctree.
|
||||
TextTree ttnext = (TextTree) dt;
|
||||
TextTree ttnext = (TextTree) nextTree;
|
||||
String combined = s + ttnext.getBody();
|
||||
breakIterator.setText(combined);
|
||||
int sbrk2 = breakIterator.next();
|
||||
@ -688,7 +707,7 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
}
|
||||
|
||||
// Is the adjacent tree a sentence breaker ?
|
||||
if (isSentenceBreak(dt, false)) {
|
||||
if (isSentenceBreak(nextTree, false)) {
|
||||
return sbrk;
|
||||
}
|
||||
|
||||
@ -704,23 +723,23 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
return -1; // indeterminate at this time
|
||||
}
|
||||
|
||||
private boolean isSentenceBreak(Name tagName) {
|
||||
return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString()));
|
||||
}
|
||||
|
||||
private boolean isSentenceBreak(DocTree dt, boolean isFirstDocTree) {
|
||||
private boolean isSentenceBreak(DCTree dt, boolean isFirstDocTree) {
|
||||
switch (dt.getKind()) {
|
||||
case START_ELEMENT:
|
||||
StartElementTree set = (StartElementTree) dt;
|
||||
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(set.getName());
|
||||
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(set.getName());
|
||||
case END_ELEMENT:
|
||||
EndElementTree eet = (EndElementTree) dt;
|
||||
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(eet.getName());
|
||||
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(eet.getName());
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSentenceBreak(Name tagName) {
|
||||
return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString()));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the position of the first non-whitespace character.
|
||||
*/
|
||||
@ -733,9 +752,6 @@ public class DocTreeMaker implements DocTreeFactory {
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private List<DCTree> cast(List<? extends DocTree> list) {
|
||||
return (List<DCTree>) list;
|
||||
}
|
||||
}
|
||||
|
@ -61,7 +61,6 @@ import com.sun.tools.javac.tree.DCTree;
|
||||
import com.sun.tools.javac.tree.DCTree.DCDocComment;
|
||||
import com.sun.tools.javac.tree.DCTree.DCReference;
|
||||
import com.sun.tools.javac.tree.JCTree.JCCompilationUnit;
|
||||
import com.sun.tools.javac.util.List;
|
||||
import com.sun.tools.javac.util.Pair;
|
||||
|
||||
public class SourceDocTreeScannerTest extends AbstractTreeScannerTest {
|
||||
|
Loading…
Reference in New Issue
Block a user