/* * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package tidystats; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.FileSystem; import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Generate statistics from the files generated by tidy.sh. * *
 * <p>The tidy.sh script is used to run tidy on all the HTML files
 * in a directory, creating files in a new directory, and for each
 * HTML file, it writes the console output from tidy into a file
 * beside the fixed up file, with an additional .tidy extension.
 *
 * <p>This program will scan a directory for *.tidy files and
 * analyze the messages reported by tidy, in order to generate a
 * report with statistics on the various messages that were
 * reported by tidy.
 *
 * <p>Typical usage:
 * <pre>
 * $ bash /path/to/tidy.sh /path/to/htmldir
 * $ javac -d /path/to/classes /path/to/Main.java
 * $ java -cp /path/to/classes tidystats.Main /path/to/htmldir.tidy
 * </pre>
 *
 * <p>Internally, the program works by matching lines in the *.tidy
* files against a series of regular expressions that are used to
* categorize the messages. The set of regular expressions was
* empirically determined by running the program on the output from
* running tidy.sh on all the generated JDK documentation. It is
* possible that tidy may generate more/different messages on other
* doc sets, in which case, the set of regexes in the program should
* be updated.
*/
public class Main {
/**
 * Command-line entry point: creates a new {@code Main} instance and
 * delegates to {@link #run(String...)} with the arguments unchanged.
 *
 * @param args the command-line arguments, passed straight to {@code run}
 * @throws IOException if {@code run} throws it; it is not caught here
 */
public static void main(String... args) throws IOException {
new Main().run(args);
}
/**
 * Runs the tool for the given arguments.
 *
 * NOTE(review): only the first statement of this method (obtaining the
 * default {@link FileSystem}) is intact in this copy of the file, so the
 * rest of its behavior cannot be described from here.
 *
 * @param args the command-line arguments; presumably paths to scan for
 *             *.tidy files, per the class description -- TODO confirm
 * @throws IOException declared by the signature; the throwing code is not
 *                     visible in this copy
 */
void run(String... args) throws IOException {
FileSystem fs = FileSystems.getDefault();
// NOTE(review): the text from here down to the closing "};" looks damaged.
// The remainder of run(), any members that followed it, and the declaration
// that opens this array of patterns are missing. Several of the regex
// literals below also appear to have lost text, and two of them are split
// across lines, which is not legal Java. It looks as if HTML-like tokens
// were stripped from the file -- restore this section from the upstream
// source before compiling. The code is deliberately left untouched here.
//
// What is visible: a comma-separated list of compiled regular expressions,
// each beginning with ".*" followed by a "Warning: ..." message template.
// Per the class description, lines of the *.tidy files are matched against
// such expressions in order to categorize tidy's messages.
List by "),
Pattern.compile(".*Warning: <.*> element removed from HTML5"),
Pattern.compile(".*Warning: <.*> attribute \".*\" not allowed for HTML5"),
Pattern.compile(".*Warning: The summary attribute on the proprietary attribute \"pre\""),
Pattern.compile(".*Warning: discarding unexpected <.*>"),
Pattern.compile(".*Warning: discarding unexpected "),
Pattern.compile(".*Warning: entity \".*\" doesn't end in ';'"),
Pattern.compile(".*Warning: inserting implicit <.*>"),
Pattern.compile(".*Warning: inserting missing 'title' element"),
Pattern.compile(".*Warning: missing declaration"),
Pattern.compile(".*Warning: missing <.*>"),
Pattern.compile(".*Warning: missing before <.*>"),
Pattern.compile(".*Warning: nested emphasis <.*>"),
Pattern.compile(".*Warning: plain text isn't allowed in <.*> elements"),
Pattern.compile(".*Warning: replacing
"),
Pattern.compile(".*Warning: replacing invalid numeric character reference .*"),
Pattern.compile(".*Warning: replacing unexpected .* by "),
Pattern.compile(".*Warning: trimming empty <.*>"),
Pattern.compile(".*Warning: unescaped & or unknown entity \".*\""),
Pattern.compile(".*Warning: unescaped & which should be written as &"),
Pattern.compile(".*Warning: using
in place of element is obsolete in HTML5"),
Pattern.compile(".*Warning: replacing invalid UTF-8 bytes \\(char. code U\\+.*\\)")
};
// Integer tallies. The code that updates and reports them is not visible in
// this copy, so the meanings below are inferred from the names only.
// NOTE(review): presumably files = number of *.tidy files examined,
// ok = files with no messages, warns / errs = files or messages with
// warnings / errors, css = CSS-related messages, overflow = overflow-related
// messages -- TODO confirm against the full source.
int files;
int ok;
int warns;
int errs;
int css;
int overflow;
}