/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.commandline; import static org.languagetool.tools.StringTools.filterXML; import static org.languagetool.tools.StringTools.readerToString; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.stream.Collectors; import javax.xml.parsers.ParserConfigurationException; import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.BOMInputStream; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.Languages; import org.languagetool.MultiThreadedJLanguageTool; import org.languagetool.bitext.TabBitextReader; import org.languagetool.language.AmericanEnglish; import org.languagetool.language.English; import org.languagetool.language.LanguageIdentifier; import org.languagetool.rules.Rule; import org.languagetool.rules.bitext.BitextRule; import org.languagetool.rules.patterns.AbstractPatternRule; import org.languagetool.rules.patterns.PatternRuleLoader; import org.languagetool.tools.JnaTools; import org.languagetool.tools.StringTools.ApiPrintMode; import org.languagetool.tools.Tools; import org.xml.sax.SAXException; /** * The command line tool to check plain text files. */ class Main { private final CommandLineOptions options; private MultiThreadedJLanguageTool lt; private boolean profileRules; private boolean bitextMode; private MultiThreadedJLanguageTool srcLt; private List<BitextRule> bRules; private Rule currentRule; Main(CommandLineOptions options) throws IOException { this.options = options; profileRules = false; bitextMode = false; srcLt = null; bRules = null; lt = new MultiThreadedJLanguageTool(options.getLanguage(), options.getMotherTongue()); lt.setCleanOverlappingMatches(false); if (options.getRuleFile() != null) { addExternalRules(options.getRuleFile()); } if (options.getLanguageModel() != null) { lt.activateLanguageModelRules(options.getLanguageModel()); } Tools.selectRules(lt, options.getDisabledCategories(), options.getEnabledCategories(), new HashSet<>(options.getDisabledRules()), new HashSet<>(options.getEnabledRules()), options.isUseEnabledOnly()); } private void addExternalRules(String filename) throws IOException { PatternRuleLoader ruleLoader = new PatternRuleLoader(); try (InputStream is = new FileInputStream(filename)) { List<AbstractPatternRule> externalRules = ruleLoader.getRules(is, filename); for (AbstractPatternRule externalRule : externalRules) { lt.addRule(externalRule); } } } boolean isSpellCheckingActive() { List<Rule> rules = lt.getAllActiveRules(); for (Rule rule : rules) { if (rule.isDictionaryBasedSpellingRule()) { return true; } } return false; } JLanguageTool getJLanguageTool() { return lt; } private void setListUnknownWords(boolean listUnknownWords) { lt.setListUnknownWords(listUnknownWords); } private void cleanUp() { if (lt != null) { lt.shutdown(); } if (srcLt != null) { srcLt.shutdown(); } JLanguageTool.removeTemporaryFiles(); } private void setProfilingMode() { profileRules = true; } private void setBitextMode(Language sourceLang, List<String> disabledRules, List<String> enabledRules, File bitextRuleFile) throws IOException, ParserConfigurationException, SAXException { bitextMode = true; Language target = lt.getLanguage(); lt = new MultiThreadedJLanguageTool(target, null); srcLt = new MultiThreadedJLanguageTool(sourceLang); Tools.selectRules(lt, disabledRules, enabledRules, true); Tools.selectRules(srcLt, disabledRules, enabledRules, true); bRules = Tools.getBitextRules(sourceLang, lt.getLanguage(), bitextRuleFile); List<BitextRule> bRuleList = new ArrayList<>(bRules); for (BitextRule bitextRule : bRules) { for (String disabledRule : disabledRules) { if (bitextRule.getId().equals(disabledRule)) { bRuleList.remove(bitextRule); } } } bRules = bRuleList; if (enabledRules.size() > 0) { bRuleList = new ArrayList<>(); for (String enabledRule : enabledRules) { for (BitextRule bitextRule : bRules) { if (bitextRule.getId().equals(enabledRule)) { bRuleList.add(bitextRule); } } } bRules = bRuleList; } } private void runOnFile(String filename, String encoding, boolean xmlFiltering) throws IOException { if (bitextMode) { TabBitextReader reader = new TabBitextReader(filename, encoding); if (options.isApplySuggestions()) { CommandLineTools.correctBitext(reader, srcLt, lt, bRules); } else { CommandLineTools.checkBitext(reader, srcLt, lt, bRules, options.isXmlFormat()); } } else { String text = getFilteredText(filename, encoding, xmlFiltering); if (isStdIn(filename)) { System.err.println("Working on STDIN..."); } else { System.err.println("Working on " + filename + "..."); } if (options.isAutoDetect()) { Language language = detectLanguageOfString(text); if (language == null) { System.err.println("Could not detect language well enough, using American English"); language = new AmericanEnglish(); } changeLanguage(language, options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules()); System.err.println("Using " + language.getName() + " for file " + filename); } if (options.isApplySuggestions()) { System.out.print(Tools.correctText(text, lt)); } else if (profileRules) { CommandLineTools.profileRulesOnText(text, lt); } else if (!options.isTaggerOnly()) { CommandLineTools.checkText(text, lt, options.isXmlFormat(), options.isJsonFormat(), 0, options.isListUnknown()); } else { CommandLineTools.tagText(text, lt); } if (options.isListUnknown() && !options.isXmlFormat() && !options.isJsonFormat()) { System.out.println("Unknown words: " + lt.getUnknownWords()); } } } private void runOnFileLineByLine(String filename, String encoding) throws IOException { System.err.println("Warning: running in line by line mode. Cross-paragraph checks will not work.\n"); if (options.isVerbose()) { lt.setOutput(System.err); } if (!options.isXmlFormat() && !options.isApplySuggestions()) { if (isStdIn(filename)) { System.err.println("Working on STDIN..."); } else { System.err.println("Working on " + filename + "..."); } } if (profileRules && isStdIn(filename)) { throw new IllegalArgumentException("Profiling mode cannot be used with input from STDIN"); } int runCount = 1; List<Rule> rules = lt.getAllActiveRules(); if (profileRules) { System.out.printf("Testing %d rules\n", rules.size()); System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec."); runCount = rules.size(); } int lineOffset = 0; int tmpLineOffset = 0; handleLine(ApiPrintMode.START_API, 0, new StringBuilder()); StringBuilder sb = new StringBuilder(); for (int ruleIndex = 0; !rules.isEmpty() && ruleIndex < runCount; ruleIndex++) { currentRule = rules.get(ruleIndex); try ( InputStreamReader isr = getInputStreamReader(filename, encoding); BufferedReader br = new BufferedReader(isr) ) { String line; int lineCount = 0; while ((line = br.readLine()) != null) { sb.append(line); lineCount++; // to detect language from the first input line if (lineCount == 1 && options.isAutoDetect()) { Language language = detectLanguageOfString(line); if (language == null) { System.err.println("Could not detect language well enough, using American English"); language = new AmericanEnglish(); } System.err.println("Language used is: " + language.getName()); language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph( options.isSingleLineBreakMarksParagraph()); changeLanguage(language, options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules()); } sb.append('\n'); tmpLineOffset++; if (isBreakPoint(line)) { handleLine(ApiPrintMode.CONTINUE_API, lineOffset, sb); if (profileRules) { lt.sentenceTokenize(sb.toString()).size(); } sb = new StringBuilder(); lineOffset = tmpLineOffset; } } } finally { if (sb.length() > 0) { if (profileRules) { lt.sentenceTokenize(sb.toString()).size(); } } handleLine(ApiPrintMode.END_API, tmpLineOffset - 1, sb); } } } private int handleLine(ApiPrintMode mode, int lineOffset, StringBuilder sb) throws IOException { int matches = 0; String s = filterXML(sb.toString()); if (options.isApplySuggestions()) { System.out.print(Tools.correctText(s, lt)); } else if (profileRules) { matches += Tools.profileRulesOnLine(s, lt, currentRule); } else if (!options.isTaggerOnly()) { matches += CommandLineTools.checkText(s, lt, options.isXmlFormat(), options.isJsonFormat(), -1, lineOffset, matches, mode, options.isListUnknown(), Collections.<String>emptyList()); } else { CommandLineTools.tagText(s, lt); } return matches; } private boolean isBreakPoint(String line) { return lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara() || "".equals(line); } private InputStreamReader getInputStreamReader(String filename, String encoding) throws IOException { String charsetName = encoding != null ? encoding : Charset.defaultCharset().name(); InputStream is = System.in; if (!isStdIn(filename)) { is = new FileInputStream(new File(filename)); BOMInputStream bomIn = new BOMInputStream(is, true, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,ByteOrderMark.UTF_32LE); if (bomIn.hasBOM() && encoding == null) { charsetName = bomIn.getBOMCharsetName(); } is = bomIn; } return new InputStreamReader(new BufferedInputStream(is), charsetName); } private boolean isStdIn(String filename) { return "-".equals(filename); } private void runRecursive(String filename, String encoding, boolean xmlFiltering) { File dir = new File(filename); File[] files = dir.listFiles(); if (files == null) { throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory, cannot use recursion"); } for (File file : files) { try { if (file.isDirectory()) { runRecursive(file.getAbsolutePath(), encoding, xmlFiltering); } else { if (options.isLineByLine()) { runOnFileLineByLine(file.getAbsolutePath(), encoding); } else { runOnFile(file.getAbsolutePath(), encoding, xmlFiltering); } } } catch (Exception e) { throw new RuntimeException("Could not check text in file " + file, e); } } } /** * Loads filename and filters out XML. Note that the XML * filtering can lead to incorrect positions in the list of matching rules. */ private String getFilteredText(String filename, String encoding, boolean xmlFiltering) throws IOException { if (options.isVerbose()) { lt.setOutput(System.err); } // don't use StringTools.readStream() as that might add newlines which aren't there: try (InputStreamReader reader = getInputStreamReader(filename, encoding)) { String fileContents = readerToString(reader); if (xmlFiltering) { return filterXML(fileContents); } else { return fileContents; } } } private void changeLanguage(Language language, Language motherTongue, List<String> disabledRules, List<String> enabledRules) { try { lt = new MultiThreadedJLanguageTool(language, motherTongue); Tools.selectRules(lt, disabledRules, enabledRules, true); if (options.isVerbose()) { lt.setOutput(System.err); } } catch (Exception e) { throw new RuntimeException("Could not create LanguageTool instance for language " + language, e); } } /** * Command line tool to check plain text files. */ public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException { JnaTools.setBugWorkaroundProperty(); CommandLineParser commandLineParser = new CommandLineParser(); CommandLineOptions options = null; try { options = commandLineParser.parseOptions(args); } catch (WrongParameterNumberException e) { commandLineParser.printUsage(); System.exit(1); } catch (IllegalArgumentException e) { System.err.println(e.toString()); System.exit(1); } catch (UnknownParameterException e) { if (e.getMessage() != null) { System.err.println(e.getMessage()); } else { System.err.println(e.toString()); } commandLineParser.printUsage(System.err); System.exit(1); } if (options.isPrintUsage()) { commandLineParser.printUsage(); System.exit(1); } if (options.isPrintVersion()) { System.out.println("LanguageTool version " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")"); System.exit(0); } if (options.isPrintLanguages()) { printLanguages(); System.exit(0); } if (options.getFilename() == null) { options.setFilename("-"); } String languageHint = null; if (options.getLanguage() == null) { if (!options.isXmlFormat() && !options.isAutoDetect()) { System.err.println("No language specified, using English (no spell checking active, " + "specify a language variant like 'en-GB' if available)"); } options.setLanguage(new English()); } else if (!options.isXmlFormat() && !options.isApplySuggestions()) { languageHint = "Expected text language: " + options.getLanguage().getName(); } options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph( options.isSingleLineBreakMarksParagraph()); Main prg = new Main(options); if (options.getFalseFriendFile() != null) { List<AbstractPatternRule> ffRules = prg.lt.loadFalseFriendRules(options.getFalseFriendFile()); for (AbstractPatternRule ffRule : ffRules) { prg.lt.addRule(ffRule); } } if (prg.lt.getAllActiveRules().size() == 0) { List<String> catIds = options.getEnabledCategories().stream().map(i -> i.toString()).collect(Collectors.toList()); throw new RuntimeException("No rules are active. Please make sure your rule ids " + "(" + options.getEnabledRules() + ") and " + "category ids (" + catIds + ") are correct"); } if (languageHint != null) { String spellHint = prg.isSpellCheckingActive() ? "" : " (no spell checking active, specify a language variant like 'en-GB' if available)"; System.err.println(languageHint + spellHint); } prg.setListUnknownWords(options.isListUnknown()); if (options.isProfile()) { prg.setProfilingMode(); } if (options.isBitext()) { if (options.getMotherTongue() == null) { throw new IllegalArgumentException("You have to set the source language (as mother tongue) in bitext mode"); } File bitextRuleFile = options.getBitextRuleFile() != null ? new File(options.getBitextRuleFile()) : null; prg.setBitextMode(options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules(), bitextRuleFile); } if (options.isRecursive()) { prg.runRecursive(options.getFilename(), options.getEncoding(), options.isXmlFiltering()); } else { if (options.isLineByLine()) { prg.runOnFileLineByLine(options.getFilename(), options.getEncoding()); } else { prg.runOnFile(options.getFilename(), options.getEncoding(), options.isXmlFiltering()); } } prg.cleanUp(); } private static void printLanguages() { List<String> languages = new ArrayList<>(); for (Language language : Languages.get()) { languages.add(language.getShortCodeWithCountryAndVariant() + " " + language.getName()); } Collections.sort(languages); for (String s : languages) { System.out.println(s); } } private static Language detectLanguageOfString(String text) { LanguageIdentifier identifier = new LanguageIdentifier(); return identifier.detectLanguage(text); } }