/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cogroo; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Scanner; import org.apache.log4j.Logger; import br.usp.pcs.lta.cogroo.configuration.LegacyRuntimeConfiguration; import br.usp.pcs.lta.cogroo.configuration.RuntimeConfigurationI; import br.usp.pcs.lta.cogroo.entity.Mistake; import br.usp.pcs.lta.cogroo.entity.Sentence; import br.usp.pcs.lta.cogroo.entity.Token; import br.usp.pcs.lta.cogroo.entity.impl.runtime.MistakeImpl; import br.usp.pcs.lta.cogroo.grammarchecker.CheckerResult; import br.usp.pcs.lta.cogroo.grammarchecker.CogrooI; import br.usp.pcs.lta.cogroo.tools.ProcessingEngine; import br.usp.pcs.lta.cogroo.tools.checker.Checker; import br.usp.pcs.lta.cogroo.tools.dictionary.CogrooTagDictionary; import br.usp.pcs.lta.cogroo.tools.dictionary.impl.Merger; import br.usp.pcs.lta.cogroo.tools.sentencedetector.SentenceDetectorI; public class MultiCogroo implements CogrooI { protected SentenceDetectorI sentDetect; protected ProcessingEngine tokenizer; protected ProcessingEngine pretagger; // protected ProcessingEngine nameFind; protected ProcessingEngine tagger; protected ProcessingEngine featurizer; protected ProcessingEngine chunker; protected ProcessingEngine shallowParser; protected CogrooTagDictionary tagDictionary; protected Checker checker; protected Merger merger; private PostPOSTagger postPOSTagger; protected static final Logger LOGGER = Logger.getLogger(MultiCogroo.class); public MultiCogroo() { } public MultiCogroo(RuntimeConfigurationI config) { if (LOGGER.isDebugEnabled()) { LOGGER.debug(">>> init()"); } LOGGER.info("Loading Resources..."); try { // Loads dictionaries. LOGGER.info("Loading Dictionaries..."); long start = System.nanoTime(); this.tagDictionary = config.getTagDictionary(); LOGGER.info("Dictionaries loaded in " + (System.nanoTime() - start) / 1000000 + "ms"); this.pretagger = new MultiPretagger(config); // Loads models. LOGGER.info("Loading Models..."); long modelLoadingStart = System.nanoTime(); start = modelLoadingStart; this.sentDetect = new MultiSentenceDetector(config); if (LOGGER.isDebugEnabled()) { LOGGER.debug(" [Sentence Detector]\t\tmodel loaded in\t[" + (System.nanoTime() - start) / 1000000 + "ms]"); } start = System.nanoTime(); this.tokenizer = new MultiTokenizer(config); if (LOGGER.isDebugEnabled()) { LOGGER.debug(" [Tokenizer]\t\tmodel loaded in\t[" + (System.nanoTime() - start) / 1000000 + "ms]"); } // start = System.nanoTime(); // this.nameFind = config.getNameFinder(); // if (LOGGER.isDebugEnabled()) { // LOGGER.debug(" [Namefinder]\t\tmodel loaded in\t[" // + (System.nanoTime() - start) / 1000000 + "ms]"); // } start = System.nanoTime(); this.tagger = new MultiPOSTagger(config); if (LOGGER.isDebugEnabled()) { LOGGER.debug(" [Tagger]\t\tmodel loaded in\t[" + (System.nanoTime() - start) / 1000000 + "ms]"); } postPOSTagger = new PostPOSTagger(config.getTagDictionary()); start = System.nanoTime(); this.chunker = new MultiChunker(config); if (LOGGER.isDebugEnabled()) { LOGGER.debug(" [Chunker]\t\tmodel loaded in\t[" + (System.nanoTime() - start) / 1000000 + "ms]"); } start = System.nanoTime(); this.shallowParser = new MultiShallowParser(config); if (LOGGER.isDebugEnabled()) { LOGGER.debug(" [Shallow parser]\tmodel loaded in\t[" + (System.nanoTime() - start) / 1000000 + "ms]"); } LOGGER.info("Models loaded in " + (System.nanoTime() - modelLoadingStart) / 1000000 + "ms"); } catch (Exception e) { throw new RuntimeException("CoGrOO loading failed", e); } // Forces initialization of the rules subsystem. LOGGER.info("Loading Rules..."); long start = System.nanoTime(); // this.rulesApplier = config.getRulesApplier(); this.checker = config.getChecker(); LOGGER.info("Rules loaded in " + (System.nanoTime() - start) / 1000000 + "ms"); LOGGER.info("Loading completed!"); if (LOGGER.isDebugEnabled()) { LOGGER.debug("<<< init()"); } } /* * (non-Javadoc) * * @see * br.usp.pcs.lta.cogroo.grammarchecker.CogrooI#checkText(java.lang.String) */ public List<Mistake> checkText(String text) { List<Mistake> mistakes = null; try { mistakes = analyseAndCheckText(text).mistakes; } catch (IndexOutOfBoundsException e) { LOGGER.fatal("Failed to process text: " + text, e); throw new RuntimeException(e); } return mistakes; } public int checkFirstSentence(String paraText, List<Mistake> outMistakes) { CheckerResult res = analyseAndCheckText(paraText, true); outMistakes.addAll(res.mistakes); return res.sentences.get(0).getSentence().length(); } public CheckerResult analyseAndCheckText(String text) { return analyseAndCheckText(text, false); } private CheckerResult analyseAndCheckText(String text, boolean isFirsSentenceOnly) { long start = System.nanoTime(); /* * If an exception occurs when processing the sentence, simply returns an * empty mistakes list. CoGrOO must never die because of a bad user entry, * since its setup time is very expensive. */ if (LOGGER.isDebugEnabled()) { LOGGER.debug(">>> checkAndGetSentence()"); LOGGER.debug("Text entered: " + text); } List<Sentence> sentences = null; List<Mistake> mistakes = new ArrayList<Mistake>(); if (text != null && !"".equals(text)) { // Protect // against bad user entries. try { sentences = this.sentDetect.process(text); for (Sentence sentence : sentences) { // Prepares the sentence to apply rules. this.tokenizer.process(sentence); // this.nameFind.process(sentence); this.pretagger.process(sentence); try { this.tagger.process(sentence); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("tagger failed"); return null; } if (MultiCogrooSettings.TOK) { this.postPOSTagger.process(sentence); } // this.merger.generalizePOSTags(sentence, this.tagDictionary); this.chunker.process(sentence); this.shallowParser.process(sentence); // Just verifying. if (LOGGER.isDebugEnabled()) { StringBuilder trace = new StringBuilder(); trace.append("Show tree [" + sentence.getSentence() + "]: \n"); List<Token> tokens = sentence.getTokens(); for (int i = 0; i < tokens.size(); i++) { trace.append("\t[" + tokens.get(i).getSyntacticTag() + "][" + tokens.get(i).getChunkTag() + "] " + tokens.get(i) + " --> {" + tokens.get(i).getPrimitive() + "}_" + tokens.get(i).getMorphologicalTag() + "\n"); } LOGGER.debug(trace.toString()); } // Mistakes to be returned. mistakes.addAll(this.checker.check(sentence)); // Just verifying. if (LOGGER.isDebugEnabled()) { for (Mistake mistake : mistakes) { LOGGER.debug("rule[" + ((MistakeImpl) mistake).getRuleIdentifier() + "], span[" + ((MistakeImpl) mistake).getStart() + ", " + ((MistakeImpl) mistake).getEnd() + "]"); } } if (isFirsSentenceOnly) { break; } } } catch (Exception e) { LOGGER.error("Error processing text: " + text + " sentences: " + sentences, e); } LOGGER.debug("Check sentence time: " + (System.nanoTime() - start) / 1000 + "us"); } return new CheckerResult(sentences, mistakes); } /** * @param args */ public static void main(String[] args) { long start = System.nanoTime(); CogrooI cogroo = new MultiCogroo( new LegacyRuntimeConfiguration( "/Users/wcolen/Documents/wrks/corpuswrk/cogroo3/CoGrOOBase/target/CoGrOOBase-3.1.3a-SNAPSHOT-bin")); // THE // CoGrOO! System.out.println("Loading time [" + ((System.nanoTime() - start) / 1000000) + "ms]"); System.out.println("Default: " + Charset.defaultCharset()); Scanner kb = new Scanner(System.in); System.out.print("Enter the sentence: "); String input = kb.nextLine(); while (!input.equals("q")) { if (input.equals("0")) { // input = // "couves-flores, amores-perfeitos, gentis-homens, quintas-feiras, guarda-roupas, alto-falantes, reco-recos, águas-de-colônia, cavalos-vapor, palavras-chave, bota-fora, saca-rolhas, louva-a-deus"; // input = "Os olhos das meninas são bonitos nas estrelas."; // input = "Os inimigos que eram fácil derrotar estão próximo."; // input = "A construção do trecho inicial da Linha 5-Lilás."; input = "Enviei os documentos à Vossa Excelência. Enviei os documentos à Vossa Santidade."; // 114: Jamais ocorreu-nos tal idéia. // 115: Júlio namorou com Marina durante três anos. // } try { CheckerResult cr = cogroo.analyseAndCheckText(input); for (Mistake mistake : cr.mistakes) { System.out.println("[" + mistake.getStart() + ".." + mistake.getEnd() + "] = [" + input.substring(((MistakeImpl) mistake).getStart(), ((MistakeImpl) mistake).getEnd()) + "]"); System.out.println(mistake.toString()); } for (Sentence s : cr.sentences) { System.out.println(s.getSentence()); System.out.println(s.getSyntaxTree()); System.out.println(s); } } catch (Exception e) { e.printStackTrace(); } System.out.print("Enter the sentence: "); input = kb.nextLine(); } } public CogrooTagDictionary getTagDictionary() { return this.tagDictionary; } }