/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cogroo;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import br.usp.pcs.lta.cogroo.configuration.LegacyRuntimeConfiguration;
import br.usp.pcs.lta.cogroo.entity.Sentence;
import br.usp.pcs.lta.cogroo.entity.Token;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.MorphologicalTag;
import br.usp.pcs.lta.cogroo.entity.tree.Leaf;
import br.usp.pcs.lta.cogroo.entity.tree.Node;
import br.usp.pcs.lta.cogroo.entity.tree.TreeElement;
import br.usp.pcs.lta.cogroo.grammarchecker.CheckerResult;
import br.usp.pcs.lta.cogroo.grammarchecker.CogrooI;
import br.usp.pcs.lta.cogroo.tag.LegacyTagInterpreter;
import br.usp.pcs.lta.cogroo.tag.TagInterpreterI;

/**
 * Command-line tool that annotates a report file with grammar-checker
 * analysis.
 *
 * <p>The report is read line by line and every line is copied to the output
 * file. For each line that starts with {@code "["}, the last tab-separated
 * field is taken as text, analysed with {@link CogrooI#analyseAndCheckText},
 * and the flat token structure plus the syntax tree of every resulting
 * sentence is written right after the copied line.
 */
public class ProcessReport {

  // tag interpreter is responsible for serializing and reading tags.
  // .. the LegacyTagInterpreter follow a variant of GC tagset:
  // .. http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html
  private static final TagInterpreterI tagInterpreter = new LegacyTagInterpreter();

  CogrooI cogroo;

  /** Path of the report file to read (decoded as UTF-8). */
  private final String report;

  /** Path of the annotated file to write (encoded as UTF-8). */
  private final String output;

  /**
   * Creates a processor backed by a {@code MultiCogroo} instance.
   *
   * @param resources value handed to {@link LegacyRuntimeConfiguration}
   * @param report path of the report file to read
   * @param output path of the file to write
   */
  public ProcessReport(String resources, String report, String output) {
    // UIMA will load modules using envvar!
    System.out.println("Using uima modules from: "
        + System.getenv("UIMA_DATAPATH"));
    // the rest we load normally
    this.cogroo = new MultiCogroo(new LegacyRuntimeConfiguration(resources));
    this.report = report;
    this.output = output;
  }

  /**
   * Copies the report to the output file, appending the analysis of every
   * line for which {@link #getText(String)} yields text.
   *
   * <p>Both streams are closed even when reading, writing or the analysis
   * itself fails, so a failure half-way through does not leak file handles.
   *
   * @throws IOException if the report cannot be read or the output cannot be
   *         written
   */
  private void processFile() throws IOException {
    BufferedReader in = new BufferedReader(new InputStreamReader(
        new FileInputStream(this.report), "utf-8"));
    try {
      BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
          new FileOutputStream(this.output), "utf-8"));
      try {
        String line = in.readLine();
        while (line != null) {
          // every input line is echoed; analysed lines get extra output below
          out.append(line).append("\n");
          String text = getText(line);
          if (text != null) {
            out.append(process(text)).append("\n\n");
          }
          line = in.readLine();
        }
      } finally {
        // closing also flushes whatever is still buffered
        out.close();
      }
    } finally {
      in.close();
    }
  }

  /**
   * Analyses a text and renders the result as a printable string.
   *
   * <p>For each sentence the output holds a "Flat structure for:" line,
   * followed by every token as {@code lexeme [primitive tag] }, followed by
   * a "Syntax tree:" entry.
   *
   * @param text the text to analyse
   * @return the rendered analysis, or {@code "Processing failed: " + text}
   *         when the checker returns {@code null}
   * @throws IllegalArgumentException if a token has no morphological tag
   */
  private String process(String text) {
    CheckerResult results = cogroo.analyseAndCheckText(text);
    if (results == null) {
      return "Processing failed: " + text;
    }

    StringBuilder sb = new StringBuilder();
    for (Sentence sentence : results.sentences) {
      sb.append("Flat structure for: ").append(sentence.getSentence())
          .append("\n");

      for (Token token : sentence.getTokens()) {
        // print the text
        sb.append(token.getLexeme());
        // print the lemma
        sb.append(" [" + token.getPrimitive() + " ");

        // print the morphological tag, we use a tag interpreter here
        MorphologicalTag tag = token.getMorphologicalTag();
        if (tag == null) {
          throw new IllegalArgumentException("Morphological tag is missing! "
              + sentence.toString());
        }
        sb.append(mtagToStr(tag)).append("] ");
      }

      // NOTE(review): nothing is appended after the tree, so the next
      // sentence's header directly follows whatever getSyntaxTree() renders
      // -- confirm this is the intended layout.
      sb.append("\n").append("Syntax tree: " + sentence.getSyntaxTree());
    }
    return sb.toString();
  }

  /**
   * Extracts the text to analyse from a report line.
   *
   * @param line a line of the report, never {@code null}
   * @return the last field of the line when split on runs of tabs, or
   *         {@code null} when the line does not start with {@code "["}
   */
  private String getText(String line) {
    if (line.startsWith("[")) {
      String[] parts = line.split("\\t+");
      return parts[parts.length - 1];
    }
    return null;
  }

  /**
   * Renders a node and, recursively, all of its elements, indenting each
   * entry with one tab per level.
   *
   * @param root the node to render
   * @return the multi-line rendering, terminated by a closing brace line
   */
  private static String printTree(Node root) {
    StringBuilder sb = new StringBuilder();

    // print itself and its children
    for (int i = 0; i < root.getLevel(); i++) {
      sb.append("\t");
    }
    sb.append(root.getSyntacticTag());
    sb.append("{");
    if (root.getMorphologicalTag() != null) {
      sb.append(root.getMorphologicalTag());
    }
    sb.append("\n");

    for (TreeElement element : root.getElements()) {
      sb.append(printTree(element));
    }

    sb.append("\n");
    for (int i = 0; i < root.getLevel(); i++) {
      sb.append("\t");
    }
    sb.append("}\n");
    return sb.toString();
  }

  /**
   * Renders a leaf as a single tab-indented line holding its lexeme, lemma
   * and morphological tag.
   *
   * @param leaf the leaf to render
   * @return the rendered line, terminated by a newline
   */
  private static String printTree(Leaf leaf) {
    StringBuilder sb = new StringBuilder();

    for (int i = 0; i < leaf.getLevel(); i++) {
      sb.append("\t");
    }
    // print the text
    sb.append(leaf.getLexeme());
    // print the lemma
    sb.append("\tlemma[" + leaf.getLemma() + "] ");
    // print the morphological tag
    sb.append("\ttag[" + leaf.getMorphologicalTag() + "] \n");
    return sb.toString();
  }

  /**
   * Dispatches to the {@link Node} or {@link Leaf} rendering; anything that
   * is not a {@code Node} is cast to {@code Leaf}.
   *
   * @param element the tree element to render
   * @return the rendering of the element
   */
  private static String printTree(TreeElement element) {
    if (element instanceof Node) {
      return printTree((Node) element);
    } else {
      return printTree((Leaf) element);
    }
  }

  /**
   * Serializes a morphological tag with the shared tag interpreter.
   *
   * @param tag the tag to serialize
   * @return the serialized form produced by the interpreter
   */
  private static String mtagToStr(MorphologicalTag tag) {
    return tagInterpreter.serialize(tag);
  }

  /**
   * Entry point.
   *
   * @param args exactly three values are used: the resources path, the
   *        input report file and the output file
   * @throws IOException if the report cannot be read or the output cannot be
   *         written
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 3) {
      // fail with a readable message instead of ArrayIndexOutOfBoundsException
      System.err.println("Usage: ProcessReport <resources> <report> <output>");
      System.exit(1);
    }

    System.out.println("Executing MultiCogroo ProcessReport...");
    System.out.println(" path: " + args[0]);
    System.out.println(" in: " + args[1]);
    System.out.println(" out: " + args[2]);

    ProcessReport pr = new ProcessReport(args[0], args[1], args[2]);
    pr.processFile();
  }
}