/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cogroo.uima.readers;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADSentenceStream;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import cogroo.uima.readers.entities.Paragraph;
import cogroo.uima.readers.entities.SentenceEx;
import cogroo.uima.readers.entities.Text;

/**
 * Groups the sentences produced by an {@link ADSentenceStream} into
 * {@link Text} objects, each holding a list of {@link Paragraph}s.
 * <p>
 * The text and paragraph a sentence belongs to are parsed from the sentence
 * metadata, which must look like
 * {@code <letters><textId>-<digits>[wordChar] p=<paragraphId>...}. A new
 * paragraph starts when the paragraph id or the text id changes; a new text
 * starts when the text id changes.
 * <p>
 * The reader always holds one look-ahead sentence so that it can detect the
 * boundary of the group it is currently building.
 */
public class Reader implements ObjectStream<Text> {

  /**
   * Metadata layout: group 1 is the text id, group 3 the paragraph id. Group 2
   * (presumably the sentence number) is matched but not used.
   */
  private static final Pattern META_PATTERN = Pattern
      .compile("^[a-zA-Z]+(\\d+)-(\\d+)\\w?\\s+p=(\\d+).*");

  private final ADSentenceStream sentenceStream;

  /** Text id of the look-ahead sentence, or -1 before the first sentence. */
  private int text = -1;

  /** Paragraph id of the look-ahead sentence, or -1 before the first sentence. */
  private int para = -1;

  /** Whether the look-ahead sentence has the same text id as the one before it. */
  private boolean isSameText;

  /** Whether the look-ahead sentence continues the paragraph of the one before it. */
  private boolean isSamePara;

  /** Look-ahead sentence; {@code null} once the underlying stream is exhausted. */
  private Sentence sent;

  /**
   * Creates a reader over the given corpus and loads the first sentence.
   *
   * @param in
   *          the corpus input stream
   * @param charset
   *          the name of the charset used to decode {@code in}
   * @throws IOException
   *           if the first sentence cannot be read
   */
  public Reader(InputStream in, String charset) throws IOException {
    this.sentenceStream = new ADSentenceStream(new PlainTextByLineStream(in,
        charset));
    prime();
  }

  /**
   * Returns the next text, or {@code null} when there are no more sentences.
   *
   * @return the next {@link Text} with its paragraphs and sentences
   * @throws IOException
   *           if the underlying stream fails
   * @throws RuntimeException
   *           if a sentence carries metadata that does not match the expected
   *           layout
   */
  public Text read() throws IOException {
    if (sent == null) {
      return null;
    }

    int thisText = text;
    List<Paragraph> paragraphs = new ArrayList<Paragraph>();
    do {
      int thisPara = para;
      List<SentenceEx> sentences = new ArrayList<SentenceEx>();
      do {
        sentences.add(new SentenceEx(sent));
        // Advance the look-ahead; updateMeta() tells us whether it still
        // belongs to the paragraph / text being built.
        sent = sentenceStream.read();
        updateMeta();
      } while (isSamePara);
      paragraphs.add(new Paragraph(sentences, thisPara));
    } while (isSameText);

    return new Text(paragraphs, thisText);
  }

  /**
   * Resets the underlying stream and reloads the look-ahead sentence, so that
   * the next {@link #read()} returns the first text again.
   */
  public void reset() throws IOException, UnsupportedOperationException {
    this.sentenceStream.reset();
    // Without re-priming, 'sent' would still be the stale (possibly null)
    // look-ahead from before the reset.
    prime();
  }

  /** Closes the underlying sentence stream. */
  public void close() throws IOException {
    this.sentenceStream.close();
  }

  /** Clears the ids and loads the first sentence of the stream as look-ahead. */
  private void prime() throws IOException {
    text = -1;
    para = -1;
    sent = sentenceStream.read();
    updateMeta();
  }

  /**
   * Parses the metadata of the look-ahead sentence and updates the ids and the
   * same-text / same-paragraph flags relative to the previous sentence.
   */
  private void updateMeta() {
    if (this.sent == null) {
      isSameText = false;
      isSamePara = false;
      return;
    }

    String meta = this.sent.getMetadata();
    Matcher m = (meta == null) ? null : META_PATTERN.matcher(meta);
    if (m == null || !m.matches()) {
      throw new RuntimeException("Invalid metadata: " + meta);
    }
    int currentText = Integer.parseInt(m.group(1));
    int currentPara = Integer.parseInt(m.group(3));

    isSameText = (currentText == text);
    // A paragraph never spans two texts: an equal paragraph number in a
    // different text is a new paragraph, otherwise read() would merge the
    // next text into the current one.
    isSamePara = isSameText && (currentPara == para);

    text = currentText;
    para = currentPara;
  }

  /**
   * Reads a corpus and writes its sentence texts to a file, closing both the
   * reader and the writer even when an error occurs.
   *
   * @param inputPath
   *          the corpus file to read
   * @param charset
   *          the name of the charset of the corpus file
   * @param outputPath
   *          the file to write (platform default charset)
   * @param onlyWithErrors
   *          if {@code true}, writes one line per sentence that has grammar
   *          errors and nothing else; if {@code false}, writes every sentence,
   *          one line per paragraph, with two extra line breaks after each text
   */
  private static void export(String inputPath, String charset,
      String outputPath, boolean onlyWithErrors) throws IOException {
    FileInputStream in = new FileInputStream(inputPath);
    Reader reader = null;
    BufferedWriter out = null;
    try {
      reader = new Reader(in, charset);
      out = new BufferedWriter(new FileWriter(new File(outputPath)));

      for (Text t = reader.read(); t != null; t = reader.read()) {
        for (Paragraph p : t.getParagraphs()) {
          for (SentenceEx s : p.getSentences()) {
            if (!onlyWithErrors) {
              out.append(s.getSentence().getText() + " ");
            } else if (s.getGrammarErrors().size() > 0) {
              out.append(s.getSentence().getText() + " ");
              out.append("\n");
            }
          }
          if (!onlyWithErrors) {
            out.append("\n");
          }
        }
        if (!onlyWithErrors) {
          out.append("\n\n");
        }
      }
    } finally {
      try {
        if (out != null) {
          out.close();
        }
      } finally {
        if (reader != null) {
          reader.close();
        } else {
          // The Reader constructor failed, so nothing else owns the stream.
          in.close();
        }
      }
    }
  }

  /**
   * Exports the Bosque corpus as plain text, one paragraph per line.
   *
   * @param args
   *          ignored
   * @throws IOException
   *           if reading or writing fails
   */
  public static void mainBosque(String[] args) throws IOException {
    export("/Users/wcolen/Documents/wrks/corpus/Bosque/Bosque_CF_8.0.ad.txt",
        "ISO-8859-1", "/Users/wcolen/Documents/wrks/corpus/bosque_texto.txt",
        false);
  }

  /**
   * Exports the Metro corpus as plain text, one paragraph per line.
   *
   * @param args
   *          ignored
   * @throws IOException
   *           if reading or writing fails
   */
  public static void mainMetro(String[] args) throws IOException {
    export("/Users/wcolen/Documents/wrks/corpus/Metro/Metro.txt", "UTF-8",
        "/Users/wcolen/Documents/wrks/corpus/metro_texto.txt", false);
  }

  /**
   * Exports the sentences of the Comunidade corpus that have grammar errors,
   * one sentence per line.
   *
   * @param args
   *          ignored
   * @throws IOException
   *           if reading or writing fails
   */
  public static void main(String[] args) throws IOException {
    export("/Users/wcolen/Documents/wrks/corpus/Comunidade/comunidade.txt",
        "UTF-8", "/Users/wcolen/Documents/wrks/corpus/comunidade_texto.txt",
        true);
  }
}