/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.imscwb.util; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.Queue; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; public class TextIterable implements Iterable<CorpusText>, Iterator<CorpusText> { private final static Pattern TITLE_PATTERN = Pattern.compile("\"(.+)\""); private final static int BUFFER_SIZE = 100; private final Queue<Resource> fileQueue; private BufferedReader reader; private final String encoding; private final Queue<CorpusText> texts; private Resource currentResource; public TextIterable(Collection<Resource> files, String encoding) { this.encoding = encoding; this.texts = new LinkedList<CorpusText>(); this.fileQueue = new LinkedList<Resource>(files); try { this.reader = getReader(); fillTextQueue(BUFFER_SIZE); } catch (IOException e) { throw new RuntimeException(e); } } public Resource getCurrentResource() { return currentResource; } @Override public Iterator<CorpusText> iterator() { return this; } @Override public boolean hasNext() { if (texts.isEmpty()) { try { fillTextQueue(BUFFER_SIZE); } catch (IOException e) { e.printStackTrace(); } } return !texts.isEmpty(); } @Override public CorpusText next(){ return texts.poll(); } @Override public void remove() { throw new UnsupportedOperationException(); } private void fillTextQueue(int bufferSize) throws IOException { // check if reader is still valid if (reader == null) { return; } String line; boolean insideSentence = false; CorpusText text = null; CorpusSentence currentSentence = null; while (texts.size() < bufferSize && (line = reader.readLine()) != null) { if (line.startsWith("<text ")) { String title = getTitle(line); text = new CorpusText(title); } if (line.equals("<s>")) { insideSentence = true; currentSentence = new CorpusSentence(); continue; } if (line.equals("</s>")) { if (text == null) { throw new IOException("Inside sentence without text."); } text.addSentence(currentSentence); currentSentence = null; insideSentence = false; } if (line.equals("</text>")) { texts.add(text); text = null; insideSentence = false; } if (insideSentence && currentSentence != null) { TabTokenizer tokenizer = new TabTokenizer(line); for (int i=0; i<3; i++) { if (!tokenizer.hasNext()) { throw new IOException("Ill-formed line: " + line); } switch (i) { case 0 : currentSentence.addToken(tokenizer.next()); break; case 1 : currentSentence.addPOS(tokenizer.next()); break; case 2 : currentSentence.addLemma(tokenizer.next()); break; } } } } if (!reader.ready()) { reader = getReader(); } } private String getTitle(String line) { Matcher m = TITLE_PATTERN.matcher(line); if (m.find()) { return line.substring(m.start(1), m.end(1)); } else { return ""; } } private BufferedReader getReader() throws FileNotFoundException, IOException { BufferedReader r = null; if (!fileQueue.isEmpty()) { currentResource = fileQueue.poll(); InputStream resolvedStream = CompressionUtils.getInputStream(currentResource.getPath(), currentResource.getInputStream()); r = new BufferedReader(new InputStreamReader(resolvedStream, encoding)); } return r; } }