/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.classifier.maxent;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;

import org.slf4j.Logger;

/**
 * Streams {@link DocumentSample}s out of one or more text files.
 *
 * <p>Each line of an input file is expected to have the form
 * {@code category<TAB>document text}. The text before the first tab becomes
 * the sample's category; everything after it is tokenized with the configured
 * {@link Tokenizer} and becomes the sample's tokens. Files are read one after
 * another in array order.
 */
public class CategoryDataStream implements ObjectStream<DocumentSample> {

  public static final Logger log =
      org.slf4j.LoggerFactory.getLogger(CategoryDataStream.class);

  /** Files to read, in order. */
  File[] inputFiles;
  /** Index into {@link #inputFiles} of the file currently being read. */
  int inputFilesIndex = 0;
  /** Character encoding name used to decode every input file. */
  String encoding;
  /** Reader over the current file, or null when no file is open. */
  BufferedReader reader;
  Tokenizer tokenizer;
  /** Buffered, not yet consumed line; null when nothing is buffered. */
  String line;
  /** File most recently opened; used in read-error messages. */
  File currentFile;

  /**
   * Creates a stream over the given files, decoding them as UTF-8.
   *
   * @param inputFiles files to read, in order.
   * @param tokenizer tokenizer for document text; if null,
   *     {@link SimpleTokenizer#INSTANCE} is used.
   */
  public CategoryDataStream(File[] inputFiles, Tokenizer tokenizer) {
    this(inputFiles, "UTF-8", tokenizer);
  }

  /**
   * Creates a stream over the given files.
   *
   * @param inputFiles files to read, in order.
   * @param encoding name of the character encoding of the files.
   * @param tokenizer tokenizer for document text; if null,
   *     {@link SimpleTokenizer#INSTANCE} is used.
   */
  public CategoryDataStream(File[] inputFiles, String encoding, Tokenizer tokenizer) {
    this.inputFiles = inputFiles;
    this.encoding = encoding;
    if (tokenizer == null) {
      this.tokenizer = SimpleTokenizer.INSTANCE;
    } else {
      this.tokenizer = tokenizer;
    }
  }

  /**
   * Creates a stream over a single file, decoding it as UTF-8.
   *
   * @param fileName path of the file to read.
   * @param tokenizer tokenizer for document text; if null,
   *     {@link SimpleTokenizer#INSTANCE} is used.
   */
  public CategoryDataStream(String fileName, Tokenizer tokenizer) {
    this(fileName, "UTF-8", tokenizer);
  }

  /**
   * Creates a stream over a single file.
   *
   * @param fileName path of the file to read.
   * @param encoding name of the character encoding of the file.
   * @param tokenizer tokenizer for document text; if null,
   *     {@link SimpleTokenizer#INSTANCE} is used.
   */
  public CategoryDataStream(String fileName, String encoding, Tokenizer tokenizer) {
    this(new File[] { new File(fileName) }, encoding, tokenizer);
  }

  /**
   * Rewinds the stream so the next read starts again at the first line of the
   * first input file.
   */
  public void reset() {
    close();
    // Drop the closed reader; otherwise getNextLine() would keep calling
    // readLine() on it and fail instead of reopening the first file.
    reader = null;
    line = null;
    inputFilesIndex = 0;
  }

  /**
   * Closes the reader over the current file, if one is open. An IOException
   * raised while closing is logged and not propagated.
   */
  public void close() {
    closeReader();
  }

  /** Closes the current reader if there is one, logging any IOException. */
  private void closeReader() {
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException ex) {
        log.warn("IOException on close", ex);
      }
    }
  }

  /** Opens a buffered reader over the given file using the configured encoding. */
  private BufferedReader openReader(File file) throws IOException {
    FileInputStream in = new FileInputStream(file);
    try {
      return new BufferedReader(new InputStreamReader(in, encoding));
    } catch (IOException e) {
      // e.g. unsupported encoding name: don't leak the stream opened above.
      try {
        in.close();
      } catch (IOException closeEx) {
        log.warn("IOException on close", closeEx);
      }
      throw e;
    }
  }

  /** Set the current buffered line to null, and attempt to obtain the next
   *  line of training data, if we run out of lines in one file, move on to
   *  the next. If we are out of files, line will remain null when this
   *  method returns.
   *
   *  @throws RuntimeException if there's a problem reading any of the input
   *  files.
   */
  protected void getNextLine() {
    line = null;
    try {
      while (line == null) {
        if (reader == null) {
          // no more files to read;
          if (inputFilesIndex >= inputFiles.length) break;
          // open the next file.
          currentFile = inputFiles[inputFilesIndex];
          reader = openReader(currentFile);
        }

        line = reader.readLine();
        if (line == null) {
          // done with this reader: release its file handle, then move to
          // the next file.
          closeReader();
          reader = null;
          inputFilesIndex++;
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error reading input from: " + currentFile, e);
    }
  }

  /**
   * Reports whether another line of training data is available. If no line
   * is currently buffered, one is fetched; a line that is already buffered is
   * kept, so calling this method repeatedly does not skip input.
   *
   * @return true if a line is buffered and waiting for {@link #read()}.
   * @throws RuntimeException if there's a problem reading any of the input
   *     files.
   */
  public boolean hasNext() {
    if (line == null) {
      getNextLine();
    }
    return line != null;
  }

  //<start id="maxent.examples.train.event"/>
  /**
   * Returns the next training sample, or null when all input is exhausted.
   *
   * @return a sample whose category is the text before the first tab of the
   *     line and whose tokens come from tokenizing the text after it.
   * @throws RuntimeException if the line contains no tab character, or if
   *     there's a problem reading any of the input files.
   */
  public DocumentSample read() {
    if (line == null && !hasNext()) { //<co id="mee.train.read"/>
      return null;
    }
    int split = line.indexOf('\t'); //<co id="mee.train.cat"/>
    if (split < 0)
      throw new RuntimeException("Invalid line in "
          + inputFiles[inputFilesIndex]);
    String category = line.substring(0,split);
    String document = line.substring(split+1);
    line = null; // mark line as consumed
    String[] tokens = tokenizer.tokenize(document); //<co id="mee.train.tok"/>
    return new DocumentSample(category, tokens); //<co id="mee.train.sample"/>
  }
  /*<calloutlist>
  <callout arearefs="mee.train.read">Read a line of training data</callout>
  <callout arearefs="mee.train.cat">Extract category</callout>
  <callout arearefs="mee.train.tok">Tokenize content</callout>
  <callout arearefs="mee.train.sample">Create sample</callout>
  </calloutlist>*/
  //<end id="maxent.examples.train.event"/>
}