/**
* Licensed to The Apereo Foundation under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
*
* The Apereo Foundation licenses this file to you under the Educational
* Community License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License
* at:
*
* http://opensource.org/licenses/ecl2.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.opencastproject.textextractor.tesseract;
import org.opencastproject.textextractor.api.TextFrame;
import org.opencastproject.textextractor.api.TextLine;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
/**
 * This class represents a tesseract output frame that holds a number of lines found on an image. Note that Tesseract
 * does not include positioning or other information with the text output.
 */
public class TesseractTextFrame implements TextFrame {

  /** Lines found on an output frame, in the order in which they were read */
  protected ArrayList<TextLine> lines = new ArrayList<TextLine>();

  /**
   * Parses the tesseract output and extracts the text information contained therein. Every line read from the stream,
   * including empty ones, is added to the returned frame as one {@link TesseractLine}.
   * <p>
   * The stream is decoded as UTF-8. This method does not close the stream; closing it remains the responsibility of
   * the caller.
   *
   * @param is
   *          the input stream
   * @return the tesseract text information
   * @throws IOException
   *           if reading the tesseract output fails
   */
  public static TextFrame parse(InputStream is) throws IOException {
    // The reader is not closed here, since closing it would also close the caller's stream.
    BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    TesseractTextFrame textFrame = new TesseractTextFrame();
    String line;
    while ((line = in.readLine()) != null) {
      textFrame.lines.add(new TesseractLine(line));
    }
    return textFrame;
  }

  /**
   * {@inheritDoc}
   * <p>
   * This implementation reports text as soon as at least one line has been recorded; the content of the lines is not
   * inspected.
   *
   * @see org.opencastproject.textextractor.api.TextFrame#hasText()
   */
  @Override
  public boolean hasText() {
    return !lines.isEmpty();
  }

  /**
   * {@inheritDoc}
   * <p>
   * The returned array is a fresh copy; modifying it does not affect this frame.
   *
   * @see org.opencastproject.textextractor.api.TextFrame#getLines()
   */
  @Override
  public TextLine[] getLines() {
    // Allocate the array with the list's element type: the list is declared as holding any TextLine, so copying it
    // into a TesseractLine[] would throw an ArrayStoreException if a subclass stored another implementation.
    return lines.toArray(new TextLine[lines.size()]);
  }

}