/**
*
* Copyright 2009-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
package org.opensextant.xtext.converters;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
/**
 * Default conversion is almost a pass through from Tika's auto parser and BodyContentHandler.
 * Encoding, author, create date and title are saved to ConvertedDoc. The text of the document
 * is stripped of extra blank lines, except for spreadsheets, whose text is only trimmed.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public class DefaultConverter extends ConverterAdapter {

    /** Default limit passed to the content handler: 1 MB of text from a given document. */
    public static final int MAX_TEXT_SIZE = 0x100000;

    private final Detector detector = new DefaultDetector();
    private final Parser parser = new AutoDetectParser(detector);
    private final ParseContext ctx = new ParseContext();

    /** Limit handed to each BodyContentHandler created by this converter. */
    private final int maxBuffer;

    /**
     * Creates a converter that uses {@link #MAX_TEXT_SIZE} as its text limit.
     */
    public DefaultConverter() {
        this(MAX_TEXT_SIZE);
    }

    /**
     * Creates a converter with a caller-chosen text limit.
     *
     * @param sz limit passed unchanged to the BodyContentHandler constructor
     */
    public DefaultConverter(int sz) {
        maxBuffer = sz;
        // Register this converter's parser in the parse context used for every conversion.
        ctx.set(Parser.class, parser);
    }

    /**
     * Common implementation -- take an input stream and return a ConvertedDoc.
     * The stream is always closed before this method returns or throws.
     *
     * @param input stream for raw file
     * @param doc raw file
     * @return converted doc carrying title, encoding, create date, author and text
     * @throws IOException if the underlying Tika parser/writer had an IO problem, a parser
     *                     problem, or the text limit is reached; also if closing the stream
     *                     fails after an otherwise successful parse.
     */
    @Override
    protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
            throws IOException {
        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler(maxBuffer);

        // Tracks whether parsing completed, so a failure while closing the stream
        // never replaces the more informative parse failure.
        boolean parsed = false;
        try {
            parser.parse(input, handler, metadata, ctx);
            parsed = true;
        } catch (NoClassDefFoundError classErr) {
            throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
        } catch (Exception xerr) {
            throw new IOException("Unable to parse content", xerr);
        } finally {
            if (input != null) {
                try {
                    input.close();
                } catch (IOException closeErr) {
                    if (parsed) {
                        throw closeErr;
                    }
                    // Parsing already failed: deliberately ignore the close failure so the
                    // parse error, which is the root cause, is the one the caller sees.
                }
            }
        }

        ConvertedDocument textdoc = new ConvertedDocument(doc);
        textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
        textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
        textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
        textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));

        // Historical note (v1.5): the blank line reducer used to be applied to every document.
        // Using Java6 it appeared to cause StackOverflow when it encountered a document with
        // hundreds of \n in a row, e.g., a spreadsheet converted to text may have thousands of
        // empty lines following the last data row. Spreadsheet text is therefore only trimmed;
        // all other documents still go through TextUtils.reduce_line_breaks.
        String t = handler.toString();
        if (t != null) {
            if (textdoc.filename != null && FileUtility.isSpreadsheet(textdoc.filename)) {
                textdoc.setText(t.trim());
            } else {
                textdoc.setText(TextUtils.reduce_line_breaks(t));
            }
        }

        return textdoc;
    }
}