//
// Copyright (C) 2004-2006 - Mirko Nasato <mirko@artofsolving.com>
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
// http://www.gnu.org/copyleft/lesser.html
//
// JOOConverter - The Open Source Java/OpenOffice Document Converter
//
package org.nuxeo.ecm.platform.convert.tests;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Static helpers for reading the textual content of converted documents.
 */
public final class DocumentUTUtils {

    /** Code point of the Unicode byte order mark (U+FEFF). */
    private static final int BYTE_ORDER_MARK_CHAR = 0xFEFF;

    /** Size, in chars, of the buffer used when reading text files. */
    private static final int BUFFER_SIZE = 2048;

    // Utility class: it only exposes static helpers, so instantiation is
    // prevented.
    private DocumentUTUtils() {
    }

    /**
     * Extracts the text from a PDF file.
     *
     * @param pdfFile the PDF file to read
     * @return the document content as plain text, with leading and trailing
     *         whitespace removed
     * @throws IOException if the file cannot be loaded or its text cannot be
     *             extracted
     */
    public static String readPdfText(File pdfFile) throws IOException {
        PDFTextStripper textStripper = new PDFTextStripper();
        PDDocument document = PDDocument.load(pdfFile);
        try {
            return textStripper.getText(document).trim();
        } finally {
            // Release the document even when text extraction fails.
            document.close();
        }
    }

    /**
     * Reads a whole file as UTF-8 text.
     * <p>
     * A leading byte order mark, if present, is dropped, and the result is
     * trimmed of leading and trailing whitespace.
     *
     * @param file the file to read
     * @return the trimmed content of the file
     * @throws IOException if the file cannot be opened or read
     */
    public static String readContent(File file) throws IOException {
        StringWriter writer = new StringWriter();
        FileInputStream input = new FileInputStream(file);
        try {
            InputStreamReader reader = new InputStreamReader(input, "UTF-8");
            char[] buffer = new char[BUFFER_SIZE];
            int length;
            while ((length = reader.read(buffer, 0, buffer.length)) != -1) {
                writer.write(buffer, 0, length);
            }
        } finally {
            // Closing the underlying stream releases the file handle on both
            // the success and the failure path.
            input.close();
        }
        String content = stripByteOrderMarkChar(writer.toString());
        return content.trim();
    }

    /**
     * Removes a leading byte order mark from the given text.
     * <p>
     * When a mark is found, the remaining text is also trimmed; otherwise the
     * text is returned unchanged.
     *
     * @param content the text to inspect
     * @return the text without its leading byte order mark
     */
    private static String stripByteOrderMarkChar(String content) {
        if (content.length() > 0 && content.charAt(0) == BYTE_ORDER_MARK_CHAR) {
            return content.substring(1).trim();
        }
        return content;
    }
}