/*
* Copyright 2013 Eolya Consulting - http://www.eolya.fr/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package fr.eolya.extraction.tika;
import java.io.InputStream;
import java.util.Map;
import org.junit.Test;
import fr.eolya.extraction.htmlformater.HtmlToPlaintTextSimple;
import fr.eolya.extraction.htmlformater.IHtmlFormater;
import junit.framework.TestCase;
/**
 * Smoke tests for {@link TikaWrapper}.
 *
 * <p>Each test loads a sample document from the classpath, runs it through a
 * {@code TikaWrapper} and dumps the extracted metadata and text to standard
 * output. The tests make no assertion on the extracted content itself; they
 * fail only when a resource is missing or when the wrapper throws.
 *
 * <p>Exceptions are deliberately propagated to the test runner instead of being
 * caught and printed, so that an extraction failure is reported as a test
 * error rather than silently passing.
 */
public class TikaWrapperTest extends TestCase {

    // NOTE(review): these look like locations of external command-line tools on
    // the developer's machine; they are passed verbatim to TikaWrapper - confirm
    // they exist in the environment where the tests run.
    static final String pdfToTextPath = "/usr/local/bin/pdftotext";
    static final String swfToHtmlPath = "/opt/tools/swf2html";
    //static final String djVuTextPath = "/opt/tools/djvu/bin/djvutxt";

    /**
     * Processes a PDF with text output, without specifying a content type.
     *
     * @throws Exception if the wrapper cannot be created or processing fails
     */
    @Test
    public void testPdfTika() throws Exception {
        boolean content = true;
        boolean verbose = true;
        String format = TikaWrapper.OUTPUT_FORMAT_TEXT;
        String fileName = "/fr/eolya/extraction/doc/java.pdf";

        InputStream in = openResource(fileName);
        try {
            TikaWrapper mfte = new TikaWrapper(format);
            mfte.process(in);
            dumpDoc(mfte, fileName, content, verbose);
        } finally {
            in.close();
        }
    }

    /**
     * Processes a PDF with HTML output, with {@link #pdfToTextPath} configured
     * and the PDF content type given explicitly.
     *
     * @throws Exception if the wrapper cannot be created or processing fails
     */
    @Test
    public void testPdfNoTika() throws Exception {
        boolean content = true;
        boolean verbose = true;
        String format = TikaWrapper.OUTPUT_FORMAT_HTML;
        String fileName = "../doc/java.pdf";

        InputStream in = openResource(fileName);
        try {
            TikaWrapper mfte = new TikaWrapper(format);
            mfte.setPdfToTextPath(pdfToTextPath);
            mfte.process(in, TikaWrapper.CONTENT_TYPE_PDF);
            dumpDoc(mfte, fileName, content, verbose);
        } finally {
            in.close();
        }
    }

    /**
     * Processes an SWF file with text output, with {@link #swfToHtmlPath}
     * configured, an {@link HtmlToPlaintTextSimple} formater installed and the
     * SWF content type given explicitly.
     *
     * @throws Exception if the wrapper cannot be created or processing fails
     */
    @Test
    public void testSwfNoTika() throws Exception {
        boolean content = true;
        boolean verbose = true;
        String format = TikaWrapper.OUTPUT_FORMAT_TEXT;
        String fileName = "../doc/reflection.swf";

        InputStream in = openResource(fileName);
        try {
            TikaWrapper mfte = new TikaWrapper(format);
            mfte.setSwfToHtmlPath(swfToHtmlPath);
            IHtmlFormater formater = new HtmlToPlaintTextSimple();
            mfte.setHtmlFormater(formater);
            mfte.process(in, TikaWrapper.CONTENT_TYPE_SWF);
            dumpDoc(mfte, fileName, content, verbose);
        } finally {
            in.close();
        }
    }

    /**
     * Processes three PDF files one after another with the same
     * {@code TikaWrapper} instance, dumping the result after each one.
     *
     * @throws Exception if the wrapper cannot be created or processing fails
     */
    @Test
    public void testSeveral() throws Exception {
        boolean content = true;
        boolean verbose = true;
        String format = TikaWrapper.OUTPUT_FORMAT_TEXT;
        String[] fileNames = { "../doc/java.pdf", "../doc/fr.pdf", "../doc/en.pdf" };

        // One wrapper is shared across all documents on purpose: the point of
        // this test is to reuse the same instance for several process() calls.
        TikaWrapper mfte = new TikaWrapper(format);
        for (int n = 0; n < fileNames.length; n++) {
            String fileName = fileNames[n];
            InputStream in = openResource(fileName);
            try {
                mfte.process(in);
                dumpDoc(mfte, fileName, content, verbose);
            } finally {
                in.close();
            }
        }
    }

    // Disabled DjVu test, kept for reference.
    // @Test
    // public void testDjVu() {
    //
    // // more files for tests
    // // ftp://vpa.users.odessa.comstar.net.ua/public/Sci_Library/Phys%20Library/PPop_Popular-level/
    //
    // boolean content = true;
    // boolean verbose = true;
    // String format = TikaWrapper.OUTPUT_FORMAT_TEXT;
    // String fileName = "../doc/Test.djvu";
    //
    // InputStream i = getClass().getResourceAsStream(fileName);
    //
    // TikaWrapper mfte;
    // try {
    // mfte = new TikaWrapper(format);
    // mfte.setDjVuTextPath(djVuTextPath);
    //
    // IHtmlFormater formater = new HtmlToPlaintTextSimple();
    // mfte.setHtmlFormater(formater);
    //
    // mfte.process(i, TikaWrapper.CONTENT_TYPE_DJVU);
    // dumpDoc(mfte, fileName, content, verbose);
    // } catch (Exception e) {
    // e.printStackTrace();
    // }
    // }

    /**
     * Opens a classpath resource relative to this test class and fails the
     * current test if it cannot be found.
     *
     * @param fileName resource name as accepted by
     *                 {@link Class#getResourceAsStream(String)}
     * @return an open stream on the resource, never {@code null}; the caller
     *         is responsible for closing it
     */
    private InputStream openResource(String fileName) {
        InputStream in = getClass().getResourceAsStream(fileName);
        assertNotNull("Test resource not found: " + fileName, in);
        return in;
    }

    /**
     * Prints the result of the last extraction to standard output.
     *
     * @param mfte    wrapper holding the extraction result
     * @param url     label printed to identify the document
     * @param content when {@code true}, the extracted text is printed if it is
     *                not {@code null}
     * @param verbose when {@code true}, every entry of the metadata map is
     *                printed if the map is not {@code null}
     */
    private static void dumpDoc(TikaWrapper mfte, String url, boolean content, boolean verbose) {
        System.out.println("========================================================");
        System.out.println("url: " + url);
        System.out.println("Title: " + mfte.getMetaTitle());
        System.out.println("Author: " + mfte.getMetaAuthor());
        System.out.println("Created: " + mfte.getMetaCreated());
        System.out.println("Modified: " + mfte.getMetaModified());
        System.out.println("Content-Type: " + mfte.getMetaContentType());
        System.out.println("CharSet: " + mfte.getMetaCharSet());

        if (verbose && mfte.getMetas() != null) {
            System.out.println("========================================================");
            for (Map.Entry<String, String> entry : mfte.getMetas().entrySet()) {
                System.out.println(entry.getKey() + ": " + entry.getValue());
            }
        }

        if (content && mfte.getText() != null) {
            System.out.println("========================================================");
            System.out.println(mfte.getText());
        }

        System.out.println("\n\n");
    }
}