/**
* This package contains several test classes.
*/
package org.bbaw.wsp.cms.dochandler.parser.test;
import org.bbaw.wsp.cms.dochandler.parser.document.IDocument;
import org.bbaw.wsp.cms.dochandler.parser.document.PdfDocument;
import org.bbaw.wsp.cms.dochandler.parser.text.parser.DocumentParser;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
/**
* Test of the {@link DocumentParser} which saves the fulltexts and maybe
* extracted metadata in an {@link IDocument}.
*
* @author Sascha Feldmann (wsp-shk1)
*
*/
public class DocumentParserTest {
private static final String EXAMPLE_EDOC_URL = "http://edoc.bbaw.de/volltexte/2010/1486/index.html";
private static final String EXAMPLE_EDOC_HTTP_URL = "http://edoc.bbaw.de/volltexte/2010/1486/pdf/Heft_17.pdf";
private static final String EXAMPLE_PDF_URL = "C:/Dokumente und Einstellungen/wsp-shk1/Eigene Dateien/opus32_bbaw_volltexte_20120607/volltexte/2010/1314/pdf/DOKUMENTATION_Symposium_Wissenschaft_und_Wiedervereinigung.pdf";
private static final String EULER_DOCS_FOLDER = "C:/Dokumente und Einstellungen/wsp-shk1/Eigene Dateien/ParserTest/TestDokumente/Vorhaben/Euler";
public static void main(String[] args) throws UnsupportedEncodingException, ApplicationException {
DocumentParser parser = new DocumentParser();
// testEDoc(parser);
// testPdf(parser);
// testEDocHttp(parser);
// allFormatTest(parser);
testEulerDocs(parser);
}
/*
* Test of all euler.bbaw.de *.docs.
*/
private static void testEulerDocs(DocumentParser parser) throws ApplicationException {
File docFolder = new File(EULER_DOCS_FOLDER);
for (File f : docFolder.listFiles()) {
System.out.println("Parsing "+f);
IDocument documentModel = parser.parse(EXAMPLE_EDOC_URL);
System.out.println("DocumentModel: \n\n" + documentModel);
System.out.println("-----------------------\n\n\n");
}
}
/*
* Test of an eDoc.
*/
public static void testEDoc(DocumentParser parser) throws UnsupportedEncodingException {
try {
IDocument documentModel = parser.parse(EXAMPLE_EDOC_URL);
System.out.println("DocumentModel: \n\n" + documentModel);
} catch (ApplicationException e) {
System.out.println(e);
}
}
/*
* Test of an eDoc via HTTP.
*/
public static void testEDocHttp(DocumentParser parser) {
try {
IDocument documentModel = parser.parse(EXAMPLE_EDOC_HTTP_URL);
System.out.println("DocumentModel: \n\n" + documentModel);
} catch (ApplicationException e) {
System.out.println(e);
}
}
/*
* Test of a "normal" pdf (not an eDoc).
*/
public static void testPdf(DocumentParser parser) throws ApplicationException {
IDocument documentModel = parser.parse(EXAMPLE_PDF_URL);
// Test Ligatur:
PdfDocument pdfDoc = (PdfDocument) documentModel;
String result = pdfDoc.getTextOrig();
System.out.println(result);
}
public static void allFormatTest(DocumentParser parser) throws ApplicationException {
List<String> urls = new ArrayList<String>();
// urls.add("//192.168.1.203/wsp-web-test/090210_Konzept.pdf");
urls.add("//192.168.1.203/wsp-web-test/Czmiel_Juergens_proposal_DH2012_TheAcademysDigitalStoreOfKnowledge.doc");
// urls.add("//192.168.1.203/wsp-web-test/Czmiel_Juergens_proposal_DH2012_TheAcademysDigitalStoreOfKnowledge.odt");
// urls.add("//192.168.1.203/wsp-web-test/Czmiel_Juergens_proposal_DH2012_TheAcademysDigitalStoreOfKnowledge.pdf");
// urls.add("//192.168.1.203/wsp-web-test/DH2012ReviewCzmielJuergens.txt");
for (String url : urls) {
IDocument doc = parser.parse(url);
System.out.println("Parsed a document");
System.out.println("DocumentType: " + doc.getURL());
System.out.println("Fulltext:\n\n################\n" + doc.getTextOrig());
System.out.println("\n#################\n\n");
}
}
}