package org.bbaw.wsp.cms.dochandler.parser.text.parser;
import org.bbaw.wsp.cms.dochandler.parser.document.IDocument;
import org.bbaw.wsp.cms.dochandler.parser.document.PdfDocument;
import org.bbaw.wsp.cms.document.MetadataRecord;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
/**
* This class parses an eDoc. An eDoc consists of a basic pdf file and an
* index.html file which contains the associated metadata. It's represented by
* the folder structure: [year] / [eDocID] - index.html - /pdf/[eDoc.pdf]
*
* @author Sascha Feldmann (wsp-shk1)
* @date 15.08.2012
*
*/
public class EdocParserImpl extends HtmlParserImpl {
  /** Shared instance; stays {@code null} until the first {@link #getInstance()} call. */
  private static EdocParserImpl instance;

  /**
   * Return the shared parser instance, creating it on the first call.
   *
   * <p>The created instance is stored in the static field so that every later
   * call returns the same object. The lazy initialization is not synchronized.
   *
   * @return the shared {@code EdocParserImpl} instance, never {@code null}
   */
  public static EdocParserImpl getInstance() {
    if (instance == null) {
      // Remember the new instance; otherwise each call would build another parser.
      instance = new EdocParserImpl();
    }
    return instance;
  }

  /** Private so that instances are only obtained through {@link #getInstance()}. */
  private EdocParserImpl() {
    super();
  }

  /**
   * Parse an eDoc: first the index.html at {@code uri}, then the PDF it
   * references.
   *
   * <p>The index is parsed by the superclass. If that result is an
   * {@link IDocument}, the metadata is fetched from the index, the PDF found
   * under the metadata's real document URL is parsed and the metadata is
   * attached to the resulting {@link PdfDocument}.
   *
   * @param startUri
   *          the URI where the harvesting was started.
   * @param uri
   *          the URI to the eDoc's index.html (which contains the reference to
   *          the eDoc).
   *
   * @return the parsed {@link PdfDocument} with the fetched metadata set, or
   *         {@code null} if the parsed index is not an {@link IDocument} or
   *         the parsed eDoc is not a {@link PdfDocument}
   * @throws ApplicationException
   *           if the metadata contains no URL for the eDoc, or if one of the
   *           delegated parse/fetch calls fails
   */
  public Object parse(final String startUri, final String uri) throws ApplicationException {
    // Parse the eDoc index first; without a usable index there is nothing to do.
    final Object parsedDocIndex = super.parse(startUri, uri);
    if (!(parsedDocIndex instanceof IDocument)) {
      return null;
    }

    // The index.html carries the metadata, including the URL of the actual PDF.
    final MetadataRecord metadata = new MetadataRecord();
    EdocIndexMetadataFetcherTool.fetchHtmlDirectly(uri, metadata);
    final String eDocUrl = metadata.getRealDocUrl();
    if (eDocUrl == null) {
      throw new ApplicationException("Couldn't fetch the eDoc's URL from the file: "+uri);
    }

    // Parse the eDoc itself and attach the metadata taken from the index.
    final Object parsedEDoc = PdfParserImpl.getInstance().parse(startUri, eDocUrl);
    if (parsedEDoc instanceof PdfDocument) {
      final PdfDocument parsedPdf = (PdfDocument) parsedEDoc;
      parsedPdf.setMetadata(metadata);
      return parsedPdf;
    }
    return null;
  }
}