/**
 * This package contains all fulltext parser classes, such as the parser classes for special
 * formats and strategy classes which handle the parsers' results.
 */
package org.bbaw.wsp.cms.dochandler.parser.text.parser;

import org.bbaw.wsp.cms.dochandler.parser.document.IDocument;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * Instances of this class parse heterogeneous documents including KOBV eDocs.
 *
 * <p>
 * The parser for a resource is selected by {@link #parse(String)}: eDoc index
 * URLs are recognized first, every other resource is dispatched by its file
 * extension (compared case-insensitively).
 *
 * @author Sascha Feldmann (wsp-shk1)
 * @date 28.08.2012
 *
 *       Last change: - isEdoc() now checks the eDoc via HTTP
 *
 */
public class DocumentParser {
  /**
   * PDF file extension.
   */
  public static final String EXT_PDF = ".pdf";
  /**
   * DOC file extension.
   */
  public static final String EXT_DOC = ".doc";
  /**
   * Open document text extension.
   */
  public static final String EXT_ODT = ".odt";
  /**
   * XML extension.
   */
  public static final String EXT_XML = ".xml";
  /**
   * JPG extension.
   */
  public static final String EXT_JPG = ".jpg";
  /**
   * TIFF extension.
   */
  public static final String EXT_TIFF = ".tiff";
  /**
   * PNG extension.
   */
  public static final String EXT_PNG = ".png";
  /**
   * HTML extension.
   */
  public static final String EXT_HTML = ".html";
  /**
   * XHTML extension.
   */
  public static final String EXT_XHTML = ".xhtml";
  /**
   * HTM extension.
   */
  public static final String EXT_HTM = ".htm";
  /**
   * TXT extension.
   */
  public static final String EXT_TXT = ".txt";

  /**
   * Strategy object created by the constructor. It is not read anywhere else in
   * this class; presumably it is meant for subclasses (TODO confirm).
   */
  protected DocumentModelStrategy documentModelBuilder;

  /**
   * Create a new DocumentParser instance. An instance will offer a
   * parse()-method which gets a URL to be parsed.
   */
  public DocumentParser() {
    this.documentModelBuilder = new DocumentModelStrategy();
  }

  /**
   * Parse any kind of document.
   *
   * <p>
   * eDoc index URLs (as decided by
   * {@code EdocIndexMetadataFetcherTool.isEDocIndex}) are handled by the eDoc
   * parser. All other URLs are dispatched by their extension, ignoring case, so
   * {@code report.PDF} is treated like {@code report.pdf}.
   *
   * @throws ApplicationException
   *           if there's no parser available for the type of resource. This
   *           includes URLs that carry no extension at all.
   * @param url
   *          - the URL to the document. Must not be {@code null}.
   * @return an {@link IDocument} containing the fulltext and maybe metadata for
   *         the parsed document.
   */
  public IDocument parse(final String url) throws ApplicationException {
    final ResourceParser parser;
    if (EdocIndexMetadataFetcherTool.isEDocIndex(url)) {
      parser = EdocParserImpl.getInstance();
    } else {
      // The extension is only looked at when the resource is not an eDoc.
      parser = selectParserByExtension(getExtension(url));
    }

    if (parser == null) {
      throw new ApplicationException("There's no parser available for this type of resource: " + getExtension(url));
    }
    // The first argument is passed as an empty string; its meaning is defined
    // by ResourceParser, which is not visible from this file.
    return (IDocument) parser.parse("", url);
  }

  /**
   * Map a file extension to the parser responsible for it.
   *
   * @param extension
   *          - the extension including the leading dot, compared ignoring case.
   * @return the matching parser or {@code null} if the extension is unknown.
   * @throws ApplicationException
   *           declared defensively because the parser factories are not visible
   *           from this file.
   */
  private ResourceParser selectParserByExtension(final String extension) throws ApplicationException {
    if (EXT_PDF.equalsIgnoreCase(extension)) {
      return PdfParserImpl.getInstance();
    }
    if (EXT_DOC.equalsIgnoreCase(extension)) {
      return DocParserImpl.getInstance();
    }
    if (EXT_ODT.equalsIgnoreCase(extension)) {
      return OdfParserImpl.getInstance();
    }
    if (EXT_XML.equalsIgnoreCase(extension)) {
      return XmlParserImpl.getInstance();
    }
    if (EXT_HTM.equalsIgnoreCase(extension) || EXT_HTML.equalsIgnoreCase(extension)
        || EXT_XHTML.equalsIgnoreCase(extension)) {
      return HtmlParserImpl.getInstance();
    }
    if (EXT_TXT.equalsIgnoreCase(extension)) {
      return TxtParserImpl.getInstance();
    }
    return null;
  }

  /**
   * Fetch the URI's extension.
   *
   * @param uri
   *          - the resource's URI. Must not be {@code null}.
   * @return the substring starting at the last dot (dot included, original case
   *         preserved), or an empty string if the URI contains no dot or the
   *         last dot belongs to an earlier path segment (e.g. the host name).
   */
  public String getExtension(final String uri) {
    final int extPos = uri.lastIndexOf('.');
    final int lastSeparator = Math.max(uri.lastIndexOf('/'), uri.lastIndexOf('\\'));
    // No dot at all would make substring() throw; a dot in front of the last
    // path separator is part of a directory or host name, not an extension.
    if (extPos < 0 || extPos < lastSeparator) {
      return "";
    }
    return uri.substring(extPos);
  }

  /**
   * Check if the resource is an image.
   *
   * @param uri
   *          - the resource's URI.
   * @return true if the resource's extension is JPG, TIFF or PNG (ignoring
   *         case).
   */
  public boolean isImage(String uri) {
    final String extension = getExtension(uri);
    return EXT_JPG.equalsIgnoreCase(extension) || EXT_TIFF.equalsIgnoreCase(extension)
        || EXT_PNG.equalsIgnoreCase(extension);
  }
}