package org.bbaw.wsp.cms.dochandler.parser.text.parser;

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.bbaw.wsp.cms.dochandler.parser.document.GeneralDocument;
import org.bbaw.wsp.cms.dochandler.parser.text.reader.IResourceReader;
import org.bbaw.wsp.cms.dochandler.parser.text.reader.ResourceReaderImpl;
import org.bbaw.wsp.cms.document.MetadataRecord;
import org.xml.sax.ContentHandler;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * This class is the API for all parsers.
 *
 * It reads a resource through an {@link IResourceReader}, hands the stream to
 * a TIKA {@link Parser} and wraps the extracted text and metadata in a
 * document model built by a {@link DocumentModelStrategy}.
 *
 * @author Sascha Feldmann (wsp-shk1)
 * @date 06.09.12
 *
 *       Last change: - ApplicationException instead of log file - Uses
 *       {@link DocumentModelStrategy} only.
 *
 */
public abstract class ResourceParser {
  protected Parser parser;
  protected IResourceReader resourceReader;
  protected DocumentModelStrategy saveStrategy;

  /**
   * Create a new ResourceParser instance that delegates to the given TIKA
   * parser. The resource reader and the save strategy are initialised with
   * {@link ResourceReaderImpl} and {@link DocumentModelStrategy}.
   *
   * @param parser
   *          - the TIKA {@link Parser} used to extract text and metadata.
   * @throws IllegalArgumentException
   *           if the parser is null.
   */
  public ResourceParser(final Parser parser) {
    if (parser == null) {
      throw new IllegalArgumentException("The value for the parameter parser in the constructor of ResourceParser mustn't be empty.");
    }
    this.parser = parser;
    this.resourceReader = new ResourceReaderImpl();
    this.saveStrategy = new DocumentModelStrategy();
  }

  /**
   * Parse a document and return its document model, carrying the fulltext
   * and the metadata that could be matched.
   *
   * @param startUri
   *          - the harvesting URI. NOTE(review): this value is currently not
   *          used; the document URI is handed to the save strategy for both
   *          of its leading arguments. Confirm whether startUri was intended.
   * @param uri
   *          - the URI to the document.
   * @return the {@link GeneralDocument} created by the save strategy, with
   *         its {@link MetadataRecord} set.
   * @throws ApplicationException
   *           if there were errors while reading or parsing.
   * @throws IllegalArgumentException
   *           if the uri is null or empty
   * @throws IllegalStateException
   *           if the save strategy wasn't set before.
   */
  public Object parse(final String startUri, final String uri) throws ApplicationException {
    if (uri == null || uri.isEmpty()) {
      throw new IllegalArgumentException("The value for the parameter uri in the method parse() in ResourceParser mustn't be empty.");
    }
    if (this.saveStrategy == null) {
      throw new IllegalStateException("You must define a saveStategy before calling the parse()-method in ResourceParser.");
    }
    try {
      final InputStream input = this.resourceReader.read(uri);
      // Don't limit the amount of characters -> -1 as argument
      final ContentHandler textHandler = new BodyContentHandler(-1);
      final Metadata metadata = new Metadata();
      final ParseContext context = new ParseContext();

      // The stream has to be released on the failure path as well, otherwise
      // every document that fails to parse leaks an open stream.
      boolean parsed = false;
      try {
        this.parser.parse(input, textHandler, metadata, context);
        parsed = true;
      } finally {
        closeAfterParse(input, parsed);
      }
      textHandler.endDocument();

      final MetadataRecord mdRecord = new MetadataRecord();
      this.matchMetadata(metadata, mdRecord);

      final GeneralDocument doc = (GeneralDocument) this.saveStrategy.generateDocumentModel(uri, uri, textHandler.toString());
      doc.setMetadata(mdRecord);
      return doc;
    } catch (Exception e) {
      throw new ApplicationException("Problem while parsing file " + uri + " -- exception: " + e.getMessage() + "\n");
    }
  }

  /**
   * Close the document stream once parsing has finished.
   *
   * @param input
   *          the stream handed to the parser.
   * @param parsed
   *          true if the parser returned normally. In that case a failure to
   *          close is propagated; otherwise it is suppressed so that the
   *          parser's own exception is the one reported to the caller.
   * @throws IOException
   *           if closing fails after a successful parse.
   */
  private static void closeAfterParse(final InputStream input, final boolean parsed) throws IOException {
    if (parsed) {
      input.close();
      return;
    }
    if (input != null) {
      try {
        input.close();
      } catch (IOException ignored) {
        // Parsing already failed; keep that exception as the reported one.
      }
    }
  }

  /**
   * Match the extracted metadata from TIKA to the {@link MetadataRecord}.
   * Only the page count and the content type are copied, and only when TIKA
   * delivered a value for them. A page count that is not a valid integer
   * raises a NumberFormatException.
   *
   * @param metadata
   *          the {@link Metadata} of TIKA.
   * @param mdRecord
   *          the {@link MetadataRecord}.
   */
  private void matchMetadata(Metadata metadata, MetadataRecord mdRecord) {
    @SuppressWarnings("deprecation")
    final String pageCount = metadata.get(Metadata.PAGE_COUNT);
    if (pageCount != null) {
      mdRecord.setPageCount(Integer.parseInt(pageCount));
    }

    final String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
      mdRecord.setType(type);
    }
  }
}