/**
 * Copyright 2005-2014 Restlet
 * 
 * The contents of this file are subject to the terms of one of the following
 * open source licenses: Apache 2.0 or or EPL 1.0 (the "Licenses"). You can
 * select the license that you prefer but you may not use this file except in
 * compliance with one of these Licenses.
 * 
 * You can obtain a copy of the Apache 2.0 license at
 * http://www.opensource.org/licenses/apache-2.0
 * 
 * You can obtain a copy of the EPL 1.0 license at
 * http://www.opensource.org/licenses/eclipse-1.0
 * 
 * See the Licenses for the specific language governing permissions and
 * limitations under the Licenses.
 * 
 * Alternatively, you can obtain a royalty free commercial license with less
 * limitations, transferable or non-transferable, directly at
 * http://restlet.com/products/restlet-framework
 * 
 * Restlet is a registered trademark of Restlet S.A.S.
 */

package org.restlet.ext.lucene;

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.restlet.engine.util.DateUtils;
import org.restlet.ext.xml.SaxRepresentation;
import org.restlet.representation.Representation;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Representation that parses another wrapped representation using Lucene Tika
 * metadata extraction engine.
 * 
 * Tika can be configured to indicate a specific parser to use, otherwise a
 * special auto-detect parser is used. Tika metadata can also be customized if
 * wanted.
 * 
 * @author Jerome Louvel
 */
public class TikaRepresentation extends SaxRepresentation {

    /** The Tika metadata used for both input and output. */
    private Metadata metadata;

    /** The wrapped representation to analyze. */
    private Representation representation;

    /** The optional Tika configuration. */
    private TikaConfig tikaConfig;

    /** The optional Tika parser. */
    private Parser tikaParser;

    /**
     * Constructor. No Tika configuration and no explicit parser are set, so
     * a default auto-detect parser is used when parsing.
     * 
     * @param representation
     *            The wrapped representation to analyze.
     */
    public TikaRepresentation(Representation representation) {
        this(representation, null);
    }

    /**
     * Constructor. No explicit parser is set, so an auto-detect parser built
     * from the given configuration (if any) is used when parsing.
     * 
     * @param representation
     *            The wrapped representation to analyze.
     * @param tikaConfig
     *            The optional Tika configuration.
     */
    public TikaRepresentation(Representation representation,
            TikaConfig tikaConfig) {
        this(representation, tikaConfig, null);
    }

    /**
     * Constructor. The media type of this representation is copied from the
     * wrapped representation when one is given, and a fresh, empty Tika
     * metadata instance is created.
     * 
     * @param representation
     *            The wrapped representation to analyze.
     * @param tikaConfig
     *            The optional Tika configuration.
     * @param tikaParser
     *            The optional Tika parser.
     */
    public TikaRepresentation(Representation representation,
            TikaConfig tikaConfig, Parser tikaParser) {
        super((representation == null) ? null : representation.getMediaType());
        setNamespaceAware(true);
        this.tikaConfig = tikaConfig;
        // Previously the parser argument was accepted but never stored, so a
        // parser supplied at construction time was silently ignored.
        this.tikaParser = tikaParser;
        this.representation = representation;
        this.metadata = new Metadata();
    }

    /**
     * Returns the Tika metadata used for both input and output.
     * 
     * @return The Tika metadata used for both input and output.
     */
    public Metadata getMetadata() {
        return metadata;
    }

    /**
     * Returns the optional Tika configuration.
     * 
     * @return The Tika configuration or null.
     */
    public TikaConfig getTikaConfig() {
        return tikaConfig;
    }

    /**
     * Returns the optional Tika parser.
     * 
     * @return The Tika parser or null.
     */
    public Parser getTikaParser() {
        return tikaParser;
    }

    /**
     * Parses the wrapped representation with Tika to extract the useful
     * metadata and produce structural SAX events (in XHTML format) and send
     * them to the given SAX content handler.
     * 
     * Before parsing, the HTTP-level properties known for the wrapped
     * representation (file name, media type, size, modification date) are
     * copied into the Tika metadata. The stream obtained from the wrapped
     * representation is closed once parsing completes or fails.
     * 
     * @param contentHandler
     *            The target SAX handler.
     * @throws IOException
     *             If there is no wrapped representation, if reading fails, or
     *             if Tika or the SAX handler reports an error (the original
     *             exception is attached as the cause).
     */
    @Override
    public void parse(ContentHandler contentHandler) throws IOException {
        if (this.representation == null) {
            throw new IOException("No wrapped representation to parse.");
        }

        try {
            addHttpMetadata();
            Parser parser = selectParser();

            // Tika consumes but does not close the stream; closing it is the
            // caller's responsibility.
            InputStream stream = this.representation.getStream();

            try {
                parser.parse(stream, contentHandler, getMetadata(),
                        new ParseContext());
            } finally {
                if (stream != null) {
                    stream.close();
                }
            }
        } catch (SAXException e) {
            throw toIOException("SAX exception: ", e);
        } catch (TikaException e) {
            throw toIOException("Tika exception: ", e);
        }
    }

    /**
     * Copies the common HTTP properties of the wrapped representation into
     * the Tika metadata. Properties that are not available are skipped.
     */
    private void addHttpMetadata() {
        if (this.representation.getDisposition() != null) {
            String name = this.representation.getDisposition().getFilename();

            if (name != null) {
                getMetadata().set(TikaMetadataKeys.RESOURCE_NAME_KEY, name);
                getMetadata().set(HttpHeaders.CONTENT_DISPOSITION, name);
            }
        }

        // The media type may be absent; skip it rather than fail.
        if (this.representation.getMediaType() != null) {
            getMetadata().set(HttpHeaders.CONTENT_TYPE,
                    this.representation.getMediaType().toString());
        }

        if (this.representation.getSize() != UNKNOWN_SIZE) {
            getMetadata().set(HttpHeaders.CONTENT_LENGTH,
                    Long.toString(this.representation.getSize()));
        }

        if (this.representation.getModificationDate() != null) {
            getMetadata().set(
                    HttpHeaders.LAST_MODIFIED,
                    DateUtils.format(this.representation
                            .getModificationDate()));
        }
    }

    /**
     * Chooses the Tika parser to use: the explicitly set parser if any,
     * otherwise an auto-detect parser built from the Tika configuration if
     * any, otherwise a default auto-detect parser.
     * 
     * @return The parser to use, never null.
     */
    private Parser selectParser() {
        if (getTikaParser() != null) {
            return getTikaParser();
        }

        if (getTikaConfig() != null) {
            return new AutoDetectParser(getTikaConfig());
        }

        return new AutoDetectParser();
    }

    /**
     * Wraps a parsing failure into an {@link IOException}, keeping the
     * original exception as the cause so its stack trace is not lost.
     * 
     * @param prefix
     *            The message prefix identifying the failure source.
     * @param cause
     *            The original exception.
     * @return The exception to throw.
     */
    private static IOException toIOException(String prefix, Exception cause) {
        IOException result = new IOException(prefix
                + cause.getLocalizedMessage());
        result.initCause(cause);
        return result;
    }

    /**
     * Sets the Tika metadata used for both input and output.
     * 
     * @param metadata
     *            The Tika metadata.
     */
    public void setMetadata(Metadata metadata) {
        this.metadata = metadata;
    }

    /**
     * Sets the optional Tika configuration.
     * 
     * @param tikaConfig
     *            The Tika configuration.
     */
    public void setTikaConfig(TikaConfig tikaConfig) {
        this.tikaConfig = tikaConfig;
    }

    /**
     * Sets the optional Tika parser.
     * 
     * @param tikaParser
     *            The Tika parser.
     */
    public void setTikaParser(Parser tikaParser) {
        this.tikaParser = tikaParser;
    }

}