/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.server.resource; import static java.nio.charset.StandardCharsets.UTF_8; import javax.mail.internet.ContentDisposition; import javax.mail.internet.ParseException; import javax.ws.rs.Consumes; import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.Produces; import javax.ws.rs.WebApplicationException; import javax.ws.rs.core.Context; import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; import javax.ws.rs.core.StreamingOutput; import javax.ws.rs.core.UriInfo; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.Locale; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.cxf.jaxrs.ext.multipart.Attachment; import org.apache.poi.extractor.ExtractorFactory; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.BoilerpipeContentHandler; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RichTextContentHandler; import org.apache.tika.server.InputStreamFactory; import org.apache.tika.server.TikaServerParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @Path("/tika") public class TikaResource { public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n"; public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR"; public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF"; private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class); private static TikaConfig tikaConfig; private static DigestingParser.Digester digester = null; private static InputStreamFactory inputStreamFactory = null; public static void init(TikaConfig config, DigestingParser.Digester digestr, InputStreamFactory iSF) { tikaConfig = config; digester = digestr; inputStreamFactory = iSF; } static { ExtractorFactory.setAllThreadsPreferEventExtractors(true); } @SuppressWarnings("serial") public static Parser createParser() { final Parser parser = new AutoDetectParser(tikaConfig); Map<MediaType, Parser> parsers = ((AutoDetectParser)parser).getParsers(); parsers.put(MediaType.APPLICATION_XML, new HtmlParser()); ((AutoDetectParser)parser).setParsers(parsers); ((AutoDetectParser)parser).setFallback(new Parser() { public Set<MediaType> getSupportedTypes(ParseContext parseContext) { return parser.getSupportedTypes(parseContext); } public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) { throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE); } }); if (digester != null) { return new DigestingParser(parser, digester); } return parser; } public static TikaConfig getConfig() { return tikaConfig; } public static String detectFilename(MultivaluedMap<String, String> httpHeaders) { String disposition = httpHeaders.getFirst("Content-Disposition"); if (disposition != null) { try { ContentDisposition c = new ContentDisposition(disposition); // only support "attachment" dispositions if ("attachment".equals(c.getDisposition())) { String fn = c.getParameter("filename"); if (fn != null) { return fn; } } } catch (ParseException e) { // not a valid content-disposition field LOG.warn("Parse exception {} determining content disposition", e.getMessage(), e); } } // this really should not be used, since it's not an official field return httpHeaders.getFirst("File-Name"); } public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders, Parser embeddedParser) { TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); PDFParserConfig pdfParserConfig = new PDFParserConfig(); for (String key : httpHeaders.keySet()) { if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) { processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX); } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) { processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX); } } parseContext.set(TesseractOCRConfig.class, ocrConfig); parseContext.set(PDFParserConfig.class, pdfParserConfig); if (embeddedParser != null) { parseContext.set(Parser.class, embeddedParser); } } public static InputStream getInputStream(InputStream is, HttpHeaders headers) { try { return inputStreamFactory.getInputSteam(is, headers); } catch (IOException e) { throw new TikaServerParseException(e); } } /** * Utility method to set a property on a class via reflection. * * @param httpHeaders the HTTP headers set. * @param object the <code>Object</code> to set the property on. * @param key the key of the HTTP Header. * @param prefix the name of the HTTP Header prefix used to find property. * @throws WebApplicationException thrown when field cannot be found. */ private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) { try { String property = StringUtils.removeStart(key, prefix); Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property)); field.setAccessible(true); if (field.getType() == String.class) { field.set(object, httpHeaders.getFirst(key)); } else if (field.getType() == int.class) { field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key))); } else if (field.getType() == double.class) { field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key))); } else if (field.getType() == boolean.class) { field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key))); } else { //couldn't find a directly accessible field //try for setX(String s) String setter = StringUtils.uncapitalize(property); setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1); Method m = null; try { m = object.getClass().getMethod(setter, String.class); } catch (NoSuchMethodException e) { //swallow } if (m != null) { m.invoke(object, httpHeaders.getFirst(key)); } } } catch (Throwable ex) { throw new WebApplicationException(String.format(Locale.ROOT, "%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX)); } } @SuppressWarnings("serial") public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) { String fileName = detectFilename(httpHeaders); if (fileName != null) { metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName); } String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE); javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader); if (mediaType != null && "xml".equals(mediaType.getSubtype())) { mediaType = null; } if (mediaType != null && mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) { mediaType = null; } if (mediaType != null) { metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString()); final Detector detector = getDetector(parser); setDetector(parser, new Detector() { public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException { String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE); //make sure never to return null -- TIKA-1845 MediaType type = null; if (ct != null) { //this can return null if ct is not a valid mime type type = MediaType.parse(ct); } if (type != null) { return type; } else { return detector.detect(inputStream, metadata); } } }); } final String password = httpHeaders.getFirst("Password"); if (password != null) { context.set(PasswordProvider.class, new PasswordProvider() { @Override public String getPassword(Metadata metadata) { return password; } }); } } public static void setDetector(Parser p, Detector detector) { AutoDetectParser adp = getAutoDetectParser(p); adp.setDetector(detector); } public static Detector getDetector(Parser p) { AutoDetectParser adp = getAutoDetectParser(p); return adp.getDetector(); } private static AutoDetectParser getAutoDetectParser(Parser p) { //bit stinky if (p instanceof AutoDetectParser) { return (AutoDetectParser)p; } else if (p instanceof ParserDecorator) { Parser wrapped = ((ParserDecorator)p).getWrappedParser(); if (wrapped instanceof AutoDetectParser) { return (AutoDetectParser)wrapped; } throw new RuntimeException("Couldn't find AutoDetectParser within: "+wrapped.getClass()); } throw new RuntimeException("Couldn't find AutoDetectParser within: "+p.getClass()); } public static void parse(Parser parser, Logger logger, String path, InputStream inputStream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException { try (TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) { parser.parse(tikaInputStream, handler, metadata, parseContext); } catch (SAXException e) { throw new TikaServerParseException(e); } catch (EncryptedDocumentException e) { logger.warn("{}: Encrypted document", path, e); throw new TikaServerParseException(e); } catch (Exception e) { logger.warn("{}: Text extraction failed", path, e); throw new TikaServerParseException(e); } } public static void logRequest(Logger logger, UriInfo info, Metadata metadata) { if (metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE) == null) { logger.info("{} (autodetecting type)", info.getPath()); } else { logger.info("{} ({})", info.getPath(), metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)); } } @GET @Produces("text/plain") public String getMessage() { return GREETING; } @POST @Consumes("multipart/form-data") @Produces("text/plain") @Path("form") public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) { return produceText(att.getObject(InputStream.class), att.getHeaders(), info); } //this is equivalent to text-main in tika-app @PUT @Consumes("*/*") @Produces("text/plain") @Path("main") public StreamingOutput getTextMain(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { return produceTextMain(is, httpHeaders.getRequestHeaders(), info); } //this is equivalent to text-main (Boilerpipe handler) in tika-app @POST @Consumes("multipart/form-data") @Produces("text/plain") @Path("form/main") public StreamingOutput getTextMainFromMultipart(final Attachment att, @Context final UriInfo info) { return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info); } public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) { final Parser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); fillMetadata(parser, metadata, context, httpHeaders); fillParseContext(context, httpHeaders, parser); logRequest(LOG, info, metadata); return new StreamingOutput() { public void write(OutputStream outputStream) throws IOException, WebApplicationException { Writer writer = new OutputStreamWriter(outputStream, UTF_8); ContentHandler handler = new BoilerpipeContentHandler(writer); try (InputStream inputStream = is) { parse(parser, LOG, info.getPath(), inputStream, handler, metadata, context); } } }; } @PUT @Consumes("*/*") @Produces("text/plain") public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { return produceText(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info); } public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) { final Parser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); fillMetadata(parser, metadata, context, httpHeaders); fillParseContext(context, httpHeaders, parser); logRequest(LOG, info, metadata); return new StreamingOutput() { public void write(OutputStream outputStream) throws IOException, WebApplicationException { Writer writer = new OutputStreamWriter(outputStream, UTF_8); BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer)); try (InputStream inputStream = is) { parse(parser, LOG, info.getPath(), inputStream, body, metadata, context); } } }; } @POST @Consumes("multipart/form-data") @Produces("text/html") @Path("form") public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) { return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "html"); } @PUT @Consumes("*/*") @Produces("text/html") public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "html"); } @POST @Consumes("multipart/form-data") @Produces("text/xml") @Path("form") public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) { return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "xml"); } @PUT @Consumes("*/*") @Produces("text/xml") public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { return produceOutput(getInputStream(is, httpHeaders), httpHeaders.getRequestHeaders(), info, "xml"); } private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) { final Parser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); fillMetadata(parser, metadata, context, httpHeaders); fillParseContext(context, httpHeaders, parser); logRequest(LOG, info, metadata); return new StreamingOutput() { public void write(OutputStream outputStream) throws IOException, WebApplicationException { Writer writer = new OutputStreamWriter(outputStream, UTF_8); ContentHandler content; try { SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name()); handler.setResult(new StreamResult(writer)); content = new ExpandedTitleContentHandler(handler); } catch (TransformerConfigurationException e) { throw new WebApplicationException(e); } parse(parser, LOG, info.getPath(), is, content, metadata, context); } }; } }