/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package mj.ocraptor.extraction.tika.parser.image; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import javax.imageio.IIOException; import javax.imageio.ImageIO; import javax.imageio.ImageReader; import javax.imageio.metadata.IIOMetadata; import javax.imageio.stream.ImageInputStream; import mj.ocraptor.extraction.image_processing.TikaImageHelper; import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class ImageParser extends AbstractParser { /** Serial version UID */ private static final long serialVersionUID = 7852529269245520335L; private static final MediaType CANONICAL_BMP_TYPE = MediaType.image("x-ms-bmp"); private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp"); private static final Set<MediaType> SUPPORTED_TYPES = Collections .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(CANONICAL_BMP_TYPE, JAVA_BMP_TYPE, MediaType.image("gif"), MediaType.image("png"), MediaType.image("vnd.wap.wbmp"), MediaType.image("x-icon"), MediaType.image("x-xcf")))); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { String type = metadata.get(Metadata.CONTENT_TYPE); if (type != null) { TikaInputStream tikaStream = TikaInputStream.get(stream); File imageFile = tikaStream.getFile(); // Java has a different idea of the BMP mime type to // what the canonical one is, fix this up. if (CANONICAL_BMP_TYPE.toString().equals(type)) { type = JAVA_BMP_TYPE.toString(); } try { Iterator<ImageReader> iterator = ImageIO.getImageReadersByMIMEType(type); if (iterator.hasNext()) { ImageReader reader = iterator.next(); try { ImageInputStream imageStream = ImageIO .createImageInputStream(new CloseShieldInputStream(tikaStream)); try { reader.setInput(imageStream); metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0))); metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0))); metadata.set("height", Integer.toString(reader.getHeight(0))); metadata.set("width", Integer.toString(reader.getWidth(0))); loadMetadata(reader.getImageMetadata(0), metadata); } catch (Exception e) { // TODO: logging } finally { imageStream.close(); } } finally { reader.dispose(); } } // Translate certain Metadata tags from the ImageIO // specific namespace into the general Tika one setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS); setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS); setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE); } catch (IIOException e) { // TIKA-619: There is a known bug in the Sun API when dealing with GIF // images // which Tika will just ignore. if (!(e.getMessage().equals("Unexpected block type 0!") && type.equals("image/gif"))) { throw new TikaException(type + " parse error", e); } } // ------------------------------------------------ // // -- ocr image data // ------------------------------------------------ // XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); TikaImageHelper helper = new TikaImageHelper(metadata); try { helper.addImage(imageFile); helper.addTextToHandler(xhtml); } catch (Exception e) { e.printStackTrace(); } finally { if (helper != null) { helper.close(); } } xhtml.endDocument(); } } private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) { if (metadata.get(imageIOkey) != null) { String v = metadata.get(imageIOkey); if (v.endsWith(" ")) { v = v.substring(0, v.lastIndexOf(' ')); } metadata.set(tikaProp, v); } } private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) { String[] names = imageMetadata.getMetadataFormatNames(); if (names == null) { return; } int length = names.length; for (int i = 0; i < length; i++) { loadNode(metadata, imageMetadata.getAsTree(names[i]), "", false); } } private static void loadNode(Metadata metadata, Node node, String parents, boolean addThisNodeName) { if (addThisNodeName) { if (parents.length() > 0) { parents += " "; } parents += node.getNodeName(); } NamedNodeMap map = node.getAttributes(); if (map != null) { int length = map.getLength(); if (length == 1) { metadata.add(parents, normalize(map.item(0).getNodeValue())); } else if (length > 1) { StringBuilder value = new StringBuilder(); for (int i = 0; i < length; i++) { if (i > 0) { value.append(", "); } Node attr = map.item(i); value.append(attr.getNodeName()); value.append("="); value.append(normalize(attr.getNodeValue())); } metadata.add(parents, value.toString()); } } Node child = node.getFirstChild(); while (child != null) { // print children recursively loadNode(metadata, child, parents, true); child = child.getNextSibling(); } } private static String normalize(String value) { if (value != null) { value = value.trim(); } else { value = ""; } if (Boolean.TRUE.toString().equalsIgnoreCase(value)) { return Boolean.TRUE.toString(); } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) { return Boolean.FALSE.toString(); } return value; } }