/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.mime; import org.apache.tika.detect.MagicDetector; import org.w3c.dom.Attr; import org.w3c.dom.Node; import org.w3c.dom.Element; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.w3c.dom.NamedNodeMap; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; /** * A reader for XML files compliant with the freedesktop MIME-info DTD. * * <pre> * <!DOCTYPE mime-info [ * <!ELEMENT mime-info (mime-type)+> * <!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info"> * * <!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*> * <!ATTLIST mime-type type CDATA #REQUIRED> * * <!-- a comment describing a document with the respective MIME type. Example: "WMV video" --> * <!ELEMENT _comment (#PCDATA)> * <!ATTLIST _comment xml:lang CDATA #IMPLIED> * * <!-- a comment describing a the respective unexpanded MIME type acronym. Example: "WMV" --> * <!ELEMENT acronym (#PCDATA)> * <!ATTLIST acronym xml:lang CDATA #IMPLIED> * * <!-- a comment describing a the respective unexpanded MIME type acronym. Example: "Windows Media Video" --> * <!ELEMENT expanded-acronym (#PCDATA)> * <!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED> * * <!ELEMENT glob EMPTY> * <!ATTLIST glob pattern CDATA #REQUIRED> * <!ATTLIST glob isregex CDATA #IMPLIED> * * <!ELEMENT magic (match)+> * <!ATTLIST magic priority CDATA #IMPLIED> * * <!ELEMENT match (match)*> * <!ATTLIST match offset CDATA #REQUIRED> * <!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED> * <!ATTLIST match value CDATA #REQUIRED> * <!ATTLIST match mask CDATA #IMPLIED> * * <!ELEMENT root-XML EMPTY> * <!ATTLIST root-XML * namespaceURI CDATA #REQUIRED * localName CDATA #REQUIRED> * * <!ELEMENT alias EMPTY> * <!ATTLIST alias * type CDATA #REQUIRED> * * <!ELEMENT sub-class-of EMPTY> * <!ATTLIST sub-class-of * type CDATA #REQUIRED> * ]> * </pre> * * * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec * */ final class MimeTypesReader implements MimeTypesReaderMetKeys { private final MimeTypes types; MimeTypesReader(MimeTypes types) { this.types = types; } void read(InputStream stream) throws IOException, MimeTypeException { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document document = builder.parse(new InputSource(stream)); read(document); } catch (ParserConfigurationException e) { throw new MimeTypeException("Unable to create an XML parser", e); } catch (SAXException e) { throw new MimeTypeException("Invalid type configuration", e); } } void read(Document document) throws MimeTypeException { Element element = document.getDocumentElement(); if (element != null && element.getTagName().equals(MIME_INFO_TAG)) { NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) { Element child = (Element) node; if (child.getTagName().equals(MIME_TYPE_TAG)) { readMimeType(child); } } } } else { throw new MimeTypeException( "Not a <" + MIME_INFO_TAG + "/> configuration document: " + element.getTagName()); } } /** Read Element named mime-type. */ private void readMimeType(Element element) throws MimeTypeException { String name = element.getAttribute(MIME_TYPE_TYPE_ATTR); MimeType type = types.forName(name); NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) { Element nodeElement = (Element) node; if (nodeElement.getTagName().equals(COMMENT_TAG)) { type.setDescription( nodeElement.getFirstChild().getNodeValue()); } else if (nodeElement.getTagName().equals(GLOB_TAG)) { boolean useRegex = Boolean.valueOf(nodeElement.getAttribute(ISREGEX_ATTR)); types.addPattern(type, nodeElement.getAttribute(PATTERN_ATTR), useRegex); } else if (nodeElement.getTagName().equals(MAGIC_TAG)) { readMagic(nodeElement, type); } else if (nodeElement.getTagName().equals(ALIAS_TAG)) { String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR); MediaType aliasType = MediaType.parse(alias); if (aliasType != null) { types.addAlias(type, aliasType); } else { throw new MimeTypeException( "Invalid media type alias: " + alias); } } else if (nodeElement.getTagName().equals(ROOT_XML_TAG)) { readRootXML(nodeElement, type); } else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) { String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR); types.setSuperType(type, MediaType.parse(parent)); } } } types.add(type); } /** * Read Element named magic. * @throws MimeTypeException if the configuration is invalid */ private void readMagic(Element element, MimeType mimeType) throws MimeTypeException { int priority = 50; String value = element.getAttribute(MAGIC_PRIORITY_ATTR); if (value != null && value.length() > 0) { priority = Integer.parseInt(value); } for (Clause clause : readMatches(element, mimeType.getType())) { Magic magic = new Magic(mimeType); magic.setPriority(priority); magic.setClause(clause); mimeType.addMagic(magic); } } private List<Clause> readMatches(Element element, MediaType mediaType) throws MimeTypeException { List<Clause> clauses = new ArrayList<Clause>(); NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); if (node.getNodeType() == Node.ELEMENT_NODE) { Element nodeElement = (Element) node; if (nodeElement.getTagName().equals(MATCH_TAG)) { clauses.add(readMatch(nodeElement, mediaType)); } } } return clauses; } /** Read Element named match. */ private Clause readMatch(Element element, MediaType mediaType) throws MimeTypeException { String type = "string"; int start = 0; int end = 0; String value = null; String mask = null; NamedNodeMap attrs = element.getAttributes(); for (int i = 0; i < attrs.getLength(); i++) { Attr attr = (Attr) attrs.item(i); if (attr.getName().equals(MATCH_OFFSET_ATTR)) { String offset = attr.getValue(); int colon = offset.indexOf(':'); if (colon == -1) { start = Integer.parseInt(offset); end = start; } else { start = Integer.parseInt(offset.substring(0, colon)); end = Integer.parseInt(offset.substring(colon + 1)); } } else if (attr.getName().equals(MATCH_TYPE_ATTR)) { type = attr.getValue(); } else if (attr.getName().equals(MATCH_VALUE_ATTR)) { value = attr.getValue(); } else if (attr.getName().equals(MATCH_MASK_ATTR)) { mask = attr.getValue(); } } if (value == null) { throw new MimeTypeException("Missing magic byte pattern"); } else if (start < 0 || end < start) { throw new MimeTypeException( "Invalid offset range: [" + start + "," + end + "]"); } byte[] patternBytes = decodeValue(type, value); int length = patternBytes.length; byte[] maskBytes = null; if (mask != null) { maskBytes = decodeValue(type, mask); length = Math.max(patternBytes.length, maskBytes.length); } MagicDetector detector = new MagicDetector( mediaType, patternBytes, maskBytes, start, end); Clause clause = new MagicMatch(detector, length); List<Clause> subClauses = readMatches(element, mediaType); if (subClauses.size() == 0) { return clause; } else if (subClauses.size() == 1) { return new AndClause(clause, subClauses.get(0)); } else { return new AndClause(clause, new OrClause(subClauses)); } } private byte[] decodeValue(String type, String value) throws MimeTypeException { // Preliminary check if ((value == null) || (type == null)) { return null; } byte[] decoded = null; String tmpVal = null; int radix = 8; // hex if (value.startsWith("0x")) { tmpVal = value.substring(2); radix = 16; } else { tmpVal = value; radix = 8; } if (type.equals("string")) { decoded = decodeString(value); } else if (type.equals("byte")) { decoded = tmpVal.getBytes(); } else if (type.equals("host16") || type.equals("little16")) { int i = Integer.parseInt(tmpVal, radix); decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) }; } else if (type.equals("big16")) { int i = Integer.parseInt(tmpVal, radix); decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) }; } else if (type.equals("host32") || type.equals("little32")) { long i = Long.parseLong(tmpVal, radix); decoded = new byte[] { (byte) ((i & 0x000000FF)), (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0xFF000000) >> 24) }; } else if (type.equals("big32")) { long i = Long.parseLong(tmpVal, radix); decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24), (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) }; } return decoded; } private byte[] decodeString(String value) throws MimeTypeException { if (value.startsWith("0x")) { byte[] bytes = new byte[(value.length() - 2) / 2]; for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte) Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16); } return bytes; } try { ByteArrayOutputStream decoded = new ByteArrayOutputStream(); for (int i = 0; i < value.length(); i++) { if (value.charAt(i) == '\\') { if (value.charAt(i + 1) == '\\') { decoded.write('\\'); i++; } else if (value.charAt(i + 1) == 'x') { decoded.write(Integer.parseInt( value.substring(i + 2, i + 4), 16)); i += 3; } else { int j = i + 1; while ((j < i + 4) && (j < value.length()) && (Character.isDigit(value.charAt(j)))) { j++; } decoded.write(Short.decode( "0" + value.substring(i + 1, j)).byteValue()); i = j - 1; } } else { decoded.write(value.charAt(i)); } } return decoded.toByteArray(); } catch (NumberFormatException e) { throw new MimeTypeException("Invalid string value: " + value, e); } } /** Read Element named root-XML. */ private void readRootXML(Element element, MimeType mimeType) { mimeType.addRootXML(element.getAttribute(NS_URI_ATTR), element .getAttribute(LOCAL_NAME_ATTR)); } }