package org.gbif.checklistbank.utils;

import org.gbif.api.model.common.MediaObject;
import org.gbif.api.vocabulary.MediaType;

import java.net.URI;
import java.util.Locale;
import java.util.Set;
import javax.annotation.Nullable;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import org.apache.tika.Tika;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility class to detect media types.
 * Copied from occurrence interpretation.
 * TODO: move to parser library
 */
public class MediaTypeUtils {
  private static final Logger LOG = LoggerFactory.getLogger(MediaTypeUtils.class);
  private static final Tika TIKA = new Tika();
  private static final MimeTypes MIME_TYPES = MimeTypes.getDefaultMimeTypes();
  private static final String HTML_TYPE = "text/html";

  /**
   * MIME types detected from a URI that are all treated as plain HTML links.
   * The generic octet stream type is included because links without any suffix default to it,
   * see http://dev.gbif.org/issues/browse/POR-2066
   */
  private static final Set<String> HTML_MIME_TYPES = ImmutableSet.of(
      "text/x-coldfusion", "text/x-php", "text/asp", "text/aspdotnet", "text/x-cgi", "text/x-jsp",
      "text/x-perl", HTML_TYPE, MimeTypes.OCTET_STREAM);

  /**
   * Fills in the format and media type of the given media object, modifying it in place.
   * <ul>
   *   <li>If the object has no format, one is derived from its identifier URI.</li>
   *   <li>If the format is text/html and an identifier exists, the identifier is moved to the
   *   references link and both identifier and format are cleared.</li>
   *   <li>If a format remains, the type is set to StillImage, Sound or MovingImage for image, audio
   *   and video formats respectively. Other formats leave the type untouched.</li>
   * </ul>
   *
   * @param mo the media object to update, must not be null
   * @return the same, modified media object instance
   */
  public static MediaObject detectType(MediaObject mo) {
    if (Strings.isNullOrEmpty(mo.getFormat())) {
      // derive from URI
      mo.setFormat(parseMimeType(mo.getIdentifier()));
    }

    // if MIME type is text/html make it a references link instead
    if (HTML_TYPE.equalsIgnoreCase(mo.getFormat()) && mo.getIdentifier() != null) {
      // make file URI the references link URL instead
      mo.setReferences(mo.getIdentifier());
      mo.setIdentifier(null);
      mo.setFormat(null);
    }

    String format = mo.getFormat();
    if (!Strings.isNullOrEmpty(format)) {
      // MIME type names are case insensitive, so compare on a locale independent lower case copy.
      // The stored format itself is left exactly as it was supplied.
      String normalized = format.toLowerCase(Locale.ROOT);
      if (normalized.startsWith("image")) {
        mo.setType(MediaType.StillImage);
      } else if (normalized.startsWith("audio")) {
        mo.setType(MediaType.Sound);
      } else if (normalized.startsWith("video")) {
        mo.setType(MediaType.MovingImage);
      } else {
        LOG.debug("Unsupported media format {}", format);
      }
    }
    return mo;
  }

  /**
   * Normalizes a given mime type string using apache tika which knows the following types:
   * http://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
   * <p>
   * The input is trimmed and lower cased first. If tika has a registered type for it, the name of
   * that registered type is returned. Otherwise the cleaned input is returned as long as it is a
   * syntactically valid mime type.
   * <p>
   * Note: this overload is not referenced by the other code in this class.
   *
   * @param format the raw mime type string, may be null
   * @return the normalized mime type or null if the input was null, blank or not a valid mime type
   */
  private static String parseMimeType(@Nullable String format) {
    if (format == null) {
      return null;
    }
    String normalized = Strings.emptyToNull(format.trim().toLowerCase(Locale.ROOT));
    if (normalized == null) {
      return null;
    }

    try {
      MimeType mime = MIME_TYPES.getRegisteredMimeType(normalized);
      if (mime != null) {
        return mime.getName();
      }
    } catch (MimeTypeException ignored) {
      // tika could not look up this name; fall through to the plain syntax check below
    }

    // verify this is a reasonable mime type
    return MimeType.isValid(normalized) ? normalized : null;
  }

  /**
   * Detects a mime type from the string form of a URI using apache tika.
   * Any detected type listed in HTML_MIME_TYPES, which includes the generic octet stream type,
   * is reported as text/html.
   *
   * @param uri the URI to inspect, may be null
   * @return the detected mime type, text/html for html like links, or null if the URI was null
   */
  private static String parseMimeType(@Nullable URI uri) {
    if (uri == null) {
      return null;
    }

    String mime = TIKA.detect(uri.toString());
    if (mime != null && HTML_MIME_TYPES.contains(mime.toLowerCase(Locale.ROOT))) {
      // links without any suffix default to OCTET STREAM, see:
      // http://dev.gbif.org/issues/browse/POR-2066
      return HTML_TYPE;
    }
    return mime;
  }
}