/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.util; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Class of mimetype utilities. * @author stack */ public class MimetypeUtils { /** * The 'no-type' content-type. * * Defined in the ARC file spec at * http://www.archive.org/web/researcher/ArcFileFormat.php. */ public static final String NO_TYPE_MIMETYPE = "no-type"; /** * Truncation regex. */ protected final static Pattern TRUNCATION_REGEX = Pattern.compile("^([^\\s;,]+).*"); /** * Truncate passed mimetype. * * Ensure no spaces. Strip encoding. Truncation required by * ARC files. * * <p>Truncate at delimiters [;, ]. * Truncate multi-part content type header at ';'. * Apache httpclient collapses values of multiple instances of the * header into one comma-separated value,therefore truncated at ','. * Current ia_tools that work with arc files expect 5-column * space-separated meta-lines, therefore truncate at ' '. * * @param contentType Raw content-type. * * @return Computed content-type made from passed content-type after * running it through a set of rules. */ public static String truncate(String contentType) { if (contentType == null) { contentType = NO_TYPE_MIMETYPE; } else { Matcher matcher = TRUNCATION_REGEX.matcher(contentType); if (matcher.matches()) { contentType = matcher.group(1); } else { contentType = NO_TYPE_MIMETYPE; } } return contentType; } }