/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.utils; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @SuppressWarnings("serial") public class CharsetUtils { private static final Pattern CHARSET_NAME_PATTERN = Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*"); private static final Pattern ISO_NAME_PATTERN = Pattern.compile("(?i).*8859-([\\d]+)"); private static final Pattern CP_NAME_PATTERN = Pattern.compile("(?i)cp-([\\d]+)"); private static final Pattern WIN_NAME_PATTERN = Pattern.compile("(?i)win(|-)([\\d]+)"); // List of common invalid charset names that we can't fix using // pattern matching + heuristic private static final Map<String, String> CHARSET_ALIASES = new HashMap<String, String>() {{ put("none", null); put("no", null); put("iso-8851-1", "iso-8859-1"); put("windows", "windows-1252"); put("koi8r", "KOI8-R"); }}; /** * Safely return whether <charsetName> is supported, without throwing exceptions * * @param charsetName Name of charset (can be null) * @return true if the character set is supported */ public static boolean isSupported(String charsetName) { try { return Charset.isSupported(charsetName); } catch (IllegalCharsetNameException e) { return false; } catch (IllegalArgumentException e) { // null, for example return false; } catch (Exception e) { // Unexpected exception, what to do? return false; } } /** * Handle various common charset name errors, and return something * that will be considered valid (and is normalized) * * @param charsetName name of charset to process * @return potentially remapped/cleaned up version of charset name */ public static String clean(String charsetName) { if (charsetName == null) { return null; } // Get rid of cruft around names, like <>, trailing commas, etc. Matcher m = CHARSET_NAME_PATTERN.matcher(charsetName); if (!m.matches()) { return null; } String result = m.group(1); if (CHARSET_ALIASES.containsKey(result.toLowerCase())) { // Handle common erroneous charset names. result = CHARSET_ALIASES.get(result.toLowerCase()); } else if (ISO_NAME_PATTERN.matcher(result).matches()) { // Handle "iso 8859-x" error m = ISO_NAME_PATTERN.matcher(result); m.matches(); result = "iso-8859-" + m.group(1); } else if (CP_NAME_PATTERN.matcher(result).matches()) { // Handle "cp-xxx" error m = CP_NAME_PATTERN.matcher(result); m.matches(); result = "cp" + m.group(1); } else if (WIN_NAME_PATTERN.matcher(result).matches()) { // Handle "winxxx" and "win-xxx" errors m = WIN_NAME_PATTERN.matcher(result); m.matches(); result = "windows-" + m.group(2); } try { Charset cs = Charset.forName(result); return cs.name(); } catch (Exception e) { return null; } } }