/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.txt; import java.nio.charset.Charset; import org.apache.tika.detect.TextStatistics; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; import org.mozilla.universalchardet.CharsetListener; import org.mozilla.universalchardet.Constants; import org.mozilla.universalchardet.UniversalDetector; /** * Helper class used by {@link UniversalEncodingDetector} to access the * <code>juniversalchardet</code> detection logic. */ class UniversalEncodingListener implements CharsetListener { private static final String CHARSET_ISO_8859_1 = "ISO-8859-1"; private static final String CHARSET_ISO_8859_15 = "ISO-8859-15"; private final TextStatistics statistics = new TextStatistics(); private final UniversalDetector detector = new UniversalDetector(this); private String hint = null; private Charset charset = null; public UniversalEncodingListener(Metadata metadata) { MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); if (type != null) { hint = type.getParameters().get("charset"); } if (hint == null) { hint = metadata.get(Metadata.CONTENT_ENCODING); } } public void report(String name) { if (Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { // Use the encoding hint when available name = hint; } else if (statistics.count('\r') == 0) { // If there are no CR(LF)s, then the encoding is more // likely to be ISO-8859-1(5) than windows-1252 if (statistics.count(0xa4) > 0) { // currency/euro sign // The general currency sign is hardly ever used in // ISO-8859-1, so it's more likely that we're dealing // with ISO-8859-15, where the character is used for // the euro symbol, which is more commonly used. name = CHARSET_ISO_8859_15; } else { name = CHARSET_ISO_8859_1; } } } try { this.charset = CharsetUtils.forName(name); } catch (Exception e) { // ignore } } public boolean isDone() { return detector.isDone(); } public void handleData(byte[] buf, int offset, int length) { statistics.addData(buf, offset, length); detector.handleData(buf, offset, length); } public Charset dataEnd() { detector.dataEnd(); if (charset == null && statistics.isMostlyAscii()) { report(Constants.CHARSET_WINDOWS_1252); } return charset; } }