StringUtils.java example

Explorer

RipplePower-master
- eclipse
  - RipplePower
    - src
    - tests
      - org
        ripple
        power
        chart
        test
        DialogTest.java
        OnionKitHelper.java
        TestNames.java
        TextEditorDemo.java
        TextEditorDemo2.java
  - jcoinlibs
    - src
- loon-shell
  - apktest
    - gen
      - com
        shell
        apktest
        BuildConfig.java
        R.java
    - src
      - com
        shell
        apktest
        Main.java
  - src
    - loon
      - apk
        shell
        APKActivityCallback.java
        APKActivityControl.java
        APKCallbacks.java
        APKCreateFailedException.java
        APKDexLoader.java
        APKInstrumentObject.java
        APKManager.java
        APKNotFoundException.java
        APKObject.java
        APKProxy.java
        APKReflect.java
        APKServiceProxy.java
        APKSetting.java
        APKShellActivity.java
        APKTools.java

/*
 * Copyright (C) 2010 ZXing authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.zxing.common;

import java.nio.charset.Charset;
import java.util.Map;

import com.google.zxing.DecodeHintType;

/**
 * Common string-related functions.
 * 
 * @author Sean Owen
 * @author Alex Dupre
 */
public final class StringUtils {

	private static final String PLATFORM_DEFAULT_ENCODING = Charset
			.defaultCharset().name();
	public static final String SHIFT_JIS = "SJIS";
	public static final String GB2312 = "GB2312";
	private static final String EUC_JP = "EUC_JP";
	private static final String UTF8 = "UTF8";
	private static final String ISO88591 = "ISO8859_1";
	private static final boolean ASSUME_SHIFT_JIS = SHIFT_JIS
			.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING)
			|| EUC_JP.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING);

	private StringUtils() {
	}

	/**
	 * @param bytes
	 *            bytes encoding a string, whose encoding should be guessed
	 * @param hints
	 *            decode hints if applicable
	 * @return name of guessed encoding; at the moment will only guess one of:
	 *         {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the
	 *         platform default encoding if none of these can possibly be
	 *         correct
	 */
	public static String guessEncoding(byte[] bytes,
			Map<DecodeHintType, ?> hints) {
		if (hints != null) {
			String characterSet = (String) hints
					.get(DecodeHintType.CHARACTER_SET);
			if (characterSet != null) {
				return characterSet;
			}
		}
		// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
		// which should be by far the most common encodings.
		int length = bytes.length;
		boolean canBeISO88591 = true;
		boolean canBeShiftJIS = true;
		boolean canBeUTF8 = true;
		int utf8BytesLeft = 0;
		// int utf8LowChars = 0;
		int utf2BytesChars = 0;
		int utf3BytesChars = 0;
		int utf4BytesChars = 0;
		int sjisBytesLeft = 0;
		// int sjisLowChars = 0;
		int sjisKatakanaChars = 0;
		// int sjisDoubleBytesChars = 0;
		int sjisCurKatakanaWordLength = 0;
		int sjisCurDoubleBytesWordLength = 0;
		int sjisMaxKatakanaWordLength = 0;
		int sjisMaxDoubleBytesWordLength = 0;
		// int isoLowChars = 0;
		// int isoHighChars = 0;
		int isoHighOther = 0;

		boolean utf8bom = bytes.length > 3 && bytes[0] == (byte) 0xEF
				&& bytes[1] == (byte) 0xBB && bytes[2] == (byte) 0xBF;

		for (int i = 0; i < length
				&& (canBeISO88591 || canBeShiftJIS || canBeUTF8); i++) {

			int value = bytes[i] & 0xFF;

			// UTF-8 stuff
			if (canBeUTF8) {
				if (utf8BytesLeft > 0) {
					if ((value & 0x80) == 0) {
						canBeUTF8 = false;
					} else {
						utf8BytesLeft--;
					}
				} else if ((value & 0x80) != 0) {
					if ((value & 0x40) == 0) {
						canBeUTF8 = false;
					} else {
						utf8BytesLeft++;
						if ((value & 0x20) == 0) {
							utf2BytesChars++;
						} else {
							utf8BytesLeft++;
							if ((value & 0x10) == 0) {
								utf3BytesChars++;
							} else {
								utf8BytesLeft++;
								if ((value & 0x08) == 0) {
									utf4BytesChars++;
								} else {
									canBeUTF8 = false;
								}
							}
						}
					}
				} // else {
					// utf8LowChars++;
					// }
			}

			// ISO-8859-1 stuff
			if (canBeISO88591) {
				if (value > 0x7F && value < 0xA0) {
					canBeISO88591 = false;
				} else if (value > 0x9F) {
					if (value < 0xC0 || value == 0xD7 || value == 0xF7) {
						isoHighOther++;
					} // else {
						// isoHighChars++;
						// }
				} // else {
					// isoLowChars++;
					// }
			}

			// Shift_JIS stuff
			if (canBeShiftJIS) {
				if (sjisBytesLeft > 0) {
					if (value < 0x40 || value == 0x7F || value > 0xFC) {
						canBeShiftJIS = false;
					} else {
						sjisBytesLeft--;
					}
				} else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
					canBeShiftJIS = false;
				} else if (value > 0xA0 && value < 0xE0) {
					sjisKatakanaChars++;
					sjisCurDoubleBytesWordLength = 0;
					sjisCurKatakanaWordLength++;
					if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
						sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
					}
				} else if (value > 0x7F) {
					sjisBytesLeft++;
					// sjisDoubleBytesChars++;
					sjisCurKatakanaWordLength = 0;
					sjisCurDoubleBytesWordLength++;
					if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
						sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
					}
				} else {
					// sjisLowChars++;
					sjisCurKatakanaWordLength = 0;
					sjisCurDoubleBytesWordLength = 0;
				}
			}
		}

		if (canBeUTF8 && utf8BytesLeft > 0) {
			canBeUTF8 = false;
		}
		if (canBeShiftJIS && sjisBytesLeft > 0) {
			canBeShiftJIS = false;
		}

		// Easy -- if there is BOM or at least 1 valid not-single byte character
		// (and no evidence it can't be UTF-8), done
		if (canBeUTF8
				&& (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
			return UTF8;
		}
		// Easy -- if assuming Shift_JIS or at least 3 valid consecutive
		// not-ascii characters (and no evidence it can't be), done
		if (canBeShiftJIS
				&& (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
			return SHIFT_JIS;
		}
		// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for
		// short words. The crude heuristic is:
		// - If we saw
		// - only two consecutive katakana chars in the whole text, or
		// - at least 10% of bytes that could be "upper" not-alphanumeric
		// Latin1,
		// - then we conclude Shift_JIS, else ISO-8859-1
		if (canBeISO88591 && canBeShiftJIS) {
			return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2)
					|| isoHighOther * 10 >= length ? SHIFT_JIS : ISO88591;
		}

		// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to
		// default platform encoding
		if (canBeISO88591) {
			return ISO88591;
		}
		if (canBeShiftJIS) {
			return SHIFT_JIS;
		}
		if (canBeUTF8) {
			return UTF8;
		}
		// Otherwise, we take a wild guess with platform encoding
		return PLATFORM_DEFAULT_ENCODING;
	}

}