/** * UTF8 * Copyright 2011 by Michael Peter Christen * First released 25.2.2011 at http://yacy.net * * $LastChangedDate$ * $LastChangedRevision$ * $LastChangedBy$ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cora.document.encoding; import java.nio.charset.StandardCharsets; import java.util.Comparator; import org.apache.http.entity.ContentType; import org.apache.http.entity.mime.content.StringBody; /** * convenience class to produce UTF-8 encoding StringBodies and to provide a default * UTF-8 Charset object. * Reason: if this is not used in StringBody-Class initialization, a default charset name is parsed. * This is a synchronized process and all classes using default charsets synchronize at that point * Synchronization is omitted if this class is used * @author admin * */ public class UTF8 implements Comparator<String> { private final static ContentType contentType = ContentType.TEXT_PLAIN.withCharset(StandardCharsets.UTF_8); public static final UTF8 insensitiveUTF8Comparator = new UTF8(true); public static final UTF8 identityUTF8Comparator = new UTF8(false); public boolean insensitive; public UTF8(boolean insensitive) { this.insensitive = insensitive; } @Override public int compare(String o0, String o1) { final int l0 = o0.length(); final int l1 = o1.length(); final int ml = Math.min(l0, l1); char c0, c1; for (int i = 0; i < ml; i++) { if (this.insensitive) { c0 = Character.toLowerCase(o0.charAt(i)); c1 = Character.toLowerCase(o1.charAt(i)); } else { c0 = o0.charAt(i); c1 = o1.charAt(i); } if (c0 == c1) continue; return c0 - c1; } return l0 - l1; } public boolean equals(final String o0, final String o1) { final int l0 = o0.length(); final int l1 = o1.length(); if (l0 != l1) return false; return equals(o0, o1, l1); } private boolean equals(final String o0, final String o1, final int l) { char c0, c1; for (int i = 0; i < l; i++) { if (this.insensitive) { c0 = Character.toLowerCase(o0.charAt(i)); c1 = Character.toLowerCase(o1.charAt(i)); } else { c0 = o0.charAt(i); c1 = o1.charAt(i); } if (c0 == c1) continue; return false; } return true; } public final static StringBody StringBody(final byte[] b) { return StringBody(UTF8.String(b)); } public final static StringBody StringBody(final String s) { return new StringBody(s == null ? "" : s, contentType); } /** * using the string method with the default charset given as argument should prevent using the charset cache * in FastCharsetProvider.java:118 which locks all concurrent threads using a UTF8.String() method * @param bytes * @return */ public final static String String(final byte[] bytes) { return new String(bytes, 0, bytes.length, StandardCharsets.UTF_8); } public final static String String(final byte[] bytes, final int offset, final int length) { return new String(bytes, offset, length, StandardCharsets.UTF_8); } /** * getBytes() as method for String synchronizes during the look-up for the * Charset object for the default charset as given with a default charset name. * This is the normal process: public byte[] getBytes() { return StringCoding.encode(value, offset, count); } static byte[] encode(char[] ca, int off, int len) { String csn = Charset.defaultCharset().name(); try { return encode(csn, ca, off, len); ... static byte[] encode(String charsetName, char[] ca, int off, int len) throws UnsupportedEncodingException { StringEncoder se = (StringEncoder)deref(encoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((se == null) || !(csn.equals(se.requestedCharsetName()) || csn.equals(se.charsetName()))) { se = null; try { Charset cs = lookupCharset(csn); .... private static Charset lookupCharset(String csn) { if (Charset.isSupported(csn)) { try { return Charset.forName(csn); .... public static Charset forName(String charsetName) { Charset cs = lookup(charsetName); .... private static Charset lookup(String charsetName) { if (charsetName == null) throw new IllegalArgumentException("Null charset name"); Object[] a; if ((a = cache1) != null && charsetName.equals(a[0])) return (Charset)a[1]; // We expect most programs to use one Charset repeatedly. // We convey a hint to this effect to the VM by putting the // level 1 cache miss code in a separate method. return lookup2(charsetName); } private static Charset lookup2(String charsetName) { Object[] a; if ((a = cache2) != null && charsetName.equals(a[0])) { cache2 = cache1; cache1 = a; return (Charset)a[1]; } Charset cs; if ((cs = standardProvider.charsetForName(charsetName)) != null || (cs = lookupExtendedCharset(charsetName)) != null || (cs = lookupViaProviders(charsetName)) != null) { cache(charsetName, cs); .... At this point the getBytes() call synchronizes at one of the methods standardProvider.charsetForName lookupExtendedCharset lookupViaProviders * with our call using a given charset object, the call is much easier to perform * and it omits the synchronization for the charset lookup. * * @param s * @return */ public final static byte[] getBytes(final String s) { if (s == null) return null; return s.getBytes(StandardCharsets.UTF_8); } public final static byte[] getBytes(final StringBuilder s) { if (s == null) return null; return s.toString().getBytes(StandardCharsets.UTF_8); } /** * Decodes a <code>application/x-www-form-urlencoded</code> string using a specific * encoding scheme. * for url query part only application/x-www-form-urlencoded (+ -> space) is applied */ public static String decodeURL(final String s) { boolean needToChange = false; final int numChars = s.length(); final StringBuilder sb = new StringBuilder(numChars > 500 ? numChars / 2 : numChars); int i = 0; boolean insearchpart = false; char c; byte[] bytes = null; while (i < numChars) { c = s.charAt(i); switch (c) { case '?' : // mark start of query part (to start x-www-form-urlencoded) sb.append(c); i++; insearchpart = true; // flag to start x-www-form + decoding break; case '+': //application/x-www-form-urlencoded (in searchpart) if (insearchpart) { sb.append(' '); needToChange = true; } else { sb.append(c); } i++; break; case '%': try { if (bytes == null) bytes = new byte[(numChars-i)/3]; int pos = 0; while (((i+2) < numChars) && (c=='%')) { final int v = Integer.parseInt(s.substring(i+1,i+3),16); if (v < 0) { return s; //throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value"); } bytes[pos++] = (byte) v; i+= 3; if (i < numChars) c = s.charAt(i); } if ((i < numChars) && (c=='%')) { return s; //throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern"); } sb.append(new String(bytes, 0, pos, StandardCharsets.UTF_8)); } catch (final NumberFormatException e) { return s; //throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage()); } needToChange = true; break; default: sb.append(c); i++; break; } } return (needToChange? sb.toString() : s); } }