/*
 * XMLUtils.java
 *
 * Created on April 30, 2001, 4:51 PM
 */
package com.limegroup.gnutella.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.limewire.io.IOUtils;
import org.limewire.logging.Log;
import org.limewire.logging.LogFactory;
import org.limewire.util.I18NConvert;
import org.limewire.util.StringUtils;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Contains static utility methods for XML text handling (escaping,
 * unescaping, DOM text extraction), query/reply document matching,
 * header-prefixed compression of XML payloads, and partial file hashing.
 *
 * @author asingla
 */
public class LimeXMLUtils {

    private static final Log LOG = LogFactory.getLog(LimeXMLUtils.class);

    /** Minimum fraction of query fields that must match or be absent. */
    private static final double MATCHING_RATE = .9;

    private static final String C_HEADER_BEGIN = "{";
    private static final String C_HEADER_END = "}";
    private static final String C_HEADER_NONE_VAL = "plaintext";
    private static final String C_HEADER_ZLIB_VAL = "deflate";
    private static final String C_HEADER_GZIP_VAL = "gzip";

    private static final String COMPRESS_HEADER_ZLIB =
        C_HEADER_BEGIN + C_HEADER_ZLIB_VAL + C_HEADER_END;
    private static final String COMPRESS_HEADER_GZIP =
        C_HEADER_BEGIN + C_HEADER_GZIP_VAL + C_HEADER_END;
    private static final String COMPRESS_HEADER_NONE =
        C_HEADER_BEGIN + C_HEADER_END;

    private static final int NONE = 0;
    private static final int GZIP = 1;
    private static final int ZLIB = 2;

    /** Entity bodies (the part after the ampersand) recognised by {@link #unencodeXML(String)}. */
    private static final String[] ENTITY_BODIES = { "amp;", "lt;", "gt;", "quot;", "apos;" };

    /** Literal replacements, index-aligned with {@link #ENTITY_BODIES}. */
    private static final String[] ENTITY_VALUES = { "&", "<", ">", "\"", "'" };

    /**
     * Gets the text content of the child nodes.
     * This is the same as Node.getTextContent(), but exists on all
     * JDKs.
     */
    public static String getTextContent(Node node) {
        return getText(node.getChildNodes());
    }

    /**
     * Collapses a list of CDATASection, Text, and predefined EntityReference
     * nodes into a single string. If the list contains other types of nodes,
     * those other nodes are ignored.
     */
    public static String getText(NodeList nodeList) {
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            switch (node.getNodeType()) {
            case Node.CDATA_SECTION_NODE:
            case Node.TEXT_NODE:
                buffer.append(node.getNodeValue());
                break;
            case Node.ENTITY_REFERENCE_NODE:
                String name = node.getNodeName();
                if (name.equals("amp"))
                    buffer.append('&');
                else if (name.equals("lt"))
                    buffer.append('<');
                else if (name.equals("gt"))
                    buffer.append('>');
                else if (name.equals("apos"))
                    buffer.append('\'');
                else if (name.equals("quot"))
                    buffer.append('"');
                // Any other entity references are ignored
                break;
            default:
                // All other nodes are ignored
            }
        }
        return buffer.toString();
    }

    /**
     * Returns the predefined XML entity for the given character, or
     * <code>null</code> if the character does not need escaping.
     */
    private static String entityFor(int c) {
        switch (c) {
        case '&':
            return "&amp;";
        case '<':
            return "&lt;";
        case '>':
            return "&gt;";
        case '"':
            return "&quot;";
        case '\'':
            return "&apos;";
        default:
            return null;
        }
    }

    /**
     * Writes <CODE>string</CODE> into writer, replacing each of the five
     * XML special characters (ampersand, apostrophe, double quote,
     * less-than and greater-than) with its predefined entity, e.g.
     * {@code <} is written as {@code &lt;}.
     *
     * @exception IOException if the writer fails
     */
    public static void writeEscapedString(Writer writer, String string)
      throws IOException {
        for (int i = 0; i < string.length(); i++) {
            char c = string.charAt(i);
            String entity = entityFor(c);
            if (entity != null)
                writer.write(entity);
            else
                writer.write(c);
        }
    }

    /**
     * Reads all the bytes from the passed input stream till end of stream
     * reached. The stream is not closed.
     *
     * @param in the input stream to read from
     * @return array of bytes read
     * @exception IOException If any I/O exception occurs while reading data
     */
    public static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream byteArray = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int bytesRead;
        while ((bytesRead = in.read(buffer)) != -1) {
            byteArray.write(buffer, 0, bytesRead);
        }
        return byteArray.toByteArray();
    }

    /**
     * Compares the queryDoc with the replyDoc and finds out if the
     * replyDoc is a match for the queryDoc.
     * <p>
     * Every non-empty query field is compared against the reply's value for
     * the same field: numerically (exact equality) when both values parse as
     * doubles, otherwise as a case-insensitive prefix match against the
     * start of the reply value or the start of any space-separated word in
     * it. A reply field that is missing or empty counts as a "null" rather
     * than as a mismatch.
     *
     * @param replyDoc potential reply Document
     * @param queryDoc the query Document
     * @param allowAllNulls if true, a reply is accepted even when none of
     *  the queried fields actually matched, as long as the missing fields
     *  satisfy the matching rate
     * @return true if the replyDoc is a match for the queryDoc, false
     *  otherwise
     * @throws NullPointerException if either document is null
     */
    public static boolean match(LimeXMLDocument replyDoc,
                                LimeXMLDocument queryDoc,
                                boolean allowAllNulls) {
        if (queryDoc == null || replyDoc == null)
            throw new NullPointerException("querying with null doc.");

        // First find the names of all the fields in the query
        Collection<Map.Entry<String, String>> queryNameValues =
            queryDoc.getNameValueSet();
        int size = queryNameValues.size();
        int matchCount = 0; // number of matches
        int nullCount = 0;  // number of fields in query not in replyDoc.
        boolean matchedBitrate = false;

        for (Map.Entry<String, String> entry : queryNameValues) {
            String currFieldName = entry.getKey();
            String queryValue = entry.getValue();
            assert queryValue != null : "null value";
            if (queryValue.equals(""))
                continue; // "" matches everything!!

            String replyDocValue = replyDoc.getValue(currFieldName);

            // A license type in the query is mandatory: the reply must
            // carry it, otherwise the whole document is rejected.
            if (currFieldName.endsWith("license_type__")) {
                if (replyDocValue == null || !replyDocValue.startsWith(queryValue))
                    return false;
            }

            if (replyDocValue == null || replyDocValue.equals("")) {
                nullCount++;
                continue;
            }

            try {
                // if this is a parse-able numeric value, doing a prefix
                // matching doesn't make sense. cast it to a double and do
                // a straight equals comparison
                double rDVD = Double.parseDouble(replyDocValue);
                double qVD = Double.parseDouble(queryValue);
                if (rDVD == qVD) {
                    matchCount++;
                    if (currFieldName.equals(LimeXMLNames.AUDIO_BITRATE))
                        matchedBitrate = true;
                }
                continue;
            } catch (NumberFormatException nfe) {
                // at least one side is not numeric: fall through to the
                // textual comparison below
            }

            // we used to do a .equalsIgnoreCase, but that is a little too
            // rigid. so do a ignore case prefix match.
            String queryValueLC = queryValue.toLowerCase(Locale.US);
            String replyDocValueLC = I18NConvert.instance().getNorm(replyDocValue);
            if (replyDocValueLC.startsWith(queryValueLC)
                    || replyDocValueLC.indexOf(" " + queryValueLC) >= 0)
                matchCount++;
        }

        // The metric of a correct match is that whatever fields are specified
        // in the query must have prefix match* with the fields in the reply
        // unless the reply has a null for that field, in which case we are OK
        // with letting it slide. also, %MATCHING_RATE of the fields must
        // either be a prefix match or null.
        // We make an exception for queries of size 1 field. In this case, there
        // must be a 100% match (which is trivially >= %MATCHING_RATE)
        // * prefix match assumes a string; for numerics just do an equality test
        double sizeD = size;
        double matchCountD = matchCount;
        double nullCountD = nullCount;

        if (size > 1) {
            if (matchedBitrate) {
                // discount a bitrate match. matching bitrate's shouldn't
                // influence the logic because where size is 2, a matching
                // bitrate will result in a lot of irrelevant results.
                sizeD--;
                matchCountD--;
                matchCount--;
            }
            if (((nullCountD + matchCountD) / sizeD) < MATCHING_RATE)
                return false;
            // ok, it passed rate test, now make sure it had SOME matches...
            return allowAllNulls || matchCount > 0;
        } else if (size == 1) {
            if (allowAllNulls && nullCount == 1)
                return true;
            return matchCountD / sizeD >= 1;
        }
        // this should never happen - size > 0
        return false;
    }

    /**
     * Scans over the given String and returns a new String in which every
     * control character (general category {@link Character#CONTROL}, e.g.
     * \u0002 or \u0004, but also tab and line breaks) is replaced by a
     * single space. If no such character exists the original string
     * instance is returned.
     * <p>
     * This DOES NOT CONVERT characters such as the ampersand or less-than
     * sign into entities; use {@link #encodeXML(String)} for that.
     *
     * @return the cleaned string, or null if null is passed as argument
     */
    public static String scanForBadCharacters(String input) {
        if (input == null)
            return null;

        int length = input.length();
        // lazily create the buffer so that we can scan & return the string
        // itself w/o recreating it if we didn't have to.
        StringBuilder buffer = null;
        for (int i = 0; i < length; ) {
            int c = input.codePointAt(i);
            // TODO: do other types need to be blanked out?
            if (Character.getType(c) == Character.CONTROL) {
                if (buffer == null)
                    buffer = createBuffer(input, i);
                buffer.append(' ');
            } else if (buffer != null) {
                buffer.appendCodePoint(c);
            }
            i += Character.charCount(c);
        }

        return buffer == null ? input : buffer.toString();
    }

    /**
     * Attempts to unencode any leftover encoded entities in the XML.
     * This is generally caused by poor ID3 writers that store an entity such
     * as {@code &amp;} where the literal character was meant.
     * <p>
     * Only the five predefined entities are decoded, and the input is
     * scanned once: a decoded ampersand is never re-examined.
     *
     * @return the decoded string, or null if null is passed as argument
     */
    public static String unencodeXML(String input) {
        if (input == null)
            return null;

        int length = input.length();
        // lazily create the buffer so that we can scan & return the string
        // itself w/o recreating it if we didn't have to.
        StringBuilder buffer = null;
        for (int i = 0; i < length; ) {
            int c = input.codePointAt(i);
            int entity = -1;
            if (c == '&') {
                for (int e = 0; e < ENTITY_BODIES.length && entity < 0; e++) {
                    String body = ENTITY_BODIES[e];
                    if (input.regionMatches(i + 1, body, 0, body.length()))
                        entity = e;
                }
            }

            if (entity >= 0) {
                if (buffer == null)
                    buffer = createBuffer(input, i);
                buffer.append(ENTITY_VALUES[entity]);
                // skip the entity body; the ampersand itself is skipped below
                i += ENTITY_BODIES[entity].length();
            } else if (buffer != null) {
                buffer.appendCodePoint(c);
            }
            i += Character.charCount(c);
        }

        // If we never created the buffer, return the string itself.
        return buffer == null ? input : buffer.toString();
    }

    /**
     * Parses the passed string, and encodes the special characters (used in
     * xml for special purposes) with the appropriate codes,
     * e.g. {@code <} is changed to {@code &lt;}. Control characters are
     * replaced by a space, as in {@link #scanForBadCharacters(String)}.
     *
     * @return the encoded string. Returns null, if null is passed as argument
     */
    public static String encodeXML(String input) {
        if (input == null)
            return null;

        int length = input.length();
        // lazily create the buffer so that we can scan & return the string
        // itself w/o recreating it if we didn't have to.
        StringBuilder buffer = null;
        for (int i = 0; i < length; ) {
            int c = input.codePointAt(i);
            // TODO: do other types need to be blanked out?
            String replacement =
                Character.getType(c) == Character.CONTROL ? " " : entityFor(c);
            if (replacement != null) {
                if (buffer == null)
                    buffer = createBuffer(input, i);
                buffer.append(replacement);
            } else if (buffer != null) {
                buffer.appendCodePoint(c);
            }
            i += Character.charCount(c);
        }

        // If we never created the buffer, return the string itself.
        return buffer == null ? input : buffer.toString();
    }

    /** Creates a StringBuilder pre-filled with the first <code>upTo</code> chars of data. */
    private static StringBuilder createBuffer(String data, int upTo) {
        StringBuilder sb = new StringBuilder(data.length() * 2);
        sb.append(data, 0, upTo);
        return sb;
    }

    /**
     * Prefixes the data with a compression header and, when worthwhile,
     * compresses it. Only the "deflate" (ZLIB) scheme is ever produced;
     * {@link #uncompress(byte[])} additionally accepts "gzip".
     *
     * @return A properly formatted version of the input data.
     */
    public static byte[] compress(byte[] data) {
        byte[] compressedData = null;
        if (shouldCompress(data))
            compressedData = compressZLIB(data);

        if (compressedData != null)
            return prependHeader(COMPRESS_HEADER_ZLIB, compressedData);

        // not compressed (too small, or compression failed):
        // just send prefixed raw data....
        return prependHeader(COMPRESS_HEADER_NONE, data);
    }

    /** Returns a new array holding the ASCII bytes of header followed by payload. */
    private static byte[] prependHeader(String header, byte[] payload) {
        int headerLength = header.length();
        byte[] result = new byte[headerLength + payload.length];
        System.arraycopy(StringUtils.toAsciiBytes(header), 0,
                         result, 0, headerLength);
        System.arraycopy(payload, 0, result, headerLength, payload.length);
        return result;
    }

    /**
     * Only payloads of at least 1000 bytes are compressed. In the future,
     * this may apply further heuristics about whether data should be
     * compressed or not.
     */
    private static boolean shouldCompress(byte[] data) {
        return data.length >= 1000;
    }

    /**
     * Returns a ZLIB'ed version of data, or null if compression failed.
     */
    private static byte[] compressZLIB(byte[] data) {
        DeflaterOutputStream dos = null;
        Deflater def = null;
        try {
            def = new Deflater();
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            dos = new DeflaterOutputStream(baos, def);
            dos.write(data, 0, data.length);
            dos.flush();
            dos.close(); // required to flush data -- flush doesn't do it.
            return baos.toByteArray();
        } catch (IOException e) {
            // This should REALLY never happen because no devices are involved.
            assert false : "Couldn't write to byte stream";
            return null;
        } finally {
            IOUtils.close(dos);
            IOUtils.close(def);
        }
    }

    /**
     * Strips the optional compression header from data and uncompresses the
     * payload accordingly.
     * <p>
     * Data that does not start with "{" is assumed to be header-less plain
     * text and is returned as-is. The payload always starts right after the
     * first "}", regardless of whitespace or letter case inside the header.
     *
     * @return Correctly uncompressed data (according to Content-Type header)
     *  May return a byte[] of length 0 if the header is malformed or names
     *  an unknown compression scheme.
     * @exception IOException if the compressed payload is corrupt
     */
    public static byte[] uncompress(byte[] data) throws IOException {
        final byte headerBegin = (byte) C_HEADER_BEGIN.charAt(0);
        final byte headerEnd = (byte) C_HEADER_END.charAt(0);

        // the Content-Type header is optional, assumes PT
        if (data.length == 0 || data[0] != headerBegin)
            return data;

        // locate the end of the header
        int end = -1;
        for (int i = 1; i < data.length && end < 0; i++) {
            if (data[i] == headerEnd)
                end = i;
        }
        if (end < 0)
            return new byte[0]; // unterminated header, drop it on the floor

        // "{" is at index 0, so the header text spans [1, end)
        String header = StringUtils.getASCIIString(data, 1, end - 1);
        int comp = getCompressionType(header);
        if (comp != NONE && comp != GZIP && comp != ZLIB)
            return new byte[0]; // uncompressible XML, just drop it on the floor....

        int payloadOffset = end + 1;
        byte[] payload = new byte[data.length - payloadOffset];
        System.arraycopy(data, payloadOffset, payload, 0, payload.length);

        switch (comp) {
        case GZIP:
            return uncompressGZIP(payload);
        case ZLIB:
            return uncompressZLIB(payload);
        default: // NONE
            return payload;
        }
    }

    /**
     * Maps the text between the header braces to NONE, GZIP or ZLIB.
     * Returns -1 for an unknown scheme.
     */
    private static int getCompressionType(String header) {
        String s = header.trim();
        if (s.equals("") || s.equalsIgnoreCase(C_HEADER_NONE_VAL))
            return NONE;
        else if (s.equalsIgnoreCase(C_HEADER_GZIP_VAL))
            return GZIP;
        else if (s.equalsIgnoreCase(C_HEADER_ZLIB_VAL))
            return ZLIB;
        else
            return -1;
    }

    /**
     * Returns the uncompressed version of the given GZIP'ed bytes. Throws
     * IOException if the data is corrupt.
     */
    private static byte[] uncompressGZIP(byte[] data) throws IOException {
        InflaterInputStream gis = null;
        try {
            gis = new GZIPInputStream(new ByteArrayInputStream(data));
            return readFully(gis);
        } finally {
            IOUtils.close(gis);
        }
    }

    /**
     * Returns the uncompressed version of the given ZLIB'ed bytes. Throws
     * IOException if the data is corrupt.
     */
    private static byte[] uncompressZLIB(byte[] data) throws IOException {
        InflaterInputStream iis = null;
        Inflater inf = null;
        try {
            inf = new Inflater();
            iis = new InflaterInputStream(new ByteArrayInputStream(data), inf);
            return readFully(iis);
        } finally {
            IOUtils.close(iis);
            IOUtils.close(inf);
        }
    }

    private static final int NUM_BYTES_TO_HASH = 100;
    private static final int NUM_TOTAL_HASH = NUM_BYTES_TO_HASH * 3;

    /** Zeroes the first NUM_BYTES_TO_HASH bytes of the chunk buffer. */
    private static void clearHashBytes(byte[] hashBytes) {
        Arrays.fill(hashBytes, 0, NUM_BYTES_TO_HASH, (byte) 0);
    }

    /**
     * Reads the next chunk from the stream into the (zeroed) buffer, feeds
     * the WHOLE buffer to the digest, and verifies that the file still has
     * the expected length.
     *
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if reading fails or the file length changed
     */
    private static int hashNextChunk(InputStream in, MessageDigest md,
                                     byte[] hashBytes, File source,
                                     long expectedLength) throws IOException {
        clearHashBytes(hashBytes);
        int numRead = in.read(hashBytes);
        // the full, zero-padded buffer is hashed on purpose: changing this
        // would alter the hash values produced for existing files
        md.update(hashBytes);
        // if the file changed underneath me, throw away...
        if (source.length() != expectedLength)
            throw new IOException("invalid length");
        return numRead;
    }

    /**
     * Hashes the file using bits and pieces of the file.
     * <p>
     * Files shorter than NUM_TOTAL_HASH bytes are hashed completely in
     * zero-padded chunks of NUM_BYTES_TO_HASH bytes. For larger files only
     * three chunks are hashed: the first chunk, the chunk ending at one
     * third of the file, and the last chunk.
     *
     * @return the SHA hash bytes of the sampled bytes.
     * @throws IOException if hashing failed for any reason, including the
     *  file changing length while it is being read.
     */
    public static byte[] hashFile(File toHash) throws IOException {
        byte[] hashBytes = new byte[NUM_BYTES_TO_HASH];
        FileInputStream fis = new FileInputStream(toHash);
        try {
            MessageDigest md;
            try {
                md = MessageDigest.getInstance("SHA");
            } catch (NoSuchAlgorithmException nsae) {
                throw new IllegalStateException(nsae);
            }

            long fileLength = toHash.length();
            if (fileLength < NUM_TOTAL_HASH) {
                int numRead;
                do {
                    numRead = hashNextChunk(fis, md, hashBytes, toHash, fileLength);
                } while (numRead == NUM_BYTES_TO_HASH);
            } else {
                long thirds = fileLength / 3;

                // beginning input....
                hashNextChunk(fis, md, hashBytes, toHash, fileLength);

                // middle input: the chunk that ends at the one-third mark
                fis.skip(thirds - NUM_BYTES_TO_HASH);
                hashNextChunk(fis, md, hashBytes, toHash, fileLength);

                // ending input: we are at (thirds + NUM_BYTES_TO_HASH) now,
                // move to NUM_BYTES_TO_HASH before the end of the file
                fis.skip(fileLength - (thirds + NUM_BYTES_TO_HASH) - NUM_BYTES_TO_HASH);
                hashNextChunk(fis, md, hashBytes, toHash, fileLength);
            }
            return md.digest();
        } finally {
            fis.close();
        }
    }

    /**
     * Tries to parse <code>integer</code> to an int. If it fails, logs the
     * failure and returns <code>defaultValue</code>.
     */
    public static int parseInteger(String integer, int defaultValue) {
        try {
            return Integer.parseInt(integer);
        } catch (NumberFormatException nfx) {
            LOG.error("Unable to parse number: " + integer, nfx);
            return defaultValue;
        }
    }

    /**
     * Removes every {@code <elementName>text</elementName>} occurrence from
     * <code>input</code>. Only elements whose content contains no nested
     * markup (no {@code <} character) are removed. The element name is
     * matched literally.
     *
     * @return <code>input</code> if element not found in the input
     */
    public static String stripElement(String input, String elementName) {
        // quote the name so that characters such as '.' are not treated as
        // regular expression syntax
        String quotedName = Pattern.quote(elementName);
        return input.replaceAll("<" + quotedName + ">[^<]*</" + quotedName + ">", "");
    }
}