package com.ontology2.bakemono.util; import com.google.common.base.CharMatcher; import org.apache.commons.codec.digest.DigestUtils; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; // // Functions in this class implement transformations on identifiers that // do not require a live knowledge base and that don't depend on // a particular RDF library. // public class StatelessIdFunctions { // // Freebase guids and mids actually represent sequential integers; these // functions transform them to and from integers so we can convert // between them and sort Freebase identifiers in the order they were // inserted // private static String b32digits="0123456789bcdfghjklmnpqrstvwxyz_"; private static String bkNs="http://rdf.basekb.com/ns/"; public static long midToLong(String mid) { long value=0; if(!mid.startsWith("/m/0")) { throw new IllegalArgumentException("ill-formed mid ["+mid+"]"); } for(int i=4;i<mid.length();i++) { String c=mid.substring(i,i+1); int digitValue= b32digits.indexOf(c); if (digitValue==-1) { throw new IllegalArgumentException("ill-formed mid ["+mid+"]"); } value = value << 5; value = value | digitValue; } return value; } public static String longToGuid(long l) { return "#9202a8c04000641f"+Long.toHexString(l | 0x8000000000000000l); } public static String midToGuid(String mid) { return longToGuid(midToLong(mid)); } public static long guidToLong(String guid) { String guidPrefix="#9202a8c04000641f8"; if (!guid.startsWith(guidPrefix)) { throw new IllegalArgumentException("Guid ["+guid+"] does not start with valid prefix"); } String guidDigits=guid.substring(guidPrefix.length()); if (15!=guidDigits.length()) { throw new IllegalArgumentException("Guid ["+guid+"] has wrong number of digits"); } return Long.parseLong(guidDigits,16); }; public static String longToMid(long numericId) { StringBuffer sb=new StringBuffer(16); while(numericId>0) { int digit=(int) (numericId % 32); sb.append(b32digits.charAt(digit)); numericId=numericId/32; } sb.append("0/m/"); return sb.reverse().toString(); }; public static String guidToMid(String guid) { return longToMid(guidToLong(guid)); } public static String toBkUri(String fbId) { if ("".equals(fbId)) return bkNs; return bkNs + fbId.substring(1).replace('/', '.'); } // ---------------------------------------------------- public static String iriEscape(String key){ return new IRIEscaper().escape(key); } public static String dbpediaEscape(String key) { return new DBpediaEscaper().escape(key); } public static class IRIEscaper { StringBuffer out; public String escape(String key){ out=new StringBuffer(); final int length = key.length(); for (int offset = 0; offset < length; ) { final int codepoint = key.codePointAt(offset); transformChar(codepoint); offset += Character.charCount(codepoint); } return out.toString(); } private void transformChar(int cp) { char[] rawChars=Character.toChars(cp); if(acceptChar(rawChars,cp)) { out.append(Character.toChars(cp)); } else { percentEncode(rawChars); } } private void percentEncode(char[] rawChars) { try { byte[] bytes=new String(rawChars).getBytes("UTF-8"); for(byte b:bytes) { out.append('%'); out.append(byteToHex(b)); } } catch(UnsupportedEncodingException ex) { throw new RuntimeException(ex); } } static String byteToHex(byte b) { String padded="00"+Integer.toHexString(0x00FF & (int) b).toUpperCase(); return padded.substring(padded.length()-2); } // // this code should implement the 'ipchar' production from // // http://www.apps.ietf.org/rfc/rfc3987.html // protected boolean acceptChar(char[] chars,int cp) { if(chars.length==1) { char c=chars[0]; if(Character.isLetterOrDigit(c)) return true; if(c=='-' || c=='.' || c=='_' || c=='~') return true; if(c=='!' || c=='$' || c=='&' || c=='\'' || c=='(' || c==')' || c=='*' || c=='+' || c==',' || c==';' || c=='=' || c== ':' || c=='@') return true; if (cp<0xA0) return false; } if(cp>=0xA0 && cp<=0xD7FF) return true; if(cp>=0xF900 && cp<=0xFDCF) return true; if(cp>=0xFDF0 && cp<=0xFFEF) return true; if (cp>=0x10000 && cp<=0x1FFFD) return true; if (cp>=0x20000 && cp<=0x2FFFD) return true; if (cp>=0x30000 && cp<=0x3FFFD) return true; if (cp>=0x40000 && cp<=0x4FFFD) return true; if (cp>=0x50000 && cp<=0x5FFFD) return true; if (cp>=0x60000 && cp<=0x6FFFD) return true; if (cp>=0x70000 && cp<=0x7FFFD) return true; if (cp>=0x80000 && cp<=0x8FFFD) return true; if (cp>=0x90000 && cp<=0x9FFFD) return true; if (cp>=0xA0000 && cp<=0xAFFFD) return true; if (cp>=0xB0000 && cp<=0xBFFFD) return true; if (cp>=0xC0000 && cp<=0xCFFFD) return true; if (cp>=0xD0000 && cp<=0xDFFFD) return true; if (cp>=0xE1000 && cp<=0xEFFFD) return true; return false; } } public static class DBpediaEscaper extends IRIEscaper { // I looked at all the characters that actually appear in DBpedia 3.9 keys and removed the // % escape character. Note that this seems to be the same as what acceptChar() accepts above // in the range 0..127, which is of course what it should be final String observedCharacters="!$&'()*+,-.0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; final CharMatcher cm=CharMatcher.anyOf(observedCharacters); protected boolean acceptChar(char[] chars,int cp) { if(chars.length==1) { return cm.matches(chars[0]); } return false; } } public static String unescapeKey(String key) { return new Unescaper().unescape(key); } private static class Unescaper { StringBuffer out; StringBuffer hexbytes; int state=0; public String unescape(String key) { out=new StringBuffer(key.length()); for(int i=0;i<key.length();i++) { processChar(key.charAt(i)); } return out.toString(); } private void processChar(char charAt) { if (state==0) { if('$'==charAt) { state=1; hexbytes=new StringBuffer(); } else { out.append(charAt); } } else { hexbytes.append(charAt); if (state==4) { int codepoint=Integer.parseInt(hexbytes.toString(),16); char[] specialChar=Character.toChars(codepoint); out.append(specialChar); state=0; } else { state++; } } } } public static int hashRawMid(String mid,int modulus) { byte[] hashResult=DigestUtils.md5(mid); long hashInt = hashArrayToInt(hashResult); return (int) Math.abs(hashInt % modulus); } public static long hashArrayToInt(byte[] hashResult) { return ByteBuffer.wrap(hashResult).getLong(); } }