/*
 * This software is Copyright 2005,2006,2007,2008 Langdale Consultants.
 * Langdale Consultants can be contacted at: http://www.langdale.com.au
 */
package au.com.langdale.splitmodel;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;

import au.com.langdale.kena.ConversionException;
import au.com.langdale.kena.Injector;

import com.hp.hpl.jena.rdf.model.impl.Util;

/**
 * Write a split model. A split model represents a very large RDF graph with
 * a set of Turtle files.
 * <p>
 * Each statement is appended to the file selected by hashing its subject and,
 * for object properties other than rdf:type, also to the file selected by
 * hashing its object. A bounded number of file writers is kept open at once;
 * files are reopened in append mode when they are needed again.
 * <p>
 * The fields <code>destin</code> and <code>modulus</code>, the helpers
 * <code>getFile</code> and <code>hashURI</code>, and the vocabulary constants
 * used here are not declared in this class; presumably they are inherited
 * from {@link SplitBase}.
 */
public class SplitWriter extends SplitBase implements Injector {

	public static final int DEFAULT_MODULUS = 128;
	public static final int DEFAULT_QUOTA = 128;

	// Literals are always written as Turtle long strings so that embedded
	// line breaks need no escaping.
	private static final String QUOTE = "\"\"\"";
	private static final CharSequence ESCAPE = "\\";
	private static final CharSequence QUOTE_MARK = "\"";
	private static final CharSequence ESCAPE_ESCAPE = "\\\\";
	private static final CharSequence ESCAPE_QUOTE_MARK = "\\\"";

	// Local names that may be written in prefixed (abbreviated) form.
	private static final Pattern NCNAME_REGEX = Pattern.compile("[A-Za-z_][A-Za-z0-9-_.]*");
	private static final Random random = new Random();

	/** Namespace for anonymous nodes, made distinct per writer by a random hex component. */
	private final String local = LOCAL + Integer.toHexString(random.nextInt()) + "#";

	/** Maps namespace to prefix; insertion ordered so file headers are deterministic. */
	private final Map<String, String> spaces;
	/** Maps prefix to namespace. */
	private final Map<String, String> prefixes;
	/** Open writers indexed by file key; a null entry means the file is not open. */
	private final Writer[] cache;
	/** Threshold of open writers above which {@link #evict()} starts closing them. */
	private final int quota;

	// sweep: position of the eviction scan; active: count of non-null cache
	// entries; sequ: counter for generated anonymous node ids.
	private int sweep, active, sequ = 0x1000;
	private String base;
	// freeze: set once any file has been opened, after which base, modulus and
	// prefixes can no longer change; imprinted: set once file 0 has been opened.
	private boolean freeze, imprinted;

	/**
	 * Initialise with all parameters.
	 * Any existing files of the split model in the destination are deleted.
	 *
	 * @param destin the pathname of the directory containing the split model
	 * @param base the base namespace
	 * @param modulus the number of files to use. This parameter should be proportional to model size.
	 * @param quota the number of files to keep memory resident.
	 * @throws IllegalArgumentException if modulus is less than 2
	 */
	public SplitWriter(String destin, String base, int modulus, int quota) {
		this.destin = new File(destin);
		this.quota = quota;
		setBase(base);
		setModulus(modulus);
		cache = new Writer[modulus];
		spaces = new LinkedHashMap<String, String>();
		prefixes = new HashMap<String, String>();
		setPrefix("local", local);
		setPrefix("split", SPLITMODEL);
		setPrefix("xsd", XSD_URI);
		clear();
	}

	/**
	 * Initialise with base namespace (recommended). URIs that
	 * start with the base namespace will be abbreviated in storage.
	 *
	 * @param destin the pathname of the directory containing the split model
	 * @param base the base namespace.
	 */
	public SplitWriter(String destin, String base) {
		this(destin, base, DEFAULT_MODULUS, DEFAULT_QUOTA);
	}

	/**
	 * Initialise using a base namespace derived from the file URI of the
	 * destination directory.
	 *
	 * @param destin the pathname of the directory containing the split model
	 */
	public SplitWriter(String destin) {
		this(destin, new File(destin).toURI().toString());
	}

	/**
	 * Create a writer for a nested model stored in a subdirectory of the
	 * parent's destination. Base, modulus, quota and prefixes are copied from
	 * the parent, except that the "local" prefix is rebound to this writer's
	 * own anonymous-node namespace.
	 */
	private SplitWriter(SplitWriter parent, String name) {
		destin = new File(parent.destin, name);
		quota = parent.quota;
		setBase(parent.base);
		setModulus(parent.modulus);
		cache = new Writer[modulus];
		prefixes = new HashMap<String, String>(parent.prefixes);
		// LinkedHashMap, as in the public constructor, so that the prefix
		// declarations are written in a stable order.
		spaces = new LinkedHashMap<String, String>(parent.spaces);
		removePrefix("local");
		setPrefix("local", local);
		clear();
	}

	/**
	 * Delete any files left over from a previous model in the destination.
	 */
	private void clear() {
		for (int ix = 0; ix < modulus; ix++) {
			File f = getFile(ix);
			if (f.exists())
				f.delete();
		}
	}

	/**
	 * Create a writer for a nested (quoted) model. The nested model is stored
	 * in a subdirectory named after the local part of the given URI.
	 *
	 * @param node the node URI, which must be a String
	 * @see au.com.langdale.kena.Injector#createQuote
	 */
	public Injector createQuote(Object node) {
		String uri = (String) node;
		return new SplitWriter(this, uri.substring(Util.splitNamespace(uri)));
	}

	/**
	 * @return the base namespace, which always ends with '#'
	 * @see au.com.langdale.kena.Injector#getBase()
	 */
	public String getBase() {
		return base;
	}

	/**
	 * Set the base namespace, provided no statements have been added.
	 * A '#' is appended if the given namespace does not already end with one.
	 * The call is silently ignored once any file has been opened.
	 */
	public void setBase(String base) {
		if (freeze)
			return;
		if (!base.endsWith("#"))
			base += "#";
		this.base = base;
	}

	/**
	 * Set the modulus, or number of files, provided no statements have been added.
	 * The call is silently ignored once any file has been opened.
	 *
	 * @param modulus the number of files, at least 2
	 * @throws IllegalArgumentException if modulus is less than 2
	 */
	public void setModulus(int modulus) {
		if (freeze)
			return;
		if (modulus < 2)
			throw new IllegalArgumentException("split model modulus must be 2 or greater");
		this.modulus = modulus;
	}

	/**
	 * Declare a namespace prefix. The declaration is silently ignored if any
	 * file has been opened, if the namespace does not start with "http:" and
	 * end with "#", or if either the prefix or the namespace is already declared.
	 *
	 * @see au.com.langdale.kena.Injector#setPrefix(java.lang.String, java.lang.String)
	 */
	public void setPrefix(String prefix, String namespace) {
		if (freeze)
			return;
		if (!namespace.startsWith("http:") || !namespace.endsWith("#"))
			return;
		if (prefixes.containsKey(prefix) || spaces.containsKey(namespace))
			return;
		spaces.put(namespace, prefix);
		prefixes.put(prefix, namespace);
	}

	/**
	 * Remove a prefix declaration and its namespace, unless frozen.
	 */
	private void removePrefix(String prefix) {
		if (freeze)
			return;
		String ns = prefixes.remove(prefix);
		if (ns != null)
			spaces.remove(ns);
	}

	/**
	 * Obtain the writer for the given file key, opening the file (after
	 * making room among the open writers) if it is not currently open.
	 */
	private Writer getWriter(int key) throws IOException {
		Writer result = cache[key];
		if (result == null) {
			evict();
			result = open(key);
		}
		return result;
	}

	/**
	 * Open the file for the given key in append mode and cache its writer.
	 * A newly created file receives the header; an existing file receives a
	 * blank separator line. Opening any file freezes base, modulus and prefixes.
	 */
	private Writer open(int key) throws IOException {
		File target = getFile(key);
		destin.mkdirs();
		boolean isnew = target.createNewFile();
		Writer result = new OutputStreamWriter(
				new BufferedOutputStream(new FileOutputStream(target, true)), "UTF-8");

		boolean ready = false;
		try {
			if (isnew)
				init(result, key);
			else
				result.write("\n");
			ready = true;
		} finally {
			// do not leak the file handle if the header could not be written
			if (!ready) {
				try {
					result.close();
				} catch (IOException ignored) {
					// the original failure is the one worth reporting
				}
			}
		}

		cache[key] = result;
		active++;
		freeze = true;
		imprinted = imprinted || key == 0;
		return result;
	}

	/**
	 * Write the file header: the prefix declarations followed by two
	 * statements recording the file's key and the modulus.
	 */
	private void init(Writer result, int key) throws IOException {
		// The default prefix stands in for @base, which is not supported in Jena 2.5.3
		result.write("@prefix : <" + base + "> .\n");

		Iterator<Map.Entry<String, String>> it = spaces.entrySet().iterator();
		while (it.hasNext()) {
			Map.Entry<String, String> entry = it.next();
			String namespace = entry.getKey();
			String prefix = entry.getValue();
			result.write("@prefix " + prefix + ": <" + namespace + "> .\n");
		}
		result.write("\n");

		try {
			result.write(createStatement(DOCUMENT, HASH, createSymbol(Integer.toString(key), XSD_INTEGER_URI)));
			result.write(createStatement(DOCUMENT, MODULUS, createSymbol(Integer.toString(modulus), XSD_INTEGER_URI)));
		} catch (ConversionException e) {
			throw new Error(e);
		}
		result.write("\n");
	}

	/**
	 * Close every open file and make sure file 0 exists, creating it with
	 * just a header if nothing was ever written to it.
	 * <p>
	 * All writers are closed even if one of them fails; the first failure is
	 * then rethrown. Afterwards no closed writer remains cached.
	 *
	 * @throws IOException the first error raised while closing or creating a file
	 * @see au.com.langdale.kena.Injector#close()
	 */
	public void close() throws IOException {
		IOException failure = null;

		for (int ix = 0; ix < cache.length; ix++) {
			Writer target = cache[ix];
			if (target != null) {
				cache[ix] = null; // never hand out a closed writer later
				try {
					target.close();
				} catch (IOException e) {
					if (failure == null)
						failure = e;
				}
			}
		}
		active = 0;

		if (!imprinted) {
			// ensure file 0 is always present
			try {
				Writer first = open(0);
				cache[0] = null;
				active--;
				first.close();
			} catch (IOException e) {
				if (failure == null)
					failure = e;
			}
		}

		if (failure != null)
			throw failure;
	}

	/**
	 * Render a URI as a Turtle term. A URI in the base namespace or in a
	 * declared namespace is abbreviated when its local name matches
	 * {@link #NCNAME_REGEX}; otherwise it is written in angle brackets.
	 *
	 * @throws ConversionException if the URI must be written in full and contains '{'
	 */
	private String createSymbol(String uri) throws ConversionException {
		if (uri.startsWith(base)) {
			String name = uri.substring(base.length());
			if (NCNAME_REGEX.matcher(name).matches())
				return ":" + name;
		} else {
			int ix = uri.lastIndexOf('#') + 1;
			if (ix > 0 && ix < uri.length()) {
				String namespace = uri.substring(0, ix);
				String prefix = spaces.get(namespace);
				if (prefix != null) {
					String name = uri.substring(ix);
					if (NCNAME_REGEX.matcher(name).matches())
						return prefix + ":" + name;
				}
			}
		}
		// NOTE(review): only '{' is rejected here; other characters that are
		// not legal inside <...> pass through unchanged - confirm intent.
		if (uri.contains("{"))
			throw new ConversionException();
		return "<" + uri + ">";
	}

	/**
	 * Render a literal as a Turtle long string, with an optional datatype.
	 * Backslashes are escaped first, then double quotes.
	 *
	 * @param lex the lexical form
	 * @param type the datatype URI, or null for a plain literal
	 */
	private String createSymbol(String lex, String type) throws ConversionException {
		String suffix;
		if (type != null)
			suffix = "^^" + createSymbol(type);
		else
			suffix = "";

		String escaped = lex
				.replace(ESCAPE, ESCAPE_ESCAPE)
				.replace(QUOTE_MARK, ESCAPE_QUOTE_MARK);

		return QUOTE + escaped + QUOTE + suffix;
	}

	/**
	 * Render one statement line.
	 *
	 * @param subj the subject URI
	 * @param pred the predicate URI
	 * @param obj the object, already rendered as a Turtle term
	 */
	private String createStatement(String subj, String pred, String obj) throws ConversionException {
		return createSymbol(subj) + " " + createSymbol(pred) + " " + obj + " .\n";
	}

	/**
	 * Close open writers, scanning the cache round-robin, until no more than
	 * <code>quota</code> remain open. The caller then opens one more.
	 */
	private void evict() throws IOException {
		// 'active > 0' stops the scan from spinning forever when quota is
		// negative and there is nothing left to close
		while (active > 0 && active > quota) {
			sweep = (sweep + 1) % modulus;
			Writer target = cache[sweep];
			if (target != null) {
				// update the bookkeeping first so it stays consistent if close() fails
				cache[sweep] = null;
				active--;
				target.close();
			}
		}
	}

	/**
	 * Create an anonymous node in this writer's local namespace.
	 *
	 * @param id the node id, or null to generate one from a counter
	 * @return the node URI as a String
	 * @see au.com.langdale.kena.Injector#createAnon(java.lang.String)
	 */
	public Object createAnon(String id) throws ConversionException {
		if (id != null)
			return local + id;
		else
			return local + "_" + Integer.toHexString(sequ++);
	}

	/**
	 * Create a named node, which is represented by its URI string.
	 *
	 * @see au.com.langdale.kena.Injector#createNamed(java.lang.String)
	 */
	public Object createNamed(String uri) throws ConversionException {
		return uri;
	}

	/**
	 * Create a literal, represented by its rendered Turtle term.
	 * The <code>lang</code> and <code>isXML</code> arguments are not used.
	 *
	 * @see au.com.langdale.kena.Injector#createLiteral
	 */
	public Object createLiteral(String value, String lang, String type, boolean isXML) throws ConversionException {
		return createSymbol(value, type);
	}

	/**
	 * Add a statement whose object is a resource. The statement is appended
	 * to the file selected by the subject and, unless the predicate is
	 * rdf:type or both select the same file, also to the file selected by
	 * the object.
	 *
	 * @param subj the subject URI, a String produced by createNamed or createAnon
	 * @param pred the predicate URI
	 * @param obj the object URI, a String produced by createNamed or createAnon
	 * @see au.com.langdale.kena.Injector#addObjectProperty(java.lang.Object, java.lang.String, java.lang.Object)
	 */
	public void addObjectProperty(Object subj, String pred, Object obj) throws IOException, ConversionException {
		String s = (String) subj;
		String o = (String) obj;
		String stmnt = createStatement(s, pred, createSymbol(o));

		int subjKey = hashURI(s);
		getWriter(subjKey).write(stmnt);

		if (!pred.equals(RDF_TYPE_URI)) {
			int objKey = hashURI(o);
			if (subjKey != objKey)
				getWriter(objKey).write(stmnt);
		}
	}

	/**
	 * Add a statement whose object is a literal. The statement is appended
	 * to the file selected by the subject only.
	 *
	 * @param subj the subject URI, a String produced by createNamed or createAnon
	 * @param pred the predicate URI
	 * @param obj the rendered literal, a String produced by createLiteral
	 * @see au.com.langdale.kena.Injector#addDatatypeProperty(java.lang.Object, java.lang.String, java.lang.Object)
	 */
	public void addDatatypeProperty(Object subj, String pred, Object obj) throws IOException, ConversionException {
		String s = (String) subj;
		String o = (String) obj;
		String stmnt = createStatement(s, pred, o);

		int subjKey = hashURI(s);
		getWriter(subjKey).write(stmnt);
	}
}