NodecSSE.java example

Explorer
jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.tdb.store.nodetable;

import java.nio.ByteBuffer ;

import org.apache.jena.atlas.io.BlockUTF8 ;
import org.apache.jena.atlas.lib.StrUtils ;
import org.apache.jena.graph.Node ;
import org.apache.jena.graph.NodeFactory ;
import org.apache.jena.riot.RiotException ;
import org.apache.jena.riot.out.NodeFmtLib ;
import org.apache.jena.riot.system.PrefixMap ;
import org.apache.jena.riot.system.PrefixMapNull ;
import org.apache.jena.riot.tokens.Token ;
import org.apache.jena.riot.tokens.Tokenizer ;
import org.apache.jena.riot.tokens.TokenizerFactory ;
import org.apache.jena.riot.web.LangTag ;
import org.apache.jena.shared.PrefixMapping ;
import org.apache.jena.sparql.util.NodeUtils ;
import org.apache.jena.tdb.TDBException ;
import org.apache.jena.tdb.lib.StringAbbrev ;

/** Simple encoder/decoder for nodes that uses Turtle term string encoding. */

public class NodecSSE implements Nodec
{
    private static boolean SafeChars = false ;
    // Characters in IRIs that are illegal and cause SSE problems, but we wish to keep.
    final private static char MarkerChar = '_' ;
    final private static char[] invalidIRIChars = { MarkerChar , ' ' } ; 
    
    public NodecSSE() {}
    
    @Override
    public int maxSize(Node node)
    {
        return maxLength(node) ;
    }

    private static final PrefixMap pmap0 = PrefixMapNull.empty ;
    private static final boolean onlySafeBNodeLabels = false ;
    @Override
    public int encode(Node node, ByteBuffer bb, PrefixMapping pmap)
    {
        String str = null ;

        if ( node.isURI() ) 
        {
            // Pesky spaces etc
            String x = StrUtils.encodeHex(node.getURI(), MarkerChar, invalidIRIChars) ;
            if ( x != node.getURI() )
                node = NodeFactory.createURI(x) ; 
        }
        
        if ( node.isLiteral() && NodeUtils.isLangString(node) )
        {
            // Check syntactically valid.
            String lang = node.getLiteralLanguage() ;
            if ( ! LangTag.check(lang) )
                throw new TDBException("bad language tag: "+node) ;
        }
        
        if ( node.isBlank() && ! onlySafeBNodeLabels ) {
            // Special case.
            str = "_:"+node.getBlankNodeLabel() ;
        }
        
        // Node->String
        if ( str == null )
            str = NodeFmtLib.str(node, (String)null, pmap0) ;
        // String -> bytes ;
        BlockUTF8.fromChars(str, bb) ;
        bb.flip() ;
        return bb.limit() ;
    }

    @Override
    public Node decode(ByteBuffer bb, PrefixMapping pmap)
    {
        // Ideally, this would be straight from the byte buffer.
        // But currently we go bytes -> string -> node 

        // Byte -> String
        String str = BlockUTF8.toString(bb) ;
        //OLD  
        //String str = Bytes.fromByteBuffer(bb) ;
        // String -> Node
        
        // Easy cases.
        if ( str.startsWith("_:") )   
        {
            // Must be done this way.
            // In particular, bnode labels can contain ":" from Jena
            // TokenizerText does not recognize these.
            str = str.substring(2) ;
            return NodeFactory.createBlankNode(str) ;
        }

        if ( str.startsWith("<") )
        {
            // Do directly.
            // (is it quicker?)
            str = str.substring(1,str.length()-1) ;
            str = StrUtils.unescapeString(str) ;
            str = StrUtils.decodeHex(str, MarkerChar) ;
            return NodeFactory.createURI(str) ;
        }

        Tokenizer tokenizer = TokenizerFactory.makeTokenizerString(str) ;
        if ( ! tokenizer.hasNext() )
            throw new TDBException("Failed to tokenise: "+str) ;
        Token t = tokenizer.next() ;

        try {
            Node n = t.asNode() ;
            if ( n == null ) throw new TDBException("Not a node: "+str) ;
            return n ;
        } catch (RiotException ex)
        {
            throw new TDBException("Bad string for node: "+str) ;
        }
    }

    // Over-estimate the length of the encoding.
    private static int maxLength(Node node)
    {
        if ( node.isBlank() )
            // "_:"
            return 2+maxLength(node.getBlankNodeLabel()) ;    
        if ( node.isURI() )
            // "<>"
            return 2+maxLength(node.getURI()) ;
        if ( node.isLiteral() )
        {
            int len = 2+maxLength(node.getLiteralLexicalForm()) ;
            if ( NodeUtils.isLangString(node) )
                // Space for @ (language tag is ASCII)
                len = len + 3 + node.getLiteralLanguage().length() ;
            else if ( ! NodeUtils.isSimpleString(node) )
                // The quotes and also space for ^^<>
                len = len + 4 + maxLength(node.getLiteralDatatypeURI()) ;
            return len ;
        }
        if ( node.isVariable() )
            // "?"
            return 1+maxLength(node.getName()) ;
        throw new TDBException("Unrecognized node type: "+node) ;
    }

    private static int maxLength(String string)
    {
        // Very worse case for UTF-8 - and then some.
        // Encoding every character as _XX or bad UTF-8 conversion (3 bytes)
        // Max 3 bytes UTF-8 for up to 10FFFF (NB Java treats above 16bites as surrogate pairs only). 
        return string.length()*3 ;
    }
    
    // URI compression can be effective but literals are more of a problem.  More variety. 
    public final static boolean compression = false ; 
    private static StringAbbrev abbreviations = new StringAbbrev() ;
    static {
        abbreviations.add(  "rdf",      "<http://www.w3.org/1999/02/22-rdf-syntax-ns#") ;
        abbreviations.add(  "rdfs",     "<http://www.w3.org/2000/01/rdf-schema#") ;
        abbreviations.add(  "xsd",      "<http://www.w3.org/2001/XMLSchema#") ;

        // MusicBrainz
        abbreviations.add(  "mal",      "<http://musicbrainz.org/mm-2.1/album/") ;
        abbreviations.add(  "mt",       "<http://musicbrainz.org/mm-2.1/track/") ;
        abbreviations.add(  "mar",      "<http://musicbrainz.org/mm-2.1/artist/") ;
        abbreviations.add(  "mtr",      "<http://musicbrainz.org/mm-2.1/trmid/") ;
        abbreviations.add(  "mc",       "<http://musicbrainz.org/mm-2.1/cdindex/") ;

        abbreviations.add(  "m21",      "<http://musicbrainz.org/mm/mm-2.1#") ;
        abbreviations.add(  "dc",       "<http://purl.org/dc/elements/1.1/") ;
        // DBPedia
        abbreviations.add(  "r",        "<http://dbpedia/resource/") ;
        abbreviations.add(  "p",        "<http://dbpedia/property/") ;
    }
    private String compress(String str)
    {
        if ( !compression || abbreviations == null ) return str ;
        return abbreviations.abbreviate(str) ;
    }

    private String decompress(String x)
    {
        if ( !compression || abbreviations == null ) return x ;
        return abbreviations.expand(x) ;
    }

}