/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Jun 3, 2010 */ package com.bigdata.rdf.lexicon; import java.io.StringReader; import java.util.Iterator; import java.util.Properties; import java.util.UUID; import org.apache.log4j.Logger; import org.openrdf.model.Literal; import com.bigdata.btree.DefaultTupleSerializer; import com.bigdata.btree.IIndex; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.IndexTypeEnum; import com.bigdata.btree.keys.DefaultKeyBuilderFactory; import com.bigdata.btree.keys.IKeyBuilderFactory; import com.bigdata.btree.keys.KeyBuilder; import com.bigdata.btree.raba.codec.SimpleRabaCoder; import com.bigdata.journal.IIndexManager; import com.bigdata.rdf.internal.IV; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.store.AbstractTripleStore; import com.bigdata.search.FullTextIndex; import com.bigdata.search.Hit; import com.bigdata.search.TokenBuffer; /** * Implementation based on the built-in keyword search capabilities for bigdata. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id: BigdataRDFFullTextIndex.java 4709 2011-06-15 16:23:22Z thompsonbry $ */ public class BigdataSubjectCentricFullTextIndex extends FullTextIndex implements ISubjectCentricTextIndexer<Hit> { final private static transient Logger log = Logger .getLogger(BigdataSubjectCentricFullTextIndex.class); /** * The basename of the search index. */ public static final transient String NAME_SUBJ_SEARCH = "subjectSearch"; static public BigdataSubjectCentricFullTextIndex getInstance( final IIndexManager indexManager, final String namespace, final Long timestamp, final Properties properties) { if (namespace == null) throw new IllegalArgumentException(); return new BigdataSubjectCentricFullTextIndex(indexManager, namespace, timestamp, properties); } /** * When true, index datatype literals as well. */ private final boolean indexDatatypeLiterals; public boolean getIndexDatatypeLiterals() { return indexDatatypeLiterals; } /** * @param indexManager * @param namespace * @param timestamp * @param properties */ public BigdataSubjectCentricFullTextIndex(final IIndexManager indexManager, final String namespace, final Long timestamp, final Properties properties) { super(indexManager, namespace, timestamp, properties); /* * Also index datatype literals? */ indexDatatypeLiterals = Boolean .parseBoolean(getProperty( AbstractTripleStore.Options.TEXT_INDEX_DATATYPE_LITERALS, AbstractTripleStore.Options.DEFAULT_TEXT_INDEX_DATATYPE_LITERALS)); } /** * Conditionally registers the necessary index(s). * * @throws IllegalStateException * if the client does not have write access. * * @todo this is not using {@link #acquireExclusiveLock()} since I generally * allocate the text index inside of another relation and * {@link #acquireExclusiveLock()} is not reentrant for zookeeper. */ @Override public void create() { assertWritable(); final String name = getNamespace() + "."+NAME_SUBJ_SEARCH; final IIndexManager indexManager = getIndexManager(); // final IResourceLock resourceLock = acquireExclusiveLock(); // // try { /* * Register a tuple serializer that knows how to unpack the values and * how to extract the bytes corresponding to the encoded text (they can * not be decoded) from key and how to extract the document and field * identifiers from the key. */ final Properties p = getProperties(); final IndexMetadata indexMetadata = new IndexMetadata(indexManager, p, name, UUID.randomUUID(), IndexTypeEnum.BTree); /* * Override the collator strength property to use the configured * value or the default for the text indexer rather than the * standard default. This is done because you typically want to * recognize only Primary differences for text search while you * often want to recognize more differences when generating keys for * a B+Tree. * * Note: The choice of the language and country for the collator * should not matter much for this purpose since the total ordering * is not used except to scan all entries for a given term, so the * relative ordering between terms does not matter. */ final IKeyBuilderFactory keyBuilderFactory; { final Properties tmp = new Properties(p); tmp.setProperty(KeyBuilder.Options.STRENGTH, p.getProperty( Options.INDEXER_COLLATOR_STRENGTH, Options.DEFAULT_INDEXER_COLLATOR_STRENGTH)); keyBuilderFactory = new DefaultKeyBuilderFactory(tmp); } final boolean fieldsEnabled = Boolean.parseBoolean(p .getProperty(Options.FIELDS_ENABLED, Options.DEFAULT_FIELDS_ENABLED)); if (log.isInfoEnabled()) log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled); // final boolean doublePrecision = Boolean.parseBoolean(p // .getProperty(Options.DOUBLE_PRECISION, // Options.DEFAULT_DOUBLE_PRECISION)); // // if (log.isInfoEnabled()) // log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision); /* * FIXME Optimize. SimpleRabaCoder will be faster, but can do better * with record aware coder. */ indexMetadata.setTupleSerializer(new RDFFullTextIndexTupleSerializer( keyBuilderFactory,// DefaultTupleSerializer.getDefaultLeafKeysCoder(),// // DefaultTupleSerializer.getDefaultValuesCoder(),// SimpleRabaCoder.INSTANCE, fieldsEnabled )); indexManager.registerIndex(indexMetadata); if (log.isInfoEnabled()) log.info("Registered new subject-centric text index: name=" + name); /* * Note: defer resolution of the index. */ // ndx = getIndex(name); // } finally { // // unlock(resourceLock); // // } } /** * The full text index is currently located in the same namespace as the * lexicon relation. However, the distributed zookeeper locks (ZLocks) * are not reentrant. Therefore this method is overridden to NOT acquire * the ZLock for the namespace of the relation when destroying the full * text index -- that lock is already held for the same namespace by the * {@link LexiconRelation}. */ @Override public void destroy() { if (log.isInfoEnabled()) log.info(""); assertWritable(); final String name = getNamespace() + "." + NAME_SUBJ_SEARCH; getIndexManager().dropIndex(name); } /** * The backing index. */ volatile private IIndex ndx; /** * The index used to associate term identifiers with tokens parsed from * documents. */ public IIndex getIndex() { if(ndx == null) { synchronized (this) { ndx = getIndex(getNamespace() + "." + NAME_SUBJ_SEARCH); if (ndx == null) throw new IllegalStateException(); } } return ndx; } public void index(final IV<?,?> subject, final Iterator<BigdataValue> valuesIterator) { if (subject == null) { throw new IllegalArgumentException(); } if (log.isDebugEnabled()) { log.debug("indexing: " + subject); } /* * We can use a capacity of one, because we will be indexing exactly * one subject. */ final TokenBuffer<?> buffer = new TokenBuffer(1, this); int n = 0; while (valuesIterator.hasNext()) { final BigdataValue val = valuesIterator.next(); if (log.isDebugEnabled()) { log.debug("value: " + val); } if (!(val instanceof Literal)) { /* * Note: If you allow URIs to be indexed then the code which is * responsible for free text search for quads must impose a * filter on the subject and predicate positions to ensure that * free text search can not be used to materialize literals or * URIs from other graphs. This matters when the named graphs * are used as an ACL mechanism. This would also be an issue if * literals were allowed into the subject position. */ continue; } final Literal lit = (Literal) val; if (!indexDatatypeLiterals && lit.getDatatype() != null) { // do not index datatype literals in this manner. continue; } final String languageCode = lit.getLanguage(); // Note: May be null (we will index plain literals). // if(languageCode==null) continue; final String text = lit.getLabel(); /* * Note: The OVERWRITE option is turned off to avoid some of the * cost of re-indexing each time we see a term. */ // // don't bother text indexing inline values for now // if (termId.isInline()) { // continue; // } index(buffer, subject, 0/* fieldId */, languageCode, new StringReader(text)); n++; } // flush writes to the text index. buffer.flush(); if (log.isInfoEnabled()) log.info("indexed " + n + " new values for s: " + subject); } }