/******************************************************************************* * Copyright (c) 2004, 2007 IBM Corporation and Cambridge Semantics Incorporated. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * File: $Source: /cvsroot/slrp/boca/com.ibm.adtech.boca.model.indexer.lucene/src/com/ibm/adtech/boca/model/indexer/lucene/ModelIndexer.java,v $ * Created by: Wing Yung ( <a href="mailto:wingyung@us.ibm.com">wingyung@us.ibm.com </a>) * Created on: 10/11/2005 * Revision: $Id: ModelIndexer.java 161 2007-07-31 14:11:06Z mroy $ * * Contributors: * IBM Corporation - initial API and implementation * Cambridge Semantics Incorporated - Fork to Anzo *******************************************************************************/ package org.openanzo.datasource.nodecentric.indexer; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Map; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.store.FSDirectory; import org.openanzo.datasource.nodecentric.internal.NodeCentricDatasource; import org.openanzo.datasource.nodecentric.internal.NodeCentricOperationContext; import org.openanzo.datasource.nodecentric.internal.StatementWrapper; import org.openanzo.datasource.nodecentric.sql.StatementRdbWrapper; import org.openanzo.datasource.nodecentric.sql.StatementRdbWrapper.FindLiteralStatementsNRRangeResult; import org.openanzo.datasource.nodecentric.sql.StatementRdbWrapper.FindLiteralStatementsRangeResult; import org.openanzo.datasource.nodecentric.sql.StatementRdbWrapper.FindMinMaxIdResult; import org.openanzo.exceptions.AnzoException; import org.openanzo.exceptions.ExceptionConstants; import org.openanzo.exceptions.LogUtils; import org.openanzo.indexer.IndexerException; import org.openanzo.indexer.lucene.LuceneConstants; import org.openanzo.indexer.lucene.LuceneIndexerBase; import org.openanzo.rdf.Literal; import org.openanzo.rdf.PlainLiteral; import org.openanzo.rdf.Resource; import org.openanzo.rdf.TypedLiteral; import org.openanzo.rdf.URI; import org.openanzo.rdf.Value; import org.openanzo.rdf.Constants.INDEXER; import org.openanzo.rdf.vocabulary.XMLSchema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Indexer for Anzo models based on Lucene. * * @author Wing Yung ( <a href="mailto:wingyung@us.ibm.com">wingyung@us.ibm.com </a>) */ public class ModelIndexer extends LuceneIndexerBase<StatementWrapper, NodeCentricDatasource> { private static final Logger log = LoggerFactory.getLogger(ModelIndexer.class.getName()); /** * Purge a namedGraph from index * * @param graphId * id of graph to purge * @throws IndexerException */ public void purgeNamedGraph(Long graphId) throws IndexerException { try { IndexReader reader = IndexReader.open(FSDirectory.open(new File(location)), true); reader.deleteDocuments(new Term(LuceneConstants.INDEXER_FIELD_GRAPH_ID, graphId.toString())); reader.close(); } catch (IOException ioe) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_DELETE_DOC, ioe); } } public boolean index(StatementWrapper statement) throws IndexerException { if (indexWriter != null) { Document doc = ModelIndexer.createDocument(statement); if (doc != null) { addDocument(doc); return true; } } return false; } public void remove(StatementWrapper statement) throws IndexerException { if (indexWriter != null) { deleteDocuments(new Term(LuceneConstants.INDEXER_FIELD_STMT_ID, statement.getId())); } } /** * Create a new IndexDocument for the statement provided * * @param statementWrapper * index wrapper around an Anzo Statement * @return the indexer's document for this statement */ private static Document createDocument(StatementWrapper statementWrapper) { Document doc = null; Value objNode = statementWrapper.getObject(); boolean index = false; if (objNode instanceof Literal) { Literal literal = (Literal) objNode; String text = null; if (literal instanceof PlainLiteral) { index = true; text = literal.getLabel(); } else if (literal instanceof TypedLiteral) { URI type = ((TypedLiteral) literal).getDatatypeURI(); if (type.equals(XMLSchema.STRING)) { index = true; text = literal.getLabel(); } else { // It's not an indexable type. // Try index numbers or booleans? } } if (index) { doc = new Document(); doc.add(new Field(LuceneConstants.INDEXER_FIELD_OBJECT, text, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(INDEXER.INDEXER_FIELD_PREDICATE, statementWrapper.getPredicate().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(INDEXER.INDEXER_FIELD_SUBJECT, statementWrapper.getSubject().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(LuceneConstants.INDEXER_FIELD_PREDICATE_ID, statementWrapper.getPredicateId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(LuceneConstants.INDEXER_FIELD_SUBJECT_ID, statementWrapper.getSubjectId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(INDEXER.INDEXER_FIELD_GRAPH_URI, statementWrapper.getGraphURI().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(LuceneConstants.INDEXER_FIELD_GRAPH_ID, statementWrapper.getGraphId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(LuceneConstants.INDEXER_FIELD_OBJ_NODE_ID, statementWrapper.getObjectId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); String id = statementWrapper.getId(); doc.add(new Field(LuceneConstants.INDEXER_FIELD_STMT_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED)); if (statementWrapper.getModified() != null) { doc.add(new Field(LuceneConstants.INDEXER_FIELD_MODIFIED, DateTools.timeToString(statementWrapper.getModified().longValue(), Resolution.HOUR), Field.Store.YES, Field.Index.NOT_ANALYZED)); } } } return doc; } private class IdQuad { long g; long s; long o; long p; IdQuad(long g, long s, long p, long o) { this.g = g; this.s = s; this.o = o; this.p = p; } } public int rebuild(NodeCentricDatasource datasource) throws IndexerException { clear(); int stmtCount = 0; NodeCentricOperationContext context = null; try { context = datasource.getQueryContext(null); preIndex(); long start = 0; long size = 20000; datasource.begin(context.getConnection(), false, false); try { FindMinMaxIdResult minMaxLiteral = StatementRdbWrapper.findMinMaxId(context.getStatementProvider(), context.getConnection(), "ANZO_L"); long min = minMaxLiteral.getMin(); long max = minMaxLiteral.getMax(); start = min; long end = Math.min(max, start + size); while (start < end) { if (end < max) { log.error(LogUtils.RDB_MARKER, "Reindexing statements for literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } else { log.info(LogUtils.RDB_MARKER, "Reindexing statements for literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } stmtCount += findStatements(context, start, end); start = start + size; end = Math.min(max, start + size); } FindMinMaxIdResult minMaxLongLiteral = StatementRdbWrapper.findMinMaxId(context.getStatementProvider(), context.getConnection(), "ANZO_LL"); min = minMaxLongLiteral.getMin(); max = minMaxLongLiteral.getMax(); start = min; end = Math.min(max, start + size); while (start < end) { if (end < max) { log.error(LogUtils.RDB_MARKER, "Reindexing statements for long literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } else { log.info(LogUtils.RDB_MARKER, "Reindexing statements for long literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } stmtCount += findStatements(context, start, end); start = start + size; end = Math.min(max, start + size); } FindMinMaxIdResult minMaxTypedLiteral = StatementRdbWrapper.findMinMaxId(context.getStatementProvider(), context.getConnection(), "ANZO_TL"); min = minMaxTypedLiteral.getMin(); max = minMaxTypedLiteral.getMax(); start = min; end = Math.min(max, start + size); while (start < end) { if (end < max) { log.error(LogUtils.RDB_MARKER, "Reindexing statements for typed literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } else { log.info(LogUtils.RDB_MARKER, "Reindexing statements for typed literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } stmtCount += findStatements(context, start, end); start = start + size; end = Math.min(max, start + size); } FindMinMaxIdResult minMaxLongTypedLiteral = StatementRdbWrapper.findMinMaxId(context.getStatementProvider(), context.getConnection(), "ANZO_LTL"); min = minMaxLongTypedLiteral.getMin(); max = minMaxLongTypedLiteral.getMax(); start = min; end = Math.min(max, start + size); while (start < end) { if (end < max) { log.error(LogUtils.RDB_MARKER, "Reindexing statements for long typed literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } else { log.info(LogUtils.RDB_MARKER, "Reindexing statements for long typed literals:{}-{} of {}", new Object[] { Long.toString(start - min), Long.toString(end - min), Long.toString(max - min) }); } stmtCount += findStatements(context, start, end); start = start + size; end = Math.min(max, start + size); } } finally { datasource.commit(context.getConnection(), false, false); } log.info(LogUtils.RDB_MARKER, "Index {} total statements", Integer.toString(stmtCount)); } catch (AnzoException e) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_REBUILD, e); } finally { try { datasource.returnQueryContext(context); } catch (Exception e) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_REBUILD, e); } } needsIndexRebuild = false; return stmtCount; } private long findStatements(NodeCentricOperationContext context, long start, long end) throws AnzoException { HashSet<Long> ids = new HashSet<Long>(); ArrayList<IdQuad> list = new ArrayList<IdQuad>(); long stmtCount = 0; for (FindLiteralStatementsRangeResult result : StatementRdbWrapper.findLiteralStatementsRange(context.getStatementProvider(), context.getConnection(), start, end)) { long g = result.getNamedGraphId(); long s = result.getSubj(); long p = result.getProp(); long o = result.getObj(); IdQuad quint = new IdQuad(g, s, p, o); list.add(quint); ids.add(g); ids.add(s); ids.add(p); ids.add(o); } for (FindLiteralStatementsNRRangeResult result : StatementRdbWrapper.findLiteralStatementsNRRange(context.getStatementProvider(), context.getConnection(), start, end)) { long g = result.getNamedGraphId(); long s = result.getSubj(); long p = result.getProp(); long o = result.getObj(); IdQuad quint = new IdQuad(g, s, p, o); list.add(quint); ids.add(g); ids.add(s); ids.add(p); ids.add(o); } if (list.size() > 0) { Map<Long, Value> nodes = context.getNodeLayout().resolveStoredIds(ids, context.getConnection()); for (IdQuad quad : list) { StatementWrapper sw = new StatementWrapper((URI) nodes.get(quad.g), quad.g, (Resource) nodes.get(quad.s), quad.s, (URI) nodes.get(quad.p), quad.p, nodes.get(quad.o), quad.o, Long.valueOf(0)); index(sw); stmtCount++; } ids.clear(); list.clear(); } postIndex(); return stmtCount; } }