package com.yahoo.glimmer.indexing;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.semanticweb.yars.nx.BNode;
import org.semanticweb.yars.nx.Resource;
import org.semanticweb.yars.nx.namespace.RDF;

import com.yahoo.glimmer.indexing.RDFDocumentFactory.IndexType;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.RdfCounters;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.ResourceHashLookupException;

/**
 * An RDF document exposed as five parallel fields: subject, subject text,
 * objects, predicates and contexts (see {@link #content(int)} for the field
 * numbering).
 *
 * <p>
 * We delay the actual parsing until it is actually necessary, so operations
 * like getting the document URI will not require parsing.
 */
class HorizontalDocument extends RDFDocument {
    private static final Log LOG = LogFactory.getLog(HorizontalDocument.class);

    /*
     * The fields objects, predicates & contexts are used in 'parallel', so
     * the values at index I of the three lists refer to the same relation.
     * If the object is a Resource or BNode, its single hash value is put in
     * the objects list at index I. If the object is a Literal with n terms,
     * the terms are put in the objects list at indexes I to I+n-1 (and the
     * predicate and context ids are repeated for each of those indexes).
     */

    // Literal objects are the terms.
    // Resource/BNode objects are the hash values.
    private final List<String> objects = new ArrayList<String>();
    // Predicates holds the ids returned by the factory's resource lookup.
    private final List<String> predicates = new ArrayList<String>();
    // Contexts are the hash values (or NO_CONTEXT).
    private final List<String> contexts = new ArrayList<String>();
    // Id of the subject (resource id prefix + document id).
    private final List<String> subject = new ArrayList<String>();
    // Subject id followed by the tokens extracted from the subject
    // Resource/BNode.
    private final List<String> subjectText = new ArrayList<String>();

    protected HorizontalDocument(HorizontalDocumentFactory factory) {
        super(factory);
    }

    @Override
    public IndexType getIndexType() {
        return IndexType.HORIZONTAL;
    }

    /**
     * Rebuilds all five field lists from the given relations.
     *
     * <p>
     * Any previously parsed content is discarded first. Relations whose
     * predicate is blacklisted, or whose predicate, context or object
     * resource cannot be looked up, are skipped and counted on the factory's
     * counters.
     *
     * @param relations
     *            the relations (triples/quads) of this document.
     * @throws IOException
     *             if tokenizing the subject or a literal object fails.
     * @throws IllegalStateException
     *             if contexts are enabled and a relation has a context that
     *             is neither a Resource nor a BNode.
     */
    protected void ensureParsed_(Iterator<Relation> relations) throws IOException {
        subject.clear();
        subjectText.clear();
        objects.clear();
        predicates.clear();
        contexts.clear();

        // Scratch buffers shared by every tokenizing pass of this parse.
        MutableString word = new MutableString();
        MutableString nonWord = new MutableString();

        String subjectId = factory.getResourceIdPrefix() + Long.toString(getId());
        subject.add(subjectId);
        // Add the subjectId also as text.
        subjectText.add(subjectId);

        // Index subject tokens.
        // We index the BNode id. Do we need it?
        addSubjectTokens(getSubject(), word, nonWord);

        // Predicate/object/context are parallel.
        while (relations.hasNext()) {
            addRelation(relations.next(), word, nonWord);
        }
    }

    /**
     * Tokenizes the subject and appends each accepted token, lowercased, to
     * {@link #subjectText}.
     */
    private void addSubjectTokens(String subjectUri, MutableString word, MutableString nonWord) throws IOException {
        char[] chars = subjectUri.toCharArray();

        // Skip everything up to and including the first ':' (removes
        // http/https or _:).
        int colonAt = subjectUri.indexOf(':');
        FastBufferedReader reader;
        if (colonAt < 0) {
            reader = new FastBufferedReader(chars);
        } else {
            int startAt = colonAt + 1;
            reader = new FastBufferedReader(chars, startAt, chars.length - startAt);
        }

        try {
            while (reader.next(word, nonWord)) {
                if (!word.equals("") && CombinedTermProcessor.getInstance().processTerm(word)) {
                    subjectText.add(word.toString().toLowerCase());
                }
            }
        } finally {
            // Close even when tokenizing throws.
            reader.close();
        }
    }

    /**
     * Adds one relation to the parallel objects/predicates/contexts lists, or
     * skips it (incrementing the matching counter) when it can't be indexed.
     */
    private void addRelation(Relation relation, MutableString word, MutableString nonWord) throws IOException {
        String predicate = relation.getPredicate().toString();

        // Check if prefix is on blacklist
        if (RDFDocumentFactory.isOnPredicateBlacklist(predicate.toLowerCase())) {
            factory.incrementCounter(RdfCounters.BLACKLISTED_TRIPLES, 1);
            return;
        }

        // rdf:type triples are counted but otherwise indexed like any other.
        if (predicate.equals(RDF.TYPE.toString())) {
            factory.incrementCounter(RdfCounters.RDF_TYPE_TRIPLES, 1);
        }

        String predicateId;
        try {
            predicateId = factory.lookupResource(predicate, true);
        } catch (ResourceHashLookupException rhle) {
            factory.incrementCounter(RdfCounters.PREDICATES_NOT_IN_HASH, 1);
            LOG.info("Predicate not in hash:" + predicate);
            return;
        }

        String contextId = NO_CONTEXT;
        if (factory.isWithContexts()) {
            Object context = relation.getContext();
            if (context != null) {
                if (context instanceof Resource || context instanceof BNode) {
                    try {
                        contextId = factory.lookupResource(context.toString(), true);
                    } catch (ResourceHashLookupException rhle) {
                        factory.incrementCounter(RdfCounters.CONTEXT_NOT_IN_HASH, 1);
                        LOG.info("Context not in hash:" + context.toString());
                        return;
                    }
                } else {
                    throw new IllegalStateException("Context " + context + " is not a Resource.");
                }
            }
        }

        Object object = relation.getObject();
        if (object instanceof Resource || object instanceof BNode) {
            String objectId;
            try {
                objectId = factory.lookupResource(object.toString(), true);
            } catch (ResourceHashLookupException rhle) {
                factory.incrementCounter(RdfCounters.OBJECT_NOT_IN_HASH, 1);
                LOG.info("Object not in hash:" + object.toString());
                return;
            }
            objects.add(objectId);
            predicates.add(predicateId);
            contexts.add(contextId);
        } else {
            addLiteralTerms(object.toString(), predicateId, contextId, word, nonWord);
        }

        factory.incrementCounter(RdfCounters.INDEXED_TRIPLES, 1);
    }

    /**
     * Tokenizes a literal object and appends one objects/predicates/contexts
     * entry per accepted term.
     */
    private void addLiteralTerms(String literal, String predicateId, String contextId, MutableString word, MutableString nonWord)
            throws IOException {
        // Iterate over the words of the value
        FastBufferedReader reader = new FastBufferedReader(literal.toCharArray());
        try {
            while (reader.next(word, nonWord)) {
                if (!word.equals("") && CombinedTermProcessor.getInstance().processTerm(word)) {
                    // The term is stored exactly as the term processor left
                    // it; no extra lowercasing is applied here (unlike the
                    // subject tokens).
                    objects.add(word.toString());
                    // Predicate and context ids are repeated for every term
                    // to keep the three lists parallel.
                    predicates.add(predicateId);
                    contexts.add(contextId);
                }
            }
        } finally {
            // Close even when tokenizing throws.
            reader.close();
        }
    }

    /**
     * Returns a reader over the content of the given field, parsing the
     * document first if needed.
     *
     * @param field
     *            0 = subject, 1 = subject text, 2 = objects, 3 = predicates,
     *            4 = contexts.
     * @return a word reader over the selected field's values.
     * @throws IOException
     *             if parsing the document fails.
     * @throws IllegalArgumentException
     *             if {@code field} is not one of the indexes listed above.
     */
    @Override
    public WordReader content(final int field) throws IOException {
        factory.ensureFieldIndex(field);
        ensureParsed();
        switch (field) {
        case 0:
            return new WordArrayReader(subject);
        case 1:
            return new WordArrayReader(subjectText);
        case 2:
            return new WordArrayReader(objects);
        case 3:
            return new WordArrayReader(predicates);
        case 4:
            return new WordArrayReader(contexts);
        default:
            throw new IllegalArgumentException("Unknown field index: " + field);
        }
    }
}