package com.yahoo.glimmer.indexing;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import it.unimi.dsi.io.WordReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;
import org.semanticweb.yars.nx.parser.ParseException;

import com.yahoo.glimmer.indexing.RDFDocumentFactory.IndexType;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.RdfCounters;
import com.yahoo.glimmer.util.BySubjectRecord;
import com.yahoo.glimmer.util.BySubjectRecord.BySubjectRecordException;

// TODO The RDFDocument/RDFDocumentFactory classes could be simpler. They are as they are because they were derived from MG4J's Document/DocumentFactory interfaces.
/**
 * Base class for documents backed by the raw bytes of a single by-subject record.
 *
 * <p>The bytes handed to {@link #setContent(byte[], int)} are parsed lazily, on the first call to
 * {@link #ensureParsed()}, into an id, a subject and a list of {@link Relation}s. The relations are
 * then passed to the subclass through {@link #ensureParsed_(Iterator)}.
 */
public abstract class RDFDocument {
    /**
     * The empty string. Not referenced inside this class; presumably used by subclasses to denote
     * a missing context (TODO confirm against callers).
     */
    public static final String NO_CONTEXT = "";

    /** The factory that created this document; also the sink for the {@link RdfCounters}. */
    protected final RDFDocumentFactory factory;

    /** Record instance reused for every document parsed by this object. */
    private final BySubjectRecord record = new BySubjectRecord();
    /** Whether we already parsed the document. */
    private boolean parsed;
    /** The cached raw content. */
    private byte[] contentBytes;
    /** Number of valid bytes in {@link #contentBytes}. */
    private int contentLength;
    /** Id of the current document, or {@code null} if none has been parsed. */
    private Long id;
    /** Subject of the current document, or {@code null} if none has been parsed. */
    private String subject;

    /**
     * Returns a reader over the content of the given field.
     *
     * @param field index of the field to read.
     * @return a reader for that field.
     * @throws IOException declared for implementations that need it.
     */
    public abstract WordReader content(final int field) throws IOException;

    /**
     * @return the type of index this document is built for.
     */
    public abstract IndexType getIndexType();

    /**
     * Subclass hook called by {@link #ensureParsed()} once the record has been read successfully
     * and contains at least one relation.
     *
     * @param relations the relations of the current record, in record order.
     * @throws IOException declared for implementations that need it.
     */
    protected abstract void ensureParsed_(Iterator<Relation> relations) throws IOException;

    /**
     * @param factory the factory this document belongs to; used to increment counters.
     */
    public RDFDocument(RDFDocumentFactory factory) {
        this.factory = factory;
    }

    /**
     * Replaces the raw content of this document and discards any previously parsed state.
     * The array is stored by reference, not copied.
     *
     * @param bytes  buffer holding the serialized record.
     * @param length number of bytes of {@code bytes} that belong to the record, starting at 0.
     */
    public void setContent(byte[] bytes, int length) {
        contentBytes = bytes;
        contentLength = length;
        parsed = false;
        id = null;
        subject = null;
    }

    /**
     * Parses the current content if that has not been done since the last
     * {@link #setContent(byte[], int)}.
     *
     * <p>Outcomes:
     * <ul>
     * <li>zero-length content: {@code EMPTY_LINES} is incremented; id and subject stay {@code null};</li>
     * <li>the record cannot be read: {@code PARSE_ERROR} is incremented; id and subject stay
     * {@code null};</li>
     * <li>a relation cannot be parsed: a message is written to {@code System.err}; id and subject
     * are set but {@link #ensureParsed_(Iterator)} is not called;</li>
     * <li>the record has no relations: {@code EMPTY_DOCUMENTS} is incremented;</li>
     * <li>otherwise the relations are handed to {@link #ensureParsed_(Iterator)}.</li>
     * </ul>
     *
     * @throws IOException if thrown by {@link #ensureParsed_(Iterator)}.
     */
    protected void ensureParsed() throws IOException {
        if (parsed) {
            return;
        }
        // Set up front so that a failed parse is not retried on every accessor call.
        parsed = true;

        if (contentLength == 0) {
            factory.incrementCounter(RdfCounters.EMPTY_LINES, 1);
            return;
        }

        try {
            record.readFrom(contentBytes, 0, contentLength);
        } catch (BySubjectRecordException e) {
            factory.incrementCounter(RdfCounters.PARSE_ERROR, 1);
            // The record instance is reused between documents, so after a failed read its
            // contents cannot be trusted. Stop here instead of exposing them as this
            // document's id/subject/relations.
            return;
        }

        id = record.getId();
        subject = record.getSubject();

        List<Relation> relations = new ArrayList<Relation>();
        for (String relationString : record.getRelations()) {
            try {
                Node[] relationNodes = NxParser.parseNodes(relationString);
                relations.add(new Relation(relationNodes));
            } catch (ParseException e) {
                System.err.println("Parsing failed for " + subject + ": " + e.getMessage() + " Content was: \n" + relationString);
                // One bad relation discards the whole document.
                return;
            }
        }

        if (relations.isEmpty()) {
            factory.incrementCounter(RdfCounters.EMPTY_DOCUMENTS, 1);
            return;
        }

        ensureParsed_(relations.iterator());
    }

    /**
     * Returns the id of the current document, parsing the content first if necessary.
     *
     * @return the record id.
     * @throws NullPointerException if no id is available (zero-length content or unreadable
     *                              record), because the {@code null} id is unboxed.
     * @throws RuntimeException     wrapping any {@link IOException} raised while parsing.
     */
    public long getId() {
        try {
            ensureParsed();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return id;
    }

    /**
     * Returns the subject of the current document, parsing the content first if necessary.
     *
     * @return the record subject, or {@code null} if the content was empty or the record could not
     *         be read.
     * @throws RuntimeException wrapping any {@link IOException} raised while parsing.
     */
    public String getSubject() {
        try {
            ensureParsed();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return subject;
    }

    /**
     * @return the subject of this document, or the string {@code "null"} when there is none.
     */
    @Override
    public String toString() {
        // getSubject() is null for empty or unparsable content; toString() must not throw.
        return String.valueOf(getSubject());
    }

    /**
     * A single statement about the document's subject: predicate, object and optional context.
     */
    protected static class Relation {
        private final Node[] nodes;

        /**
         * @param nodes The array of Nodes excluding the subject. Stored by reference.
         */
        public Relation(Node[] nodes) {
            this.nodes = nodes;
        }

        /**
         * @return the first node of the relation.
         */
        public Node getPredicate() {
            return nodes[0];
        }

        /**
         * @return the second node of the relation.
         */
        public Node getObject() {
            return nodes[1];
        }

        /**
         * @return the third node of the relation, or {@code null} if the relation has fewer than
         *         three nodes.
         */
        public Node getContext() {
            if (nodes.length > 2) {
                return nodes[2];
            }
            return null;
        }
    }
}