package com.yahoo.glimmer.indexing.generator;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import it.unimi.di.big.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.io.HadoopFileSystemIOFactory;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.util.Properties;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.yahoo.glimmer.indexing.CombinedTermProcessor;
import com.yahoo.glimmer.indexing.ResourceRefTermProcessor;
/**
 * Holds the output files that make up a single MG4J index written to a Hadoop {@link FileSystem}:
 * the terms file, the properties file, the (lazily created) document sizes file and the
 * {@link IndexWriter} for the postings.
 *
 * <p>Typical life cycle: construct, {@link #open()}, write through the accessors and
 * {@link #writeDocSize(long, int)}, then {@link #close(long)}.
 */
public class Index {
    private final FileSystem fs;
    private final Path outputDir;
    private final String name;
    private final long numDocs;
    private final boolean positions;
    private final String hashValuePrefix;

    private PrintWriter terms;
    private OutputStream properties;
    private OutputBitStream docSizes;
    private IndexWriter indexWriter;

    /** ID of the last document whose size has been written to the sizes file, or -1 if none yet. */
    private long docSizesLastDocument = -1;

    /**
     * @param fs file system the index files are created on.
     * @param outputDir directory that receives the index files.
     * @param indexName name of the index; every '-' is replaced by '_' to form the file basename.
     * @param numDocs total number of documents in the collection.
     * @param positions whether the index stores counts and positions.
     * @param hashValuePrefix value stored under {@code ResourceRefTermProcessor.PropertyKeys.REF_PREFIX}
     *        in the index properties.
     * @param indexWriterCacheSize currently unused. It was the cache size of a
     *        QuasiSuccinctIndexWriter that has since been replaced by {@link BitStreamIndexWriter};
     *        the parameter is kept so existing callers continue to compile.
     */
    public Index(FileSystem fs, Path outputDir, String indexName, long numDocs, boolean positions, String hashValuePrefix, int indexWriterCacheSize) {
        this.fs = fs;
        this.outputDir = outputDir;
        // It seems like MG4J doesn't like index names with the '-' char.
        this.name = indexName.replace('-', '_');
        this.numDocs = numDocs;
        this.positions = positions;
        this.hashValuePrefix = hashValuePrefix;
    }

    /**
     * Creates the terms file, the properties file and the index writer. Files are created with
     * {@code overwrite == false}.
     *
     * @throws IOException if one of the files can't be created.
     */
    public void open() throws IOException {
        String basename = new Path(outputDir, name).toString();

        Path termsPath = new Path(outputDir, name + DiskBasedIndex.TERMS_EXTENSION);
        terms = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(termsPath, false), "UTF-8")));

        Path propertiesPath = new Path(outputDir, name + DiskBasedIndex.PROPERTIES_EXTENSION);
        properties = fs.create(propertiesPath, false);

        // Work on a copy so the shared default flag map is never modified.
        Map<Component, Coding> defaultStandardIndexFlags = new Object2ObjectOpenHashMap<Component, Coding>(CompressionFlags.DEFAULT_STANDARD_INDEX);
        if (!positions) {
            defaultStandardIndexFlags.remove(CompressionFlags.Component.POSITIONS);
            // Counts are dropped together with positions. Note that quasi-succinct indexes
            // always need counts, so this has to be revisited if the writer below is ever
            // switched back to a QuasiSuccinctIndexWriter.
            defaultStandardIndexFlags.remove(CompressionFlags.Component.COUNTS);
        }

        IOFactory ioFactory = new HadoopFileSystemIOFactory(fs);
        indexWriter = new BitStreamIndexWriter(ioFactory, basename, numDocs, true, defaultStandardIndexFlags);
    }

    /** @return the writer for the terms file, or {@code null} before {@link #open()}. */
    public PrintWriter getTermsWriter() {
        return terms;
    }

    /** @return whether this index was configured to store positions. */
    public boolean hasPositions() {
        return positions;
    }

    /** @return the postings writer, or {@code null} before {@link #open()}. */
    public IndexWriter getIndexWriter() {
        return indexWriter;
    }

    /** @return the stream of the properties file, or {@code null} before {@link #open()}. */
    public OutputStream getPropertiesStream() {
        return properties;
    }

    /**
     * Appends the size of the given document to the sizes file, creating the file on first use.
     * Documents skipped since the previous call get a size of 0.
     *
     * @param document document ID; must be strictly greater than the ID given in the previous call.
     * @param size size to record for the document.
     * @throws IllegalArgumentException if {@code document} isn't greater than the previous ID.
     * @throws IOException if the sizes file can't be created or written.
     */
    public void writeDocSize(long document, int size) throws IOException {
        if (document <= docSizesLastDocument) {
            throw new IllegalArgumentException("Given document ID " + document + " isn't bigger than the document ID given in the previous call " + docSizesLastDocument);
        }

        if (docSizes == null) {
            // Only create the file when needed. writeDocSize() shouldn't be called for vertical indexes.
            Path docSizesPath = new Path(outputDir, name + DiskBasedIndex.SIZES_EXTENSION);
            docSizes = new OutputBitStream(fs.create(docSizesPath, false));
        }

        // Zero-fill the gap; the loop leaves docSizesLastDocument == document.
        for (docSizesLastDocument++; docSizesLastDocument < document; docSizesLastDocument++) {
            docSizes.writeGamma(0);
        }
        docSizes.writeGamma(size);
    }

    /**
     * Writes the index properties, pads the sizes file up to {@code numDocs} entries and closes
     * everything that {@link #open()} / {@link #writeDocSize(long, int)} created. Every resource
     * is closed even if an earlier step fails; when several steps fail, the exception of the
     * last failing step is the one that propagates.
     *
     * @param writtenOccurrences number of occurrences written; only stored when the index has positions.
     * @throws IOException if writing or closing fails. A {@link ConfigurationException} raised
     *         while saving the properties is wrapped and kept as the cause.
     */
    public void close(long writtenOccurrences) throws IOException {
        try {
            saveProperties(writtenOccurrences);
        } finally {
            try {
                finishDocSizes();
            } finally {
                try {
                    if (terms != null) {
                        terms.close();
                    }
                } finally {
                    if (indexWriter != null) {
                        indexWriter.close();
                    }
                }
            }
        }
    }

    /** Fills in the index properties, saves them to the properties stream and closes that stream. */
    private void saveProperties(long writtenOccurrences) throws IOException {
        try {
            try {
                // NOTE(review): MG4J's IndexWriter documentation appears to expect properties()
                // to be called after close(); here it is read before the writer is closed.
                // Order kept as in the original - confirm the reported values are final here.
                Properties props = indexWriter.properties();
                System.out.println("Closing index " + name + " which has " + props.getProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.TERMS) + " terms ");
                if (positions) {
                    props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.OCCURRENCES, writtenOccurrences);
                }
                props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.MAXCOUNT, -1);
                props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.FIELD, name);
                props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.TERMPROCESSOR, CombinedTermProcessor.getInstance());
                props.addProperty(ResourceRefTermProcessor.PropertyKeys.REF_PREFIX, hashValuePrefix);
                props.save(properties);
            } catch (ConfigurationException e) {
                // Keep the original exception as the cause so its stack trace isn't lost.
                throw new IOException(e.getMessage(), e);
            }
        } finally {
            if (properties != null) {
                properties.close();
            }
        }
    }

    /** Zero-fills the sizes of the documents after the last written one and closes the sizes file, if any. */
    private void finishDocSizes() throws IOException {
        if (docSizes == null) {
            return;
        }
        try {
            // docSizesLastDocument is the ID of the last document already written, so padding
            // starts at the following ID. Starting at docSizesLastDocument itself would emit
            // numDocs + 1 entries.
            for (docSizesLastDocument++; docSizesLastDocument < numDocs; docSizesLastDocument++) {
                docSizes.writeGamma(0);
            }
        } finally {
            docSizes.close();
        }
    }

    /** @return the index name with every '-' replaced by '_'. */
    public String getName() {
        return name;
    }
}