/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.utils; import com.google.common.base.Charsets; import com.google.common.primitives.Ints; import com.indeed.util.core.shell.PosixFileOperations; import com.indeed.flamdex.simple.SimpleFlamdexWriter; import com.indeed.flamdex.writer.IntFieldWriter; import com.indeed.flamdex.writer.StringFieldWriter; import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.util.Arrays; import java.util.Collections; import java.util.Random; /** * @author dwahler */ public class RestoreFlamdex { public static void main(String[] args) throws Exception { long elapsed = -System.currentTimeMillis(); final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream("/tmp/shard.dat"))); final int numDocs = dis.readInt(); final boolean permuteDocs = true; final int[] permutation = new int[numDocs]; for (int i = 0; i < numDocs; i++) { permutation[i] = i; } if (permuteDocs) { System.out.print("generating permutation..."); final Random r = new Random(0); Collections.shuffle(Ints.asList(permutation), r); System.out.println("done."); } final int[] docIds = new int[numDocs]; final String path = "/tmp/reconstructed"; PosixFileOperations.rmrf(new File(path)); final SimpleFlamdexWriter writer = new SimpleFlamdexWriter(path, numDocs, true, true); long termCount = 0; long uninvertedSize = 0; while (dis.readBoolean()) { final String strField = dis.readUTF(); System.out.println("strField: " + strField); final StringFieldWriter strWriter = writer.getStringFieldWriter(strField, true); while (dis.readBoolean()) { final String strTerm = dis.readUTF(); strWriter.nextTerm(strTerm); termCount++; final int utf8Length = strTerm.getBytes(Charsets.UTF_8).length; final int valueLength = 1 + varIntLength(utf8Length) + utf8Length; //field code + length + value int docIdCount = 0; while (dis.readBoolean()) { final int doc = dis.readInt(); docIds[docIdCount++] = permutation[doc]; } Arrays.sort(docIds, 0, docIdCount); for (int i = 0; i < docIdCount; i++) { final int doc = docIds[i]; strWriter.nextDoc(doc); uninvertedSize += valueLength; } } strWriter.close(); } while (dis.readBoolean()) { final String intField = dis.readUTF(); System.out.println("intField: " + intField); final IntFieldWriter intWriter = writer.getIntFieldWriter(intField, true); while (dis.readBoolean()) { final long intTerm = dis.readLong(); intWriter.nextTerm(intTerm); termCount++; final int valueLength = 1 + varIntLength(intTerm); //field code + value int docIdCount = 0; while (dis.readBoolean()) { final int doc = dis.readInt(); docIds[docIdCount++] = permutation[doc]; } Arrays.sort(docIds, 0, docIdCount); for (int i = 0; i < docIdCount; i++) { final int doc = docIds[i]; intWriter.nextDoc(doc); uninvertedSize += valueLength; } } intWriter.close(); } dis.close(); writer.close(); elapsed += System.currentTimeMillis(); System.out.println("finished in " + elapsed + " ms"); System.out.println("total terms: " + termCount); System.out.println("approx uninvertedSize: " + uninvertedSize); } private static int varIntLength(long l) { int length = 0; while (l >= (1<<7)) { length++; l >>= 7; } return length; } }