/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.utils; import com.indeed.util.core.sort.Quicksortables; import com.indeed.flamdex.api.DocIdStream; import com.indeed.flamdex.api.FlamdexReader; import com.indeed.flamdex.api.IntTermIterator; import com.indeed.flamdex.api.StringTermIterator; import com.indeed.flamdex.writer.FlamdexWriter; import com.indeed.flamdex.writer.IntFieldWriter; import com.indeed.flamdex.writer.StringFieldWriter; import com.indeed.util.core.sort.RadixSort; import java.io.IOException; /** * @author jsgroth */ public class FlamdexSort { private static final int MAGIC_SORTING_NUMBER = 50000; // this method DOES close the FlamdexWriter upon completion public static void sort(FlamdexReader r, FlamdexWriter w, int[] oldDocIdToNewDocId) throws IOException { sort(r, w, oldDocIdToNewDocId, r.getIntFields(), r.getStringFields()); } public static void sort(FlamdexReader r, FlamdexWriter w, int[] oldDocIdToNewDocId, Iterable<String> intFields, Iterable<String> stringFields) throws IOException { final int[] docIdBuffer = new int[r.getNumDocs()]; final int[] scratch = new int[r.getNumDocs()]; final int[] countScratch = new int[65536]; // magic number final DocIdStream dis = r.getDocIdStream(); for (final String intField : intFields) { System.out.println("intField:"+intField); final IntTermIterator iter = r.getIntTermIterator(intField); final IntFieldWriter ifw = w.getIntFieldWriter(intField); while (iter.next()) { ifw.nextTerm(iter.term()); dis.reset(iter); final int n = dis.fillDocIdBuffer(docIdBuffer); for (int i = 0; i < n; ++i) { docIdBuffer[i] = oldDocIdToNewDocId[docIdBuffer[i]]; } if (n >= MAGIC_SORTING_NUMBER) { RadixSort.radixSort(docIdBuffer, n, scratch, countScratch); } else { Quicksortables.sort(Quicksortables.getQuicksortableIntArray(docIdBuffer), n); } for (int i = 0; i < n; ++i) { ifw.nextDoc(docIdBuffer[i]); } } iter.close(); ifw.close(); } for (final String stringField : stringFields) { System.out.println("stringField:"+stringField); final StringTermIterator iter = r.getStringTermIterator(stringField); final StringFieldWriter sfw = w.getStringFieldWriter(stringField); while (iter.next()) { sfw.nextTerm(iter.term()); dis.reset(iter); final int n = dis.fillDocIdBuffer(docIdBuffer); for (int i = 0; i < n; ++i) { docIdBuffer[i] = oldDocIdToNewDocId[docIdBuffer[i]]; } if (n >= MAGIC_SORTING_NUMBER) { RadixSort.radixSort(docIdBuffer, n, scratch, countScratch); } else { Quicksortables.sort(Quicksortables.getQuicksortableIntArray(docIdBuffer), n); } for (int i = 0; i < n; ++i) { sfw.nextDoc(docIdBuffer[i]); } } iter.close(); sfw.close(); } dis.close(); w.close(); } }