/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.imhotep.local; import javax.annotation.Nonnull; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import com.google.common.collect.Maps; import com.indeed.flamdex.api.IntTermDocIterator; import com.indeed.flamdex.api.StringTermDocIterator; import com.indeed.flamdex.writer.FlamdexWriter; import com.indeed.flamdex.writer.IntFieldWriter; import com.indeed.flamdex.writer.StringFieldWriter; import com.indeed.imhotep.MemoryReservationContext; import com.indeed.imhotep.api.ImhotepOutOfMemoryException; public class IndexReWriter { private final List<ImhotepLocalSession> sessions; private final ImhotepLocalSession newSession; private int[] sessionDocIdOffsets; private final MemoryReservationContext memory; private GroupLookup newGroupLookup; private List<int[]> perSessionMappings; private Map<String, DynamicMetric> dynamicMetrics; private long newMaxDocs; public IndexReWriter(List<ImhotepLocalSession> localSessions, ImhotepLocalSession newSession, MemoryReservationContext memory) throws ImhotepOutOfMemoryException { this.sessions = localSessions; this.newSession = newSession; this.memory = memory; this.sessionDocIdOffsets = new int[localSessions.size()]; } public GroupLookup getNewGroupLookup() { return this.newGroupLookup; } public Map<String, DynamicMetric> getDynamicMetrics() { return this.dynamicMetrics; } public List<int[]> getPerSessionMappings() { return perSessionMappings; } public int getNumSessionsMerged() { return sessions.size(); } public void optimizeIndecies(@Nonnull final List<String> intFields, @Nonnull final List<String> stringFields, @Nonnull final FlamdexWriter w) throws IOException, ImhotepOutOfMemoryException { final int[] docIdBuffer = new int[128]; List<IntTermDocIterator> intIters = new ArrayList<IntTermDocIterator>(sessions.size()); List<StringTermDocIterator> stringIters = new ArrayList<StringTermDocIterator>(sessions.size()); List<Integer> sessionOffsets = new ArrayList<Integer>(this.sessionDocIdOffsets.length); int[] oldToNewDocIdMapping; oldToNewDocIdMapping = remapDocIds(this.sessions); w.resetMaxDocs(this.newMaxDocs); for (final String intField : intFields) { intIters.clear(); sessionOffsets.clear(); for (int i = 0; i < sessions.size(); i++) { ImhotepLocalSession session = sessions.get(i); IntTermDocIterator iter = session.getReader().getIntTermDocIterator(intField); if (iter == null) { continue; } intIters.add(iter); sessionOffsets.add(this.sessionDocIdOffsets[i]); } final MergingIntTermDocIterator iter = new MergingIntTermDocIterator(intIters, oldToNewDocIdMapping, sessionOffsets); final IntFieldWriter ifw = w.getIntFieldWriter(intField); while (iter.nextTerm()) { ifw.nextTerm(iter.term()); /* * Write all the terms and groups to the new index, skipping * those in group 0 */ int n; do { n = iter.fillDocIdBuffer(docIdBuffer); for (int i = 0; i < n; ++i) { final int docId = docIdBuffer[i]; if (docId == -1) { /* doc was in group 0 */ continue; } ifw.nextDoc(docId); } } while (n == docIdBuffer.length); } iter.close(); ifw.close(); } for (final String stringField : stringFields) { stringIters.clear(); sessionOffsets.clear(); for (int i = 0; i < sessions.size(); i++) { ImhotepLocalSession session = sessions.get(i); StringTermDocIterator iter = session.getReader().getStringTermDocIterator(stringField); if (iter == null) { continue; } stringIters.add(iter); sessionOffsets.add(this.sessionDocIdOffsets[i]); } final MergingStringTermDocIterator iter = new MergingStringTermDocIterator(stringIters, oldToNewDocIdMapping, sessionOffsets); final StringFieldWriter sfw = w.getStringFieldWriter(stringField); while (iter.nextTerm()) { sfw.nextTerm(iter.term()); /* * Write all the terms and groups to the new index, skipping * those in group 0 */ int n; do { n = iter.fillDocIdBuffer(docIdBuffer); for (int i = 0; i < n; ++i) { final int docId = docIdBuffer[i]; if (docId == -1) { /* doc was in group 0 */ continue; } sfw.nextDoc(docId); } } while (n == docIdBuffer.length); } iter.close(); sfw.close(); } this.perSessionMappings = constructPerSessionNewToOldIdMappings(oldToNewDocIdMapping); memory.releaseMemory(oldToNewDocIdMapping.length * 4L); } /* * Converts the oldToNewDocIdMapping mapping into a set of * per session newToOldId mappings. Needed for reconstructing * the DynamicMetrics after one or more optimize calls followed * by a reset. * * Kinda overkill now that multiple shards are not being merged * anymore */ private List<int[]> constructPerSessionNewToOldIdMappings(int[] oldToNewDocIdMapping) throws ImhotepOutOfMemoryException { List<int[]> results = new ArrayList<int[]>(this.sessions.size()); for (int i = 0; i < this.sessions.size(); i++) { int offset = this.sessionDocIdOffsets[i]; int nDocs = this.sessions.get(i).getNumDocs(); if (!memory.claimMemory(nDocs * 4L)) throw new ImhotepOutOfMemoryException(); int[] mapping = new int[nDocs]; int last = -1; for (int oldDocId = offset; oldDocId < nDocs; oldDocId++) { int newDocId = oldToNewDocIdMapping[oldDocId]; if (newDocId != -1) { mapping[newDocId] = oldDocId; last = newDocId; } } /* claim memory for new array that is inserted into results */ if (!memory.claimMemory((last + 1) * 4L)) throw new ImhotepOutOfMemoryException(); results.add(Arrays.copyOf(mapping, last + 1)); /* release memory for mapping[] */ memory.releaseMemory(nDocs * 4L); } return results; } /* * Maps the existing doc ids in the sessions to a new * non-overlapping set, skipping the docs in group 0. * * Also constructs a new GroupLookup with these new doc * ids, and a new DynamicMetric * * @returns A mapping from old doc id to new doc id - with * -1 as the doc id for docs to be removed (the ones * in group 0) */ private int[] remapDocIds(List<ImhotepLocalSession> sessions) throws ImhotepOutOfMemoryException { int nTotalDocs = 0; int numGroups = 0; int newNumDocs = 0; int nextDocId = 0; /* calculate the number of docs and non-group0 docs */ for (int i = 0; i < sessions.size(); i++) { this.sessionDocIdOffsets[i] = nTotalDocs; final ImhotepLocalSession session = sessions.get(i); final GroupLookup gl = session.docIdToGroup; final int numDocs = gl.size(); final int grp0Docs = session.groupDocCount[0]; nTotalDocs += numDocs; newNumDocs += numDocs - grp0Docs; numGroups = Math.max(numGroups, gl.getNumGroups()); } this.newMaxDocs = newNumDocs; /* allocate the old doc id to new doc id mapping */ if (!memory.claimMemory(nTotalDocs * 4L)) { throw new ImhotepOutOfMemoryException(); } final int[] mapping = new int[nTotalDocs]; /* populate mapping and new GroupLookup */ GroupLookup newGL = GroupLookupFactory.create(numGroups, newNumDocs, this.newSession, memory); for (int i = 0; i < sessions.size(); i++) { final GroupLookup gl = sessions.get(i).docIdToGroup; final int offset = this.sessionDocIdOffsets[i]; for (int j = 0; j < gl.size(); j++) { final int group = gl.get(j); if (group != 0) { mapping[j + offset] = nextDocId; newGL.set(nextDocId, group); ++nextDocId; } else { mapping[j + offset] = -1; } } } newGL.recalculateNumGroups(); this.newGroupLookup = newGL; /* * remap the dynamic metrics */ /* allocate the new DynamicMetrics */ /* all session have the same # of dynamic metrics */ int nDynMetrics = sessions.get(0).getDynamicMetrics().size(); if (!memory.claimMemory((nTotalDocs * 4L) * nDynMetrics)) { throw new ImhotepOutOfMemoryException(); } final Map<String, DynamicMetric> newDynMetrics = Maps.newHashMap(); for (int i = 0; i < sessions.size(); i++) { ImhotepLocalSession s = sessions.get(i); final GroupLookup gl = s.docIdToGroup; final int offset = this.sessionDocIdOffsets[i]; for (Map.Entry<String, DynamicMetric> e : s.getDynamicMetrics().entrySet()) { DynamicMetric oldDM = e.getValue(); DynamicMetric newDM = newDynMetrics.get(e.getKey()); if (newDM == null) { newDM = new DynamicMetric(newNumDocs); } for (int j = 0; j < gl.size(); j++) { int docId = mapping[j + offset]; if (docId == -1) continue; newDM.add(docId, oldDM.lookupSingleVal(j)); } newDynMetrics.put(e.getKey(), newDM); } } this.dynamicMetrics = newDynMetrics; return mapping; } }