/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.simple; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.indeed.util.io.BufferedFileDataInputStream; import com.indeed.util.io.BufferedFileDataOutputStream; import com.indeed.flamdex.MemoryFlamdex; import com.indeed.flamdex.api.FlamdexReader; import com.indeed.flamdex.writer.FlamdexDocWriter; import com.indeed.flamdex.writer.FlamdexDocument; import com.indeed.flamdex.writer.FlamdexWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.List; /** * @author jsgroth */ public final class SimpleFlamdexDocWriter implements FlamdexDocWriter { private final String outputDirectory; private final int docBufferSize; private final int mergeFactor; private final List<List<File>> segmentsOnDisk; private MemoryFlamdex currentBuffer = new MemoryFlamdex(); private String currentSegment = "_0"; public SimpleFlamdexDocWriter(String outputDirectory, Config config) throws IOException { createOutputDir(outputDirectory); this.outputDirectory = outputDirectory; this.docBufferSize = config.getDocBufferSize(); this.mergeFactor = config.getMergeFactor(); segmentsOnDisk = Lists.newArrayList(); segmentsOnDisk.add(new ArrayList<File>()); } private static void createOutputDir(String outputDirectory) throws IOException { final File f = new File(outputDirectory); if (f.exists() && !f.isDirectory()) { throw new FileNotFoundException(f + " is not a directory"); } if (!f.exists() && !f.mkdirs()) { throw new IOException("unable to create directory " + f); } } @Override public void addDocument(FlamdexDocument doc) throws IOException { currentBuffer.addDocument(doc); if (currentBuffer.getNumDocs() == docBufferSize) { flush(); currentBuffer = new MemoryFlamdex(); } } private void flush() throws IOException { if (currentBuffer.getNumDocs() == 0) return; final File outFile = new File(outputDirectory, currentSegment); final BufferedFileDataOutputStream out = new BufferedFileDataOutputStream(outFile, ByteOrder.nativeOrder(), 65536); currentBuffer.write(out); out.close(); segmentsOnDisk.get(0).add(outFile); currentSegment = nextSegmentDirectory(currentSegment); int i = 0; while (segmentsOnDisk.get(i).size() == mergeFactor) { final List<File> segments = segmentsOnDisk.get(i); final List<FlamdexReader> readers = Lists.newArrayListWithCapacity(segments.size()); long numDocs = 0; for (final File segment : segments) { final FlamdexReader reader; if (i == 0) { reader = MemoryFlamdex.streamer(new BufferedFileDataInputStream(segment, ByteOrder.nativeOrder(), 65536)); } else { reader = SimpleFlamdexReader.open(segment.getAbsolutePath(), new SimpleFlamdexReader.Config().setWriteBTreesIfNotExisting(false)); } readers.add(reader); numDocs += reader.getNumDocs(); } final File mergeDir = new File(outputDirectory, currentSegment); currentSegment = nextSegmentDirectory(currentSegment); final FlamdexWriter w = new SimpleFlamdexWriter(mergeDir.getAbsolutePath(), numDocs, true, false); SimpleFlamdexWriter.merge(readers, w); w.close(); for (final FlamdexReader reader : readers) { reader.close(); } for (final File segment : segments) { rmrf(segment); } segments.clear(); if (i == segmentsOnDisk.size() - 1) { segmentsOnDisk.add(new ArrayList<File>()); } segmentsOnDisk.get(i + 1).add(mergeDir); ++i; } } @Override public void close() throws IOException { flush(); long numDocs = 0; final List<FlamdexReader> allReaders = Lists.newArrayList(); for (final File file : Iterables.concat(Lists.reverse(segmentsOnDisk.subList(1, segmentsOnDisk.size())))) { final SimpleFlamdexReader reader = SimpleFlamdexReader.open(file.getAbsolutePath(), new SimpleFlamdexReader.Config().setWriteBTreesIfNotExisting(false)); allReaders.add(reader); numDocs += reader.getNumDocs(); } for (final File file : segmentsOnDisk.get(0)) { final FlamdexReader reader = MemoryFlamdex.streamer(new BufferedFileDataInputStream(file, ByteOrder.nativeOrder(), 65536)); allReaders.add(reader); numDocs += reader.getNumDocs(); } final FlamdexWriter w = new SimpleFlamdexWriter(outputDirectory, numDocs, true, true); SimpleFlamdexWriter.merge(allReaders, w); w.close(); for (final FlamdexReader reader : allReaders) { reader.close(); } for (final File file : Iterables.concat(segmentsOnDisk)) { rmrf(file); } } private static String nextSegmentDirectory(String s) { int i = s.length() - 1; while (s.charAt(i) == 'z') { --i; } final StringBuilder sb = new StringBuilder(i == 0 ? s.length() + 1 : s.length()); sb.append(s.substring(0, i)); if (i == 0) { sb.append("_0"); } else { sb.append(s.charAt(i) == '9' ? 'a' : (char)(s.charAt(i) + 1)); } for (int j = i + 1; j < s.length(); ++j) { sb.append('0'); } return sb.toString(); } public static class Config { private int docBufferSize = 500; private int mergeFactor = 100; public int getDocBufferSize() { return docBufferSize; } public int getMergeFactor() { return mergeFactor; } public Config setDocBufferSize(int docBufferSize) { this.docBufferSize = docBufferSize; return this; } public Config setMergeFactor(int mergeFactor) { this.mergeFactor = mergeFactor; return this; } } private static void rmrf(final File file) throws IOException { final Process rmrf = Runtime.getRuntime().exec(new String[]{"rm", "-rf", file.getAbsolutePath()}); try { final int exit = rmrf.waitFor(); if (exit != 0) { throw new IOException ("rm -rf " + file + " failed with exit code " + exit); } } catch (InterruptedException e) { throw new RuntimeException(e); } } }