/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.data;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.PigCounters;
import org.apache.pig.PigWarning;

/**
 * An unordered collection of Tuples with no multiples.  Data is
 * stored without duplicates as it comes in.  When it is time to spill,
 * that data is sorted and written to disk.  It must also be sorted upon
 * the first read, otherwise if a spill happened after that the iterators
 * would have no way to find their place in the new file.  The data is
 * stored in a HashSet.  When it is time to sort it is placed in an
 * ArrayList and then sorted.  Despite all these machinations, this was
 * found to be faster than storing it in a TreeSet.
 *
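 * <p>A minimal usage sketch (illustrative only; in Pig itself bags are
 * usually created through a BagFactory rather than with the constructor,
 * and exception handling for {@code Tuple.set} is omitted):
 * <pre>{@code
 * DataBag bag = new DistinctDataBag();
 * Tuple t = TupleFactory.getInstance().newTuple(1);
 * t.set(0, "hello");
 * bag.add(t);
 * bag.add(t);          // duplicate of an existing tuple, ignored
 * long n = bag.size(); // 1
 * }</pre>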
 */
public class DistinctDataBag extends DefaultAbstractBag {

    private static final long serialVersionUID = 2L;

    private static final Log log = LogFactory.getLog(DistinctDataBag.class);

    private static final InterSedes SEDES = InterSedesFactory.getInterSedesInstance();

    public DistinctDataBag() {
        mContents = new HashSet<Tuple>();
    }

    @Override
    public boolean isSorted() {
        return false;
    }

    @Override
    public boolean isDistinct() {
        return true;
    }

    @Override
    public long size() {
        if (mSpillFiles != null && mSpillFiles.size() > 0) {
            // We need to recalculate size to guarantee a count of unique
            // entries including those on disk
            Iterator<Tuple> iter = iterator();
            int newSize = 0;
            while (iter.hasNext()) {
                newSize++;
                iter.next();
            }

            synchronized (mContents) {
                // we don't want adds to change our numbers
                // the lock may need to cover more of the method
                mSize = newSize;
            }
        }
        return mSize;
    }

    @Override
    public Iterator<Tuple> iterator() {
        return new DistinctDataBagIterator();
    }

    @Override
    public void add(Tuple t) {
        synchronized (mContents) {
            if (mContents.add(t)) {
                mSize++;
            }
        }
        markSpillableIfNecessary();
    }

    @Override
    public long spill() {
        // Make sure we have something to spill.  Don't create empty
        // files, as that will make a mess.
        if (mContents.size() == 0) return 0;

        // Lock the container before I spill, so that iterators aren't
        // trying to read while I'm mucking with the container.
        long spilled = 0;
        synchronized (mContents) {
            DataOutputStream out = null;
            try {
                out = getSpillFile();
            } catch (IOException ioe) {
                // Do not remove the last file from the spilled array.
                // It was not added, as File.createTempFile threw an IOException.
                warn("Unable to create tmp file to spill to disk",
                        PigWarning.UNABLE_TO_CREATE_FILE_TO_SPILL, ioe);
                return 0;
            }
            try {
                // If we've already started reading, then it will already be
                // sorted into an array list.  If not, we need to sort it
                // before writing.
                if (mContents instanceof ArrayList) {
                    Iterator<Tuple> i = mContents.iterator();
                    while (i.hasNext()) {
                        SEDES.writeDatum(out, i.next(), DataType.TUPLE);
                        spilled++;
                        // This will report progress every 16384 records.
                        if ((spilled & 0x3fff) == 0) reportProgress();
                    }
                } else {
                    Tuple[] array = new Tuple[mContents.size()];
                    mContents.toArray(array);
                    Arrays.sort(array);
                    for (int i = 0; i < array.length; i++) {
                        array[i].write(out);
                        spilled++;
                        // This will report progress every 16384 records.
                        if ((spilled & 0x3fff) == 0) reportProgress();
                    }
                }
                out.flush();
            } catch (IOException ioe) {
                // Remove the last file from the spilled array, since we failed to
                // write to it.
                mSpillFiles.remove(mSpillFiles.size() - 1);
                warn("Unable to spill contents to disk",
                        PigWarning.UNABLE_TO_SPILL, ioe);
                return 0;
            } finally {
                if (out != null) {
                    try {
                        out.close();
                    } catch (IOException e) {
                        warn("Error closing spill", PigWarning.UNABLE_TO_CLOSE_SPILL_FILE, e);
                    }
                }
            }
            mContents.clear();
        }
        // Increment the spill count
        incSpillCount(PigCounters.SPILLABLE_MEMORY_MANAGER_SPILL_COUNT);
        return spilled;
    }
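    // How distinctness survives spilling: each spill file is written in
    // sorted order from data that is already duplicate-free, and the
    // in-memory contents are sorted on the first read.  The iterator below
    // then streams a merge of those sorted runs through a TreeSet that
    // holds at most one candidate tuple per input; because TreeSet.add()
    // rejects an element comparing equal to one already queued, a tuple
    // that appears in several runs is still returned only once.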
    /**
     * An iterator that handles getting the next tuple from the bag.  This
     * iterator has a couple of issues to deal with.  First, data can be
     * stored in a combination of in memory and on disk.  Second, the bag
     * may be asked to spill while the iterator is reading it.  This means
     * that it will be pointing to someplace in memory and suddenly it
     * will need to switch to a disk file.
     */
    private class DistinctDataBagIterator implements Iterator<Tuple> {

        private class TContainer implements Comparable<TContainer> {
            public Tuple tuple;
            public int fileNum;

            @Override
            @SuppressWarnings("unchecked")
            public int compareTo(TContainer other) {
                return tuple.compareTo(other.tuple);
            }

            @Override
            public boolean equals(Object obj) {
                if (obj instanceof TContainer)
                    return tuple.equals(((TContainer)obj).tuple);
                else
                    return false;
            }

            @Override
            public int hashCode() {
                return tuple.hashCode();
            }
        }

        // We have to buffer a tuple because there's no easy way for next
        // to tell whether or not there's another tuple available, other
        // than to read it.
        private Tuple mBuf = null;
        private int mMemoryPtr = 0;
        private TreeSet<TContainer> mMergeTree = null;
        private ArrayList<DataInputStream> mStreams = null;
        private int mCntr = 0;

        @SuppressWarnings("unchecked")
        DistinctDataBagIterator() {
            // If this is the first read, we need to sort the data.
            synchronized (mContents) {
                if (mContents instanceof HashSet) {
                    preMerge();
                    // We're the first reader; we need to sort the data.
                    // This is in case it gets dumped under us.
                    ArrayList<Tuple> l = new ArrayList<Tuple>(mContents);
                    Collections.sort(l);
                    mContents = l;
                }
            }
        }

        @Override
        public boolean hasNext() {
            // See if we can find a tuple.  If so, buffer it.
            mBuf = next();
            return mBuf != null;
        }

        @Override
        public Tuple next() {
            // This will report progress every 1024 times through next.
            // This should be much faster than using mod.
            if ((mCntr++ & 0x3ff) == 0) reportProgress();

            // If there's one in the buffer, use that one.
            if (mBuf != null) {
                Tuple t = mBuf;
                mBuf = null;
                return t;
            }

            // Check to see if we just need to read from memory.
            boolean spilled = false;
            synchronized (mContents) {
                if (mSpillFiles == null || mSpillFiles.size() == 0) {
                    return readFromMemory();
                }

                if (mMemoryPtr > 0 && mContents.size() == 0) {
                    spilled = true;
                }
            }

            // Check to see if we were reading from memory but we spilled
            if (spilled) {
                DataInputStream in;
                // We need to open the new file
                // and then fast forward past all of the tuples we've
                // already read.  Then we need to place the first tuple
                // from that file in the priority queue.  Whatever tuples
                // from memory that were already in the queue will be fine,
                // as they're guaranteed to be ahead of the point we fast
                // forward to.
                // We're guaranteed that the file we want to read from for
                // the fast forward is the last element in mSpillFiles,
                // because we don't support calls to add() after calls to
                // iterator(), and spill() won't create empty files.
                try {
                    in = new DataInputStream(new BufferedInputStream(
                            new FileInputStream(mSpillFiles.get(
                                    mSpillFiles.size() - 1))));
                    if (mStreams == null) {
                        mMergeTree = new TreeSet<TContainer>();
                        // We didn't have any files before this spill.
                        mStreams = new ArrayList<DataInputStream>(1);
                    }
                    mStreams.add(in);
                } catch (FileNotFoundException fnfe) {
                    // We can't find our own spill file?  That should never
                    // happen.
                    String msg = "Unable to find our spill file.";
                    log.fatal(msg, fnfe);
                    throw new RuntimeException(msg, fnfe);
                }

                // Fast forward past the tuples we've already put in the
                // queue.
                for (int i = 0; i < mMemoryPtr; i++) {
                    try {
                        SEDES.readDatum(in);
                    } catch (EOFException eof) {
                        // This should never happen, it means we
                        // didn't dump all of our tuples to disk.
                        throw new RuntimeException("Ran out of tuples to read prematurely.", eof);
                    } catch (IOException ioe) {
                        String msg = "Unable to find our spill file.";
                        log.fatal(msg, ioe);
                        throw new RuntimeException(msg, ioe);
                    }
                }
                mMemoryPtr = 0;
                // Add the next tuple from this file to the queue.
                addToQueue(null, mSpillFiles.size() - 1);
                // Fall through to read the next entry from the priority
                // queue.
            }

            // We have spill files, so we need to read the next tuple from
            // one of those files or from memory.
            return readFromTree();
        }

        /**
         * Not implemented.
         */
        @Override
        public void remove() {}

        private Tuple readFromTree() {
            if (mMergeTree == null) {
                // First read, we need to set up the queue and the array of
                // file streams
                mMergeTree = new TreeSet<TContainer>();

                // Add one to the size in case we spill later.
                mStreams =
                    new ArrayList<DataInputStream>(mSpillFiles.size() + 1);

                Iterator<File> i = mSpillFiles.iterator();
                while (i.hasNext()) {
                    try {
                        DataInputStream in =
                            new DataInputStream(new BufferedInputStream(
                                new FileInputStream(i.next())));
                        mStreams.add(in);
                        // Add the first tuple from this file into the
                        // merge queue.
                        addToQueue(null, mStreams.size() - 1);
                    } catch (FileNotFoundException fnfe) {
                        // We can't find our own spill file?  That should
                        // never happen.
                        String msg = "Unable to find our spill file.";
                        log.fatal(msg, fnfe);
                        throw new RuntimeException(msg, fnfe);
                    }
                }

                // Prime one from memory too
                if (mContents.size() > 0) {
                    addToQueue(null, -1);
                }
            }

            if (mMergeTree.size() == 0) return null;

            // Pop the top one off the queue
            TContainer c = mMergeTree.first();
            mMergeTree.remove(c);

            // Add the next tuple from wherever we read from into the
            // queue.  Buffer the tuple we're returning, as we'll be
            // reusing c.
            Tuple t = c.tuple;
            addToQueue(c, c.fileNum);

            return t;
        }
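        /**
         * Pull the next tuple from one input of the merge and add it to
         * the merge tree, skipping over tuples that compare equal to one
         * already queued.
         *
         * @param c container to reuse, or null to allocate a fresh one
         * @param fileNum index into mStreams of the spill file to read
         *        from, or -1 to read from the in-memory list
         */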
        private void addToQueue(TContainer c, int fileNum) {
            if (c == null) {
                c = new TContainer();
            }
            c.fileNum = fileNum;

            if (fileNum == -1) {
                // Need to read from memory.  We may have spilled since
                // this tuple was put in the queue, and hence memory might
                // be empty.  But I don't care, as then I just won't add
                // any more from memory.
                synchronized (mContents) {
                    do {
                        c.tuple = readFromMemory();
                        if (c.tuple != null) {
                            // If we find a unique entry, then add it to the queue.
                            // Otherwise ignore it and keep reading.
                            if (mMergeTree.add(c)) {
                                return;
                            }
                        }
                    } while (c.tuple != null);
                }
                return;
            }

            // Read the next tuple from the indicated file
            DataInputStream in = mStreams.get(fileNum);
            if (in != null) {
                // There's still data in this file
                do {
                    try {
                        c.tuple = (Tuple) SEDES.readDatum(in);
                        // If we find a unique entry, then add it to the queue.
                        // Otherwise ignore it and keep reading.  If we run out
                        // of tuples to read that's fine, we just won't add a
                        // new one from this file.
                        if (mMergeTree.add(c)) {
                            return;
                        }
                    } catch (EOFException eof) {
                        // Out of tuples in this file.  Set our slot in the
                        // array to null so we don't keep trying to read from
                        // this file.
                        try {
                            in.close();
                        } catch (IOException e) {
                            log.warn("Failed to close spill file.", e);
                        }
                        mStreams.set(fileNum, null);
                        return;
                    } catch (IOException ioe) {
                        String msg = "Unable to find our spill file.";
                        log.fatal(msg, ioe);
                        throw new RuntimeException(msg, ioe);
                    }
                } while (true);
            }
        }

        // Function assumes that the reader lock is already held before we enter
        // this function.
        private Tuple readFromMemory() {
            if (mContents.size() == 0) return null;

            if (mMemoryPtr < mContents.size()) {
                return ((ArrayList<Tuple>)mContents).get(mMemoryPtr++);
            } else {
                return null;
            }
        }

        /**
         * Pre-merge if there are too many spill files.  This avoids the issue
         * of having too large a fan out in our merge.  Experimentation by
         * the hadoop team has shown that 100 is about the optimal number
         * of spill files.  This function modifies the mSpillFiles array
         * and assumes the write lock is already held.  It will not unlock it.
         *
         * Tuples are reconstituted as tuples, evaluated, and rewritten as
         * tuples.  This is expensive, but I don't know how to read tuples
         * from the file otherwise.
         *
         * This function is slightly different than the one in
         * SortedDataBag, as it uses a TreeSet instead of a PriorityQ.
         */
        private void preMerge() {
            if (mSpillFiles == null ||
                    mSpillFiles.size() <= MAX_SPILL_FILES) {
                return;
            }

            // While there are more than max spill files, gather max spill
            // files together and merge them into one file.  Then remove the others
            // from mSpillFiles.  The new spill files are attached at the
            // end of the list, so I can just keep going until I get a
            // small enough number without too much concern over uneven
            // size merges.  Convert mSpillFiles to a linked list since
            // we'll be removing pieces from the middle and we want to do
            // it efficiently.
            try {
                LinkedList<File> ll = new LinkedList<File>(mSpillFiles);
                LinkedList<File> filesToDelete = new LinkedList<File>();
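                // Each pass of the loop below merges the first
                // MAX_SPILL_FILES entries of ll into a single new spill
                // file appended to the end of the list, so the file count
                // shrinks quickly.  For illustration, taking MAX_SPILL_FILES
                // as 100: 250 spill files drop to 151 after the first pass
                // and to 52 after the second, at which point the loop exits.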
                while (ll.size() > MAX_SPILL_FILES) {
                    ListIterator<File> i = ll.listIterator();
                    mStreams =
                        new ArrayList<DataInputStream>(MAX_SPILL_FILES);
                    mMergeTree = new TreeSet<TContainer>();

                    for (int j = 0; j < MAX_SPILL_FILES; j++) {
                        try {
                            File f = i.next();
                            DataInputStream in =
                                new DataInputStream(new BufferedInputStream(
                                    new FileInputStream(f)));
                            mStreams.add(in);
                            addToQueue(null, mStreams.size() - 1);
                            i.remove();
                            filesToDelete.add(f);
                        } catch (FileNotFoundException fnfe) {
                            // We can't find our own spill file?  That should
                            // never happen.
                            String msg = "Unable to find our spill file.";
                            log.fatal(msg, fnfe);
                            throw new RuntimeException(msg, fnfe);
                        }
                    }

                    // Get a new spill file.  This adds one to the end of
                    // the spill files list.  So I need to append it to my
                    // linked list as well so that it's still there when I
                    // move my linked list back to the spill files.
                    try {
                        DataOutputStream out = getSpillFile();
                        ll.add(mSpillFiles.get(mSpillFiles.size() - 1));
                        Tuple t;
                        while ((t = readFromTree()) != null) {
                            t.write(out);
                        }
                        out.flush();
                        out.close();
                    } catch (IOException ioe) {
                        String msg = "Unable to find our spill file.";
                        log.fatal(msg, ioe);
                        throw new RuntimeException(msg, ioe);
                    }
                }

                // delete files that have been merged into new files
                for (File f : filesToDelete) {
                    if (!f.delete()) {
                        log.warn("Failed to delete spill file: " + f.getPath());
                    }
                }

                // clear the list, so that finalize does not delete any files,
                // when mSpillFiles is assigned a new value
                mSpillFiles.clear();

                // Now, move our new list back to the spill files array.
                mSpillFiles = new FileList(ll);
            } finally {
                // Reset mStreams and mMergeTree so that they'll be allocated
                // properly for regular merging.
                mStreams = null;
                mMergeTree = null;
            }
        }
    }
}