/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.ByteArrayOutputStream; import java.io.CharArrayWriter; import java.io.FilterOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.BitSet; import java.util.Date; import java.util.LinkedList; import java.util.Map; import java.util.SortedSet; import java.util.StringTokenizer; import java.util.Vector; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.locks.AbstractQueuedSynchronizer; import java.util.concurrent.locks.ReentrantLock; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableName; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.crawl.common.shared.Constants; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOBufferListOutputStream; import org.commoncrawl.io.NIODataSink; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.protocol.ArcFileWriterStats; import org.commoncrawl.protocol.CrawlURL; import org.commoncrawl.protocol.MimeTypeCount; import org.commoncrawl.protocol.shared.ArcFileItem; import org.commoncrawl.util.GZIPUtils.UnzipResult; import org.junit.Test; import com.google.common.collect.TreeMultimap; /** * * @author rana * */ public class ArcFileWriter { /** logging **/ private static final Log LOG = LogFactory .getLog(ArcFileWriter.class); private static SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat( "yyyyMMddHHmmss"); private static SimpleDateFormat FILENAME_TIMESTAMP = new SimpleDateFormat( "yyyy/MM/dd/"); public static final int MAX_SIZE_DEFAULT = 100000000; private static final int MAX_WRITERS_DEFAULT = 10; private static final String DEFAULT_ENCODING = "ISO-8859-1"; private static final String ARC_MAGIC_NUMBER = "filedesc://"; public static final char LINE_SEPARATOR = '\n'; private static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0, 0, 0, 0, 0 }; private final static Pattern TRUNCATION_REGEX = Pattern .compile("^([^\\s;,]+).*"); private static final String NO_TYPE_MIMETYPE = "no-type"; private static final int MAX_METADATA_LINE_LENGTH = (8 * 1024); private static final Pattern 
METADATA_LINE_PATTERN = Pattern .compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$"); private static final char HEADER_FIELD_SEPARATOR = ' '; private static final String UTF8 = "UTF-8"; private FileSystem _fileSystem; private Path _outputPath; private int _id; private int _maxSize = MAX_SIZE_DEFAULT; private int _maxWriters = MAX_WRITERS_DEFAULT; private Semaphore _maxWritersSemaphore = null; private Vector<ArcFile> _arcFiles = new Vector<ArcFile>(); private String _activeFileName = null; private int _lastItemPos = -1; private int _lastItemCompressedSize = -1; private TreeMultimap<String, Integer> _mimeTypeCounts = TreeMultimap .create(); public static final String ARC_FILE_SUFFIX = ".arc.gz"; private OutputStream _out = null; private static BitSet dontNeedEncoding; static final int caseDiff = ('a' - 'A'); static { dontNeedEncoding = new BitSet(256); // alpha characters for (int i = 'a'; i <= 'z'; i++) { dontNeedEncoding.set(i); } for (int i = 'A'; i <= 'Z'; i++) { dontNeedEncoding.set(i); } // numeric characters for (int i = '0'; i <= '9'; i++) { dontNeedEncoding.set(i); } // special chars dontNeedEncoding.set('-'); dontNeedEncoding.set('~'); dontNeedEncoding.set('_'); dontNeedEncoding.set('.'); dontNeedEncoding.set('*'); dontNeedEncoding.set('/'); dontNeedEncoding.set('='); dontNeedEncoding.set('&'); dontNeedEncoding.set('+'); dontNeedEncoding.set(','); dontNeedEncoding.set(':'); dontNeedEncoding.set(';'); dontNeedEncoding.set('@'); dontNeedEncoding.set('$'); dontNeedEncoding.set('!'); dontNeedEncoding.set(')'); dontNeedEncoding.set('('); // experiments indicate: Firefox (1.0.6) never escapes '%' dontNeedEncoding.set('%'); // experiments indicate: Firefox (1.0.6) does not escape '|' or ''' dontNeedEncoding.set('|'); dontNeedEncoding.set('\''); } private static class BufferItem { public BufferItem(ByteBuffer bufferItem) { _buffer = bufferItem; } public ByteBuffer _buffer; }; private static final class ThreadSync extends AbstractQueuedSynchronizer { /** * */ private static final long serialVersionUID = 8771504638721679952L; ThreadSync() { setState(0); } int getCount() { return getState(); } public int tryAcquireShared(int acquires) { return getState() == 0 ? 1 : -1; } public boolean tryReleaseShared(int releases) { // Decrement count; signal when transition to zero for (;;) { int c = getState(); if (c == 0) return false; int nextc = c - 1; if (compareAndSetState(c, nextc)) return nextc == 0; } } public void incrementCount() { // loop until we can atomically increment ... 
for (;;) { int c = getState(); int nextc = c + 1; if (compareAndSetState(c, nextc)) break; } } } private ThreadSync _activeWriterCount = new ThreadSync(); private final class ArcFile implements NIODataSink { private Path _hdfsPath; private NIOBufferList _buffer = new NIOBufferList(); private NIOBufferListOutputStream _nioStream = new NIOBufferListOutputStream( _buffer); private int _streamPos = 0; public int _totalHeaderBytesWritten = 0; public int _totalContentBytesWritten = 0; public int _itemsWritten = 0; public int _compressedBytesWritten = 0; private final ReentrantLock queueLock = new ReentrantLock(); private OutputStream _out = new FilterOutputStream( _nioStream) { @Override public void write( int b) throws IOException { ++_streamPos; _nioStream .write(b); } @Override public void write( byte[] b, int off, int len) throws IOException { _streamPos += len; _nioStream .write( b, off, len); }; }; private LinkedBlockingQueue<BufferItem> _consumerQueue = new LinkedBlockingQueue<BufferItem>(); private LinkedList<BufferItem> _rewindQueue = new LinkedList<BufferItem>(); private FSDataOutputStream _hdfsStream = null; private FileSystem _hdfs = null; private Thread _hdfsWriterThread = null; private long _timestamp; // bytes consumed via Blocking Consumer interface ... int _bytesConsumed = 0; private boolean _abort = false; // failure exception ... if any ... private IOException _failureException = null; private void restartWrite() throws IOException { LOG.info("Restarting Write of File:" + _hdfsPath); if (_hdfsStream != null) { LOG .warn("HDFSStream != NULL for File:" + _hdfsPath + " during restart"); _hdfsStream.close(); _hdfsStream = null; } LOG.info("REWIND - Deleting File :" + _hdfsPath); // delete existing ... _hdfs.delete(_hdfsPath,false); LOG.info("REWIND - ReCreating File :" + _hdfsPath); // create new file stream ... _hdfsStream = _hdfs.create(_hdfsPath); // lock queue try { queueLock.lock(); ArrayList<BufferItem> itemList = new ArrayList<BufferItem>(); LOG.info("REWIND - There are:" + _rewindQueue.size() + " Items in the Rewind Queue for File :" + _hdfsPath); itemList.addAll(_rewindQueue); LOG.info("REWIND - There are:" + _consumerQueue.size() + " Items in the Consumer Queue for File :" + _hdfsPath); _consumerQueue.drainTo(_rewindQueue); _consumerQueue.clear(); int itemCount = 0; for (BufferItem bufferItem : itemList) { _consumerQueue.offer(bufferItem); itemCount++; } LOG.info("REWIND - There should be:" + itemCount + " Items in the Consumer Queue for File :" + _hdfsPath); _rewindQueue.clear(); } finally { queueLock.unlock(); } } public ArcFile(FileSystem fileSystem, Path arcFilePath, long timestamp) throws IOException { // first things first ... we need to acquire the writer semaphore ... _maxWritersSemaphore.acquireUninterruptibly(); // increment thread count in parent class ... _activeWriterCount.incrementCount(); // store hdfs filesystem reference ... _hdfs = fileSystem; // and the path to our arc file ... _hdfsPath = arcFilePath; // delete existing ... _hdfs.delete(_hdfsPath,false); // create new file stream ... _hdfsStream = _hdfs.create(_hdfsPath); // and setup the consumer queue relationship _buffer.setSink(this); // store timestamp that was used to create unique filename _timestamp = timestamp; // and finally start the blocking writer thread ... 
      _hdfsWriterThread = new Thread(new Runnable() {

        public void run() {

          LOG.info("Writing File:" + _hdfsPath.toString());

          test: for (;;) {
            try {
              BufferItem item = _consumerQueue.take();

              // add item to rewind queue
              _rewindQueue.addLast(item);

              // if the buffer item is null ... this is considered an eof
              // condition ... break out ...
              if (item._buffer == null) {
                // LOG.info("Received Null BufferItem ... Shutting down File:" +
                // _hdfsPath.toString());
                // time to shutdown the stream ...
                try {
                  _hdfsStream.flush();
                  _hdfsStream.close();
                  _hdfsStream = null;
                  break;
                } catch (IOException e) {
                  if (!_abort) {
                    LOG.error("Exception During Flush of File:" + _hdfsPath
                        + "(Restarting) Exception:"
                        + CCStringUtils.stringifyException(e));
                    try {
                      _hdfsStream = null;
                      restartWrite();
                      continue test;
                    } catch (IOException e2) {
                      LOG.error("Restart of Stream:" + _hdfsPath.toString()
                          + " Failed with Exception:"
                          + CCStringUtils.stringifyException(e2));
                      _failureException = e2;
                      // break out of outer loop
                      break;
                    }
                  } else {
                    LOG.error("Aborting Operation for File:" + _hdfsPath);
                    break;
                  }
                }
              }
              // otherwise ... write the buffer to the hdfs stream ...
              else {
                try {
                  int arrayOffset = item._buffer.arrayOffset();
                  arrayOffset += item._buffer.position();
                  int end = item._buffer.limit();
                  byte[] byteBuffer = item._buffer.array();
                  // LOG.info("Wrote:" + (end-arrayOffset) + "bytes for File:" +
                  // _hdfsPath.toString());
                  // write the buffer to disk ...
                  _hdfsStream.write(byteBuffer, arrayOffset, end - arrayOffset);
                } catch (IOException e) {
                  try {
                    _hdfsStream.close();
                  } catch (IOException e2) {
                    LOG.error("Ignoring Exception During Close:"
                        + CCStringUtils.stringifyException(e2));
                  } finally {
                    _hdfsStream = null;
                  }
                  if (!_abort) {
                    LOG.error("Exception During Write of File:" + _hdfsPath
                        + "(Restarting) Exception:"
                        + CCStringUtils.stringifyException(e));
                    try {
                      restartWrite();
                      continue test;
                    } catch (IOException e2) {
                      LOG.error("Restart of Stream:" + _hdfsPath.toString()
                          + " Failed with Exception:"
                          + CCStringUtils.stringifyException(e2));
                      _failureException = e2;
                      // break out of outer loop
                      break;
                    }
                  } else {
                    LOG.error("Aborting Operation for File:" + _hdfsPath);
                    break;
                  }
                }
              }
            } catch (InterruptedException e) {
              // interrupted ... loop around and retry the take
            }
          }

          LOG.info("Finished Writing File:" + _hdfsPath.toString()
              + ". Clearing Rewind Queue");
          _rewindQueue.clear();
          // release our reference to ourselves ...
          _hdfsWriterThread = null;
          // and release the semaphore ...
          _maxWritersSemaphore.release();
          // decrement the active thread count ...
          _activeWriterCount.releaseShared(1);
        }
      });

      // launch the writer thread ...
      _hdfsWriterThread.start();
    }

    public void available(ByteBuffer availableReadBuffer) {
      try {
        queueLock.lock();
        _consumerQueue.offer(new BufferItem(availableReadBuffer));
        _bytesConsumed += availableReadBuffer.remaining();
      } finally {
        queueLock.unlock();
      }
    }

    public void finished() {
      // NOOP
    }

    public void freeze() {
      // add an empty buffer to the consumer queue ... which will trigger the
      // writer thread to flush and terminate ...
      _consumerQueue.offer(new BufferItem(null));
    }

    public OutputStream getOutputStream() {
      return _out;
    }

    public IOException getFailureException() {
      return _failureException;
    }

    public long getTimestamp() {
      return _timestamp;
    }

    /**
     * get the stream position (the number of bytes written to the output
     * stream (or file))
     */
    public int getStreamPos() {
      return _streamPos;
    }

    /** get the estimated output file size **/
    public int getFileSize() {
      int fileSizeOut = 0;
      // pick up any pending (unflushed) data ...
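      // (capacity - remaining) on the partially filled write buffer is data
      // written into it but not yet pushed to the consumer queue; adding
      // _bytesConsumed (bytes already handed to the HDFS writer thread) gives
      // the estimated on-disk size of the file.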
      ByteBuffer writeBuffer = _buffer.peekAtWriteBuffer();
      if (writeBuffer != null) {
        fileSizeOut += writeBuffer.capacity() - writeBuffer.remaining();
      }
      fileSizeOut += _bytesConsumed;
      return fileSizeOut;
    }

    public void flush() {
      _buffer.flush();
    }

    public void close() {
      if (_hdfsWriterThread != null) {
        throw new RuntimeException(
            "Arc File close called w/ writer thread still running ...!");
      }
      // ok ... this is called either in a clean state or in a non-clean state.
      // if the stream is still open we are in a non-clean state ... close it ...
      if (_hdfsStream != null) {
        _abort = true;
        try {
          _hdfsStream.close();
          _hdfsStream = null;
        } catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
        // time to delete the underlying file since it is corrupt ...
        try {
          _hdfs.delete(_hdfsPath, false);
        } catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
        // and set the error condition (if not already set)
        if (_failureException == null) {
          _failureException = new IOException(
              "ArcFile close called on file in improper state");
        }
      }
    }

    public void delete() {
      try {
        _hdfs.delete(_hdfsPath, false);
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }
  }

  /** Unit Test Constructor **/
  public ArcFileWriter() throws IOException {

    if (CrawlEnvironment.getHadoopConfig() == null) {
      Configuration conf = new Configuration();
      conf.addResource("commoncrawl-default.xml");
      conf.addResource("commoncrawl-site.xml");
      CrawlEnvironment.setHadoopConfig(conf);
    }

    _fileSystem = CrawlEnvironment.getDefaultFileSystem();
    _outputPath = new Path("crawl/test");
    _id = 1;
    _maxWritersSemaphore = new Semaphore(_maxWriters);

    rotateFile();
  }

  /**
   * constructor for the arc file writer
   * 
   * @throws IOException
   */
  public ArcFileWriter(FileSystem fileSystem, Path outputPath, int writerId,
      int maxSimultaneousWriters) throws IOException {

    _fileSystem = fileSystem;
    _outputPath = outputPath;
    _id = writerId;
    _maxWriters = maxSimultaneousWriters;
    _maxWritersSemaphore = new Semaphore(_maxWriters);

    // set up the initial arc file ...
    rotateFile();
  }

  @Test
  public void testArcFileWriter() throws Exception {

    Path crawlFilePath = new Path(
        "crawl/checkpoint_data/CrawlLog_cc08_1210918849380");

    WritableName.setName(CrawlURL.class, "org.crawlcommons.protocol.CrawlURL");

    SequenceFile.Reader reader = new SequenceFile.Reader(_fileSystem,
        crawlFilePath, CrawlEnvironment.getHadoopConfig());

    Text url = new Text();
    CrawlURL urlData = new CrawlURL();

    while (reader.next(url, urlData)) {
      NIOHttpHeaders headers = CrawlURLHelper.getHeadersFromCrawlURL(urlData);
      write(url.toString(), 1, 1, urlData, headers, "text/html", "test");
    }
    reader.close();

    this.close(false);
  }

  public ArcFileWriterStats close(boolean purgeOutput) throws IOException {

    ArcFileWriterStats statsOut = new ArcFileWriterStats();

    if (getActiveFile() != null) {
      LOG.info("Closing ArcFileWriter ... flushing active file");
      // flush any partial writes ...
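      // flush() pushes any buffered output into the consumer queue, and
      // freeze() enqueues a null BufferItem which the HDFS writer thread
      // treats as an EOF marker, causing it to flush and close its stream.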
getActiveFile().flush(); getActiveFile().freeze(); } LOG.info("Generating Stats"); // flush mime type stats for (Map.Entry<String, Integer> mimeTypeEntry : _mimeTypeCounts.entries()) { MimeTypeCount mimeTypeCount = new MimeTypeCount(); mimeTypeCount.setMimeType(mimeTypeEntry.getKey()); mimeTypeCount.setCount(mimeTypeEntry.getValue()); statsOut.getMimeTypeCounts().add(mimeTypeCount); } _mimeTypeCounts.clear(); SmoothedAverage itemsPerArcFileAvg = new SmoothedAverage(.25); for (ArcFile arcFile : _arcFiles) { statsOut.setArcFilesWritten(statsOut.getArcFilesWritten() + 1); statsOut.setTotalItemsWritten(statsOut.getTotalItemsWritten() + arcFile._itemsWritten); itemsPerArcFileAvg.addSample(arcFile._itemsWritten); statsOut.setHeaderBytesWritten(statsOut.getHeaderBytesWritten() + arcFile._totalHeaderBytesWritten); statsOut.setContentBytesWritten(statsOut.getContentBytesWritten() + arcFile._totalContentBytesWritten); statsOut.setCompressedBytesWritten(statsOut.getCompressedBytesWritten() + arcFile._compressedBytesWritten); } statsOut.setAverageItemsPerFile((float) itemsPerArcFileAvg.getAverage()); LOG.info("Closing ArcFileWriter ... waiting for all writers to complete"); // now wait for all arc files writes to finish ... _activeWriterCount.acquireShared(1); LOG.info("Closing ArcFileWriter ... all writers completed. closing files"); IOException exceptionOut = null; // now walk arc files collecting any exceptions ... for (ArcFile arcFile : _arcFiles) { if (arcFile.getFailureException() != null) { exceptionOut = arcFile.getFailureException(); } arcFile.close(); } LOG.info("Closing ArcFileWriter ... close complete"); if (purgeOutput) { LOG.info("Purging ArcFiles Due to Possible Error"); for (ArcFile arcFile : _arcFiles) { arcFile.delete(); } } _arcFiles.clear(); if (exceptionOut != null) throw exceptionOut; return statsOut; } private String escapeURI(String uri, String charsetEncoding) throws IOException { boolean needToChange = false; StringBuffer out = new StringBuffer(uri.length()); Charset charset; CharArrayWriter charArrayWriter = new CharArrayWriter(); if (charsetEncoding == null) throw new NullPointerException("charsetName"); try { charset = Charset.forName(charsetEncoding); } catch (IllegalCharsetNameException e) { throw new UnsupportedEncodingException(charsetEncoding); } catch (UnsupportedCharsetException e) { throw new UnsupportedEncodingException(charsetEncoding); } for (int i = 0; i < uri.length();) { int c = (int) uri.charAt(i); // System.out.println("Examining character: " + c); if (dontNeedEncoding.get(c)) { out.append((char) c); i++; } else { // convert to external encoding before hex conversion do { charArrayWriter.write(c); /* * If this character represents the start of a Unicode surrogate pair, * then pass in two characters. It's not clear what should be done if * a bytes reserved in the surrogate pairs range occurs outside of a * legal surrogate pair. For now, just treat it as if it were any * other character. 
*/ if (c >= 0xD800 && c <= 0xDBFF) { /* * System.out.println(Integer.toHexString(c) + * " is high surrogate"); */ if ((i + 1) < uri.length()) { int d = (int) uri.charAt(i + 1); /* * System.out.println("\tExamining " + Integer.toHexString(d)); */ if (d >= 0xDC00 && d <= 0xDFFF) { /* * System.out.println("\t" + Integer.toHexString(d) + * " is low surrogate"); */ charArrayWriter.write(d); i++; } } } i++; } while (i < uri.length() && !dontNeedEncoding.get((c = (int) uri.charAt(i)))); charArrayWriter.flush(); String str = new String(charArrayWriter.toCharArray()); byte[] ba = str.getBytes(charsetEncoding); for (int j = 0; j < ba.length; j++) { out.append('%'); char ch = Character.forDigit((ba[j] >> 4) & 0xF, 16); // converting to use uppercase letter as part of // the hex value if ch is a letter. if (Character.isLetter(ch)) { ch -= caseDiff; } out.append(ch); ch = Character.forDigit(ba[j] & 0xF, 16); if (Character.isLetter(ch)) { ch -= caseDiff; } out.append(ch); } charArrayWriter.reset(); needToChange = true; } } return (needToChange ? out.toString() : uri); } /** * write a url entry via the arc file writer NOTE: BY DESIGN this call could * BLOCK if the number of active writers exceeds the value specified by * maxSimultaneousWriters (in the constructor) * **/ public boolean write(String normalizedURL, int segmentid, int crawlNumber, CrawlURL urlItem, NIOHttpHeaders headers, String contentType, String signature) throws IOException { boolean generatedARCFileContent = false; // String encodedURI = escapeURI(normalizedURL,UTF8); String encodedURI = normalizedURL; GoogleURL url = new GoogleURL(normalizedURL); if (url.isValid()) { encodedURI = url.getCanonicalURL(); } int hostIP = urlItem.getServerIP(); String hostIPStr = IPAddressUtils.IntegerToIPAddressString(hostIP); long fetchBeginTimestamp = urlItem.getLastAttemptTime(); String encoding = headers.findValue("Content-Encoding"); String truncationFlags = ""; if ((urlItem.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) { truncationFlags += ArcFileItem.Flags .toString(ArcFileItem.Flags.TruncatedInDownload); } byte[] crawlData = urlItem.getContentRaw().getReadOnlyBytes(); int crawlDataLen = (crawlData != null) ? crawlData.length : 0; // validate content type ... if (contentType == null) { LOG.error("URL:" + normalizedURL + " Rejected - Invalid Content Type:" + contentType); } else { if (crawlData != null && encoding != null && encoding.equalsIgnoreCase("gzip")) { int compressedSize = crawlData.length; try { UnzipResult result = GZIPUtils.unzipBestEffort(crawlData, CrawlEnvironment.CONTENT_SIZE_LIMIT); crawlData = result.data.get(); crawlDataLen = result.data.getCount(); if (result.wasTruncated) { if (truncationFlags.length() != 0) truncationFlags += ","; truncationFlags += ArcFileItem.Flags .toString(ArcFileItem.Flags.TruncatedInInflate); } } catch (Exception e) { LOG.error("URL:" + normalizedURL + " Rejected - GZIP Decompression Failed"); crawlData = null; } } // content must not be null if (crawlData == null) { LOG.error("URL:" + normalizedURL + " Rejected - Content is NULL"); } else { // add in our custom headers ... 
          headers.add(Constants.ARCFileHeader_ParseSegmentId,
              ((Integer) segmentid).toString());
          headers.add(Constants.ARCFileHeader_OriginalURL, normalizedURL);
          headers.add(Constants.ARCFileHeader_URLFP, Long.toString(urlItem
              .getFingerprint()));
          headers.add(Constants.ARCFileHeader_HostFP, Long.toString(urlItem
              .getHostFP()));
          headers.add(Constants.ARCFileHeader_Signature, signature);
          headers.add(Constants.ARCFileHeader_CrawlNumber, Integer
              .toString(crawlNumber));
          headers.add(Constants.ARCFileHeader_FetchTimeStamp, Long
              .toString(urlItem.getLastAttemptTime()));
          // headers.add(Environment.ARCFileHeader_CrawlerId,
          // Integer.toString((int)urlItem.get));

          if (truncationFlags.length() != 0) {
            headers.add(Constants.ARCFileHeader_ContentTruncated,
                truncationFlags);
          }

          String headerString = headers.toString() + "\r\n";
          byte[] headerBytes = headerString.getBytes("UTF-8");

          // content is truncated further upstream, so this redundant check /
          // truncation is problematic
          // int contentLength = Math.min(crawlData.length,CONTENT_SIZE_LIMIT);

          // extract the metadata line up front, since if the url exceeds a
          // certain size limit, we are going to reject the entry ...
          byte metaDataLine[];

          try {
            metaDataLine = getMetaLine(encodedURI, contentType, hostIPStr,
                fetchBeginTimestamp, crawlDataLen + headerBytes.length)
                .getBytes(UTF8);
          } catch (IOException e) {
            LOG.error("Metadata Line Validation FAILED with Exception:"
                + CCStringUtils.stringifyException(e));
            // bail here ...
            return false;
          }

          // get ready to write out a new gzipped entry ...
          preWriteRecordTasks(headerBytes.length, crawlDataLen, contentType);

          try {
            // ready to write an entry ...
            write(metaDataLine);
            // write out the headers ...
            write(headerBytes, 0, headerBytes.length);
            // write out the content
            write(crawlData, 0, crawlDataLen);
            // line separator ...
            write(LINE_SEPARATOR);
            // indicate success ...
            generatedARCFileContent = true;
          } finally {
            // flush the gzip stream ...
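            // postWriteRecordTasks() finishes the per-record GZIP member and
            // unwraps the compressed stream, so each ARC record becomes its
            // own gzip member within the output file.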
            postWriteRecordTasks();
          }
        }
      }
    return generatedARCFileContent;
  }

  /**
   * 
   * @return timestamp of the current arc file
   */
  public long getActiveFileTimestamp() {
    return getActiveFile().getTimestamp();
  }

  /**
   * 
   * @return the position in the arc file of the last written item
   */
  public int getLastItemPos() {
    return _lastItemPos;
  }

  /**
   * 
   * @return the compressed size (within the arc file) of the last written item
   */
  public int getLastItemCompressedSize() {
    return _lastItemCompressedSize;
  }

  private ArcFile getActiveFile() {
    if (_arcFiles.size() != 0) {
      return _arcFiles.lastElement();
    }
    return null;
  }

  private static NIOHttpHeaders getHeadersFromString(String headers) {
    NIOHttpHeaders headersOut = new NIOHttpHeaders();

    StringTokenizer tokenizer = new StringTokenizer(headers, "\r\n");
    while (tokenizer.hasMoreElements()) {
      String token = tokenizer.nextToken();
      if (token != null && token.length() != 0) {
        int colonPos = token.indexOf(':');
        if (colonPos != -1 && colonPos != token.length() - 1) {
          String key = token.substring(0, colonPos);
          String value = token.substring(colonPos + 1);
          if (key.length() != 0 && value.length() != 0) {
            headersOut.add(key, value);
          }
        } else {
          headersOut.add(null, token);
        }
      }
    }
    return headersOut;
  }

  public static String getMetaLine(String uri, String contentType,
      String hostIP, long fetchBeginTimeStamp, long recordLength)
      throws IOException {

    if (fetchBeginTimeStamp <= 0) {
      throw new IOException("Bogus fetchBeginTimestamp: "
          + Long.toString(fetchBeginTimeStamp));
    }

    return createMetaline(uri, hostIP, TIMESTAMP14.format(new Date(
        fetchBeginTimeStamp)), contentType, Long.toString(recordLength));
  }

  public static String createMetaline(String uri, String hostIP,
      String timeStamp, String mimetype, String recordLength) {
    return uri + HEADER_FIELD_SEPARATOR + hostIP + HEADER_FIELD_SEPARATOR
        + timeStamp + HEADER_FIELD_SEPARATOR + mimetype
        + HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
  }

  protected void rotateFile() throws IOException {

    if (getActiveFile() != null) {
      ArcFile activeFile = getActiveFile();
      // flush any partial writes ...
      activeFile.flush();
      // close it ...
      activeFile.freeze();
    }

    // generate a timestamp value ...
    long timestamp = System.currentTimeMillis();
    // create a new arc file name based on path and timestamp
    _activeFileName = generateNewARCFilename(timestamp);
    // create the arc file path ...
    Path arcFilePath = new Path(_outputPath, _activeFileName);
    // and create a new ArcFile object ...
    ArcFile newArcFile = new ArcFile(_fileSystem, arcFilePath, timestamp);
    // and make it the active arc file ...
    _arcFiles.add(newArcFile);
    // and set up the output stream ...
    _out = newArcFile.getOutputStream();
    // and write out the first record ...
    writeFirstRecord(TIMESTAMP14.format(new Date(System.currentTimeMillis())));
  }

  private String generateNewARCFilename(long timestamp) {
    return timestamp + "_" + _id + ARC_FILE_SUFFIX;
    /*
     * Date date = new Date(timestamp); String arcFileName =
     * FILENAME_TIMESTAMP.format(date) + timestamp + "-" + _id + "arc.gz";
     * return arcFileName;
     */
  }

  private String getARCFilename() {
    return _activeFileName;
  }

  /**
   * Call this method just before/after any significant write.
   * 
   * Call at the end of the writing of a record or just before we start writing
   * a new record. Will close the current file and open a new file if the file
   * size has exceeded maxSize.
   * 
   * <p>
   * Creates and opens a file if none is already open. One use of this method,
   * then, is after construction: call this method to add the metadata, then
   * call {@link #getPosition()} to find the offset of the first record.
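   * <p>
   * With the defaults this means a new ARC file is started once the active
   * file's estimated size exceeds MAX_SIZE_DEFAULT (100,000,000 bytes), or
   * immediately if no file is open yet.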
* * @exception IOException */ private void checkSize(int headerBytesLength, int contentBytesLength) throws IOException { if (getActiveFile() == null || (_maxSize != -1 && (getActiveFile().getFileSize() > _maxSize))) { rotateFile(); } } /** * append a pre-generated arcfile entry directly into the arc file writer * * @param arcFileData * - the compressed arc file entry * @param dataBufferLength * - the entry length * @throws IOException */ public void writeRawArcFileItem(String contentType, byte[] arcFileData, int dataBufferLength) throws IOException { // check to see if we need to start a new underlying file checkSize(0, dataBufferLength); // update stats getActiveFile()._totalContentBytesWritten += dataBufferLength; getActiveFile()._itemsWritten++; SortedSet<Integer> counts = _mimeTypeCounts.get(contentType); if (counts.size() == 0) { counts.add(1); } else { int count = counts.first() + 1; counts.clear(); counts.add(count); } // record start position of this item _lastItemPos = getActiveFile().getFileSize(); // write out data _out.write(arcFileData, 0, dataBufferLength); // record size of last item _lastItemCompressedSize = (getActiveFile().getFileSize() - _lastItemPos); // update stats getActiveFile()._compressedBytesWritten += _lastItemCompressedSize; } private void preWriteRecordTasks(int headerBytesLength, int contentBytesLength, String contentType) throws IOException { checkSize(headerBytesLength, contentBytesLength); // update stats getActiveFile()._totalHeaderBytesWritten += headerBytesLength; getActiveFile()._totalContentBytesWritten += contentBytesLength; getActiveFile()._itemsWritten++; SortedSet<Integer> counts = _mimeTypeCounts.get(contentType); if (counts.size() == 0) { counts.add(1); } else { int count = counts.first() + 1; counts.clear(); counts.add(count); } // record start position of this item _lastItemPos = getActiveFile().getFileSize(); // Wrap stream in GZIP Writer. // The below construction immediately writes the GZIP 'default' // header out on the underlying stream. _out = new CompressedStream(_out); } private void postWriteRecordTasks() throws IOException { CompressedStream o = (CompressedStream) _out; o.finish(); o.flush(); o.end(); _out = o.getWrappedStream(); // record size of last item _lastItemCompressedSize = (getActiveFile().getFileSize() - _lastItemPos); // update stats getActiveFile()._compressedBytesWritten += _lastItemCompressedSize; } private void write(final byte[] b, int offset, int size) throws IOException { _out.write(b, offset, size); } private void write(final byte[] b) throws IOException { _out.write(b); } private void write(int b) throws IOException { _out.write(b); } private void writeFirstRecord(final String ts) throws IOException { write(generateARCFileMetaData(ts)); } /** * An override so we get access to underlying output stream and offer an end() * that does not accompany closing underlying stream. * * @author stack */ public static class CompressedStream extends GZIPOutputStream { public CompressedStream(OutputStream out) throws IOException { super(out); } /** * @return Reference to stream being compressed. */ OutputStream getWrappedStream() { return this.out; } /** * Release the deflater's native process resources, which otherwise would * not occur until either finalization or DeflaterOutputStream.close() * (which would also close underlying stream). */ public void end() { def.end(); } } /** * Gzip passed bytes. Use only when bytes is small. * * @param bytes * What to gzip. * @return A gzip member of bytes. 
   * @throws IOException
   */
  private static byte[] gzip(byte[] bytes) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    GZIPOutputStream gzipOS = new GZIPOutputStream(baos);
    gzipOS.write(bytes, 0, bytes.length);
    gzipOS.close();
    return baos.toByteArray();
  }

  private byte[] generateARCFileMetaData(String date) throws IOException {

    String metadataHeaderLinesTwoAndThree = getMetadataHeaderLinesTwoAndThree("1 "
        + "0");
    int recordLength = metadataHeaderLinesTwoAndThree
        .getBytes(DEFAULT_ENCODING).length;
    String metadataHeaderStr = ARC_MAGIC_NUMBER + getARCFilename()
        + " 0.0.0.0 " + date + " text/plain " + recordLength
        + metadataHeaderLinesTwoAndThree;

    ByteArrayOutputStream metabaos = new ByteArrayOutputStream(recordLength);
    // Write the metadata header.
    metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
    // Write out a LINE_SEPARATOR to end this record.
    metabaos.write(LINE_SEPARATOR);

    // Now get the bytes of all just written so they can be compressed.
    byte[] bytes = metabaos.toByteArray();

    // GZIP the header, but capture the gzipped output in a byte array so we
    // can add the special IA GZIP header to the product. After these
    // manipulations, write to the output stream. (The Java GZIP
    // implementation does not give access to the GZIP header; it
    // produces a 'default' header only.) We can get away with these
    // manipulations because the GZIP 'default' header doesn't
    // do the 'optional' CRC'ing of the header.
    byte[] gzippedMetaData = gzip(bytes);

    if (gzippedMetaData[3] != 0) {
      throw new IOException("The GZIP FLG header is unexpectedly "
          + " non-zero. Need to add smarter code that can deal "
          + " when already extant extra GZIP header fields.");
    }

    // Set the GZIP FLG header to '4' which says that the GZIP header
    // has extra fields. Then insert the alex {'L', 'X', '0', '0', '0',
    // '0'} 'extra' field. The IA GZIP header will also set byte
    // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
    gzippedMetaData[3] = 4;
    gzippedMetaData[9] = 3;

    byte[] assemblyBuffer = new byte[gzippedMetaData.length
        + ARC_GZIP_EXTRA_FIELD.length];
    // '10' in the below is a pointer past the following bytes of the
    // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
    // RFC1952 for an explanation of the abbreviations just used.
    System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
    System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
        ARC_GZIP_EXTRA_FIELD.length);
    System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
        10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
    bytes = assemblyBuffer;

    return bytes;
  }

  private String getMetadataHeaderLinesTwoAndThree(String version) {
    StringBuffer buffer = new StringBuffer();
    buffer.append(LINE_SEPARATOR);
    buffer.append(version);
    buffer.append(" CommonCrawl");
    buffer.append(LINE_SEPARATOR);
    buffer.append("URL IP-address Archive-date Content-type Archive-length");
    buffer.append(LINE_SEPARATOR);
    return buffer.toString();
  }

  private static String truncateMimeType(String contentType) {
    if (contentType == null) {
      contentType = NO_TYPE_MIMETYPE;
    } else {
      Matcher matcher = TRUNCATION_REGEX.matcher(contentType);
      if (matcher.matches()) {
        contentType = matcher.group(1);
      } else {
        contentType = NO_TYPE_MIMETYPE;
      }
    }
    return contentType;
  }

  /**
   * Test that the metadata line is valid before writing.
   * 
   * @param metaLineStr
   * @throws IOException
   * @return The passed in metaline.
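   * 
   * <p>
   * An illustrative example of a line that satisfies METADATA_LINE_PATTERN
   * (five space-separated fields: URL, IP address, 14-digit timestamp, mime
   * type, record length):
   * {@code http://www.example.com/ 192.0.2.1 20080601123000 text/html 12345}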
   */
  protected String validateMetaLine(String metaLineStr) throws IOException {
    if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
      throw new IOException("Metadata line length is " + metaLineStr.length()
          + " which is > than maximum " + MAX_METADATA_LINE_LENGTH);
    }
    Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
    if (!m.matches()) {
      throw new IOException("Metadata line doesn't match expected"
          + " pattern: " + metaLineStr);
    }
    return metaLineStr;
  }
}
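/*
 * Usage sketch (illustrative only), assuming an already-initialized Hadoop
 * FileSystem and a CrawlURL with parsed NIOHttpHeaders; identifiers such as
 * fs, url, crawlURL, headers, signature and the "crawl/arcs" path are
 * placeholders, not values from this source:
 *
 *   FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
 *   ArcFileWriter writer = new ArcFileWriter(fs, new Path("crawl/arcs"), 1, 10);
 *   writer.write(url, segmentId, crawlNumber, crawlURL, headers, "text/html", signature);
 *   ArcFileWriterStats stats = writer.close(false);
 *
 * write() may block once the number of in-flight ArcFile writer threads
 * reaches maxSimultaneousWriters, and close(false) waits for all pending HDFS
 * writes to complete before returning the accumulated stats.
 */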