/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.io; import java.io.IOException; import java.nio.charset.Charset; import java.text.NumberFormat; import java.util.Date; import java.util.Random; import java.util.logging.Logger; import org.archive.util.FileUtils; import org.archive.util.TmpDirTestCase; import com.google.common.base.Charsets; /** * Test ReplayCharSequences. * * @author stack, gojomo * @version $Revision$, $Date$ */ public class ReplayCharSequenceTest extends TmpDirTestCase { /** * Logger. */ private static Logger logger = Logger.getLogger("org.archive.io.ReplayCharSequenceFactoryTest"); private static final int SEQUENCE_LENGTH = 127; private static final int MULTIPLIER = 3; private static final int BUFFER_SIZE = SEQUENCE_LENGTH * MULTIPLIER; private static final int INCREMENT = 1; /** * Buffer of regular content. */ private byte [] regularBuffer = null; /* * @see TestCase#setUp() */ protected void setUp() throws Exception { super.setUp(); this.regularBuffer = fillBufferWithRegularContent(new byte [BUFFER_SIZE]); } public void testShiftjis() throws IOException { // Here's the bytes for the JIS encoding of the Japanese form of Nihongo byte[] bytes_nihongo = { (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46, (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38, (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42, (byte) 0x1B, (byte) 0x28, (byte) 0x42 }; final String ENCODING = "SJIS"; // Here is nihongo converted to JVM encoding. String nihongo = new String(bytes_nihongo, ENCODING); RecordingOutputStream ros = writeTestStream( bytes_nihongo,MULTIPLIER, "testShiftjis",MULTIPLIER); // TODO: check for existence of overflow file? ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(ENCODING)); // Now check that start of the rcs comes back in as nihongo string. String rcsStr = rcs.subSequence(0, nihongo.length()).toString(); assertTrue("Nihongo " + nihongo + " does not equal converted string" + " from rcs " + rcsStr, nihongo.equals(rcsStr)); // And assert next string is also properly nihongo. if (rcs.length() >= (nihongo.length() * 2)) { rcsStr = rcs.subSequence(nihongo.length(), nihongo.length() + nihongo.length()).toString(); assertTrue("Nihongo " + nihongo + " does not equal converted " + " string from rcs (2nd time)" + rcsStr, nihongo.equals(rcsStr)); } } public void testGetReplayCharSequenceByteZeroOffset() throws IOException { RecordingOutputStream ros = writeTestStream( regularBuffer,MULTIPLIER, "testGetReplayCharSequenceByteZeroOffset",MULTIPLIER); ReplayCharSequence rcs = getReplayCharSequence(ros); for (int i = 0; i < MULTIPLIER; i++) { accessingCharacters(rcs); } } private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros) throws IOException { return getReplayCharSequence(ros,null); } private ReplayCharSequence getReplayCharSequence(RecordingOutputStream ros, Charset charset) throws IOException { return new GenericReplayCharSequence(ros.getReplayInputStream(), ros.getBufferLength()/2, ros.backingFilename, charset); } public void testGetReplayCharSequenceMultiByteZeroOffset() throws IOException { RecordingOutputStream ros = writeTestStream( regularBuffer,MULTIPLIER, "testGetReplayCharSequenceMultiByteZeroOffset",MULTIPLIER); ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); for (int i = 0; i < MULTIPLIER; i++) { accessingCharacters(rcs); } } public void testReplayCharSequenceByteToString() throws IOException { String fileContent = "Some file content"; byte [] buffer = fileContent.getBytes(); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToString.txt",0); ReplayCharSequence rcs = getReplayCharSequence(ros); String result = rcs.toString(); assertEquals("Strings don't match",result,fileContent); } private String toHexString(String str) { if (str != null) { StringBuilder buf = new StringBuilder("{ "); buf.append(Integer.toString(str.charAt(0), 16)); for (int i = 1; i < str.length(); i++) { buf.append(", "); buf.append(Integer.toString(str.charAt(i), 16)); } buf.append(" }"); return buf.toString(); } else return "null"; } public void testSingleByteEncodings() throws IOException { byte[] bytes = { (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x7d, (byte) 0x7e, (byte) 0x7f, (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0xfc, (byte) 0xfd, (byte) 0xfe, (byte) 0xff }; String latin1String = new String(bytes, "latin1"); RecordingOutputStream ros = writeTestStream( bytes, 1, "testSingleByteEncodings-latin1.txt", 0); ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.ISO_8859_1); String result = rcs.toString(); logger.fine("latin1[0] " + toHexString(latin1String)); logger.fine("latin1[1] " + toHexString(result)); assertEquals("latin1 strings don't match", result, latin1String); String w1252String = new String(bytes, "windows-1252"); ros = writeTestStream( bytes, 1, "testSingleByteEncodings-windows-1252.txt", 0); rcs = getReplayCharSequence(ros,Charset.forName("windows-1252")); result = rcs.toString(); logger.fine("windows-1252[0] " + toHexString(w1252String)); logger.fine("windows-1252[1] " + toHexString(result)); assertEquals("windows-1252 strings don't match", result, w1252String); String asciiString = new String(bytes, "ascii"); ros = writeTestStream( bytes, 1, "testSingleByteEncodings-ascii.txt", 0); rcs = getReplayCharSequence(ros,Charset.forName("ascii")); result = rcs.toString(); logger.fine("ascii[0] " + toHexString(asciiString)); logger.fine("ascii[1] " + toHexString(result)); assertEquals("ascii strings don't match", result, asciiString); } public void testReplayCharSequenceByteToStringOverflow() throws IOException { String fileContent = "Some file content. "; // ascii byte [] buffer = fileContent.getBytes(); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToStringOverflow.txt",1); String expectedContent = fileContent+fileContent; // The string is ascii which is a subset of both these encodings. Use // both encodings because they exercise different code paths. UTF-8 is // decoded to UTF-16 while windows-1252 is memory mapped directly. See // GenericReplayCharSequence ReplayCharSequence rcsUtf8 = getReplayCharSequence(ros,Charsets.UTF_8); ReplayCharSequence rcs1252 = getReplayCharSequence(ros,Charset.forName("windows-1252")); String result = rcsUtf8.toString(); assertEquals("Strings don't match", expectedContent, result); result = rcs1252.toString(); assertEquals("Strings don't match", expectedContent, result); } public void testReplayCharSequenceByteToStringMulti() throws IOException { String fileContent = "Some file content"; byte [] buffer = fileContent.getBytes("UTF-8"); final int MULTIPLICAND = 10; StringBuilder sb = new StringBuilder(MULTIPLICAND * fileContent.length()); for (int i = 0; i < MULTIPLICAND; i++) { sb.append(fileContent); } String expectedResult = sb.toString(); RecordingOutputStream ros = writeTestStream( buffer,1, "testReplayCharSequenceByteToStringMulti.txt",MULTIPLICAND-1); for (int i = 0; i < 3; i++) { ReplayCharSequence rcs = getReplayCharSequence(ros,Charsets.UTF_8); String result = rcs.toString(); assertEquals("Strings don't match", result, expectedResult); rcs.close(); System.gc(); System.runFinalization(); } } public void xestHugeReplayCharSequence() throws IOException { String fileContent = "01234567890123456789"; String characterEncoding = "ascii"; byte[] buffer = fileContent.getBytes(characterEncoding); long reps = (long) Integer.MAX_VALUE / (long) buffer.length + 1000000l; logger.info("writing " + (reps * buffer.length) + " bytes to testHugeReplayCharSequence.txt"); RecordingOutputStream ros = writeTestStream(buffer, 0, "testHugeReplayCharSequence.txt", reps); ReplayCharSequence rcs = getReplayCharSequence(ros,Charset.forName(characterEncoding)); if (reps * fileContent.length() > (long) Integer.MAX_VALUE) { assertTrue("ReplayCharSequence has wrong length (length()=" + rcs.length() + ") (should be " + Integer.MAX_VALUE + ")", rcs.length() == Integer.MAX_VALUE); } else { assertEquals("ReplayCharSequence has wrong length (length()=" + rcs.length() + ") (should be " + (reps * fileContent.length()) + ")", (long) rcs.length(), reps * (long) fileContent.length()); } // boundary cases or something for (int index : new int[] { 0, rcs.length() / 4, rcs.length() / 2, rcs.length() - 1, rcs.length() / 4 }) { // logger.info("testing char at index=" + // NumberFormat.getInstance().format(index)); assertEquals("Characters don't match (index=" + NumberFormat.getInstance().format(index) + ")", fileContent.charAt(index % fileContent.length()), rcs .charAt(index)); } // check that out of bounds indices throw exception for (int n : new int[] { -1, Integer.MIN_VALUE, rcs.length() + 1 }) { try { String message = "rcs.charAt(" + n + ")=" + rcs.charAt(n) + " ?!? -- expected IndexOutOfBoundsException"; logger.severe(message); fail(message); } catch (IndexOutOfBoundsException e) { logger.info("got expected exception: " + e); } } // check some characters at random spots & kinda stress test the // system's memory mapping facility Random rand = new Random(0); // seed so we get the same ones each time for (int i = 0; i < 5000; i++) { int index = rand.nextInt(rcs.length()); // logger.info(i + ". testing char at index=" + // NumberFormat.getInstance().format(index)); assertEquals("Characters don't match (index=" + NumberFormat.getInstance().format(index) + ")", fileContent.charAt(index % fileContent.length()), rcs .charAt(index)); } } /** * Accessing characters test. * * Checks that characters in the rcs are in sequence. * * @param rcs The ReplayCharSequence to try out. */ private void accessingCharacters(CharSequence rcs) { long timestamp = (new Date()).getTime(); int seeks = 0; for (int i = (INCREMENT * 2); (i + INCREMENT) < rcs.length(); i += INCREMENT) { checkCharacter(rcs, i); seeks++; for (int j = i - INCREMENT; j < i; j++) { checkCharacter(rcs, j); seeks++; } } // Note that printing out below breaks cruisecontrols drawing // of the xml unit test results because it outputs disallowed // xml characters. logger.fine(rcs + " seeks count " + seeks + " in " + ((new Date().getTime()) - timestamp) + " milliseconds."); } /** * Check the character read. * * Throws assertion if not expected result. * * @param rcs ReplayCharSequence to read from. * @param i Character offset. */ private void checkCharacter(CharSequence rcs, int i) { int c = rcs.charAt(i); assertTrue("Character " + Integer.toString(c) + " at offset " + i + " unexpected.", (c % SEQUENCE_LENGTH) == (i % SEQUENCE_LENGTH)); } /** * @param baseName * @return RecordingOutputStream * @throws IOException */ private RecordingOutputStream writeTestStream(byte[] content, int memReps, String baseName, long fileReps) throws IOException { String backingFilename = FileUtils.maybeRelative(getTmpDir(),baseName).getAbsolutePath(); RecordingOutputStream ros = new RecordingOutputStream( content.length * memReps, backingFilename); ros.open(); ros.markMessageBodyBegin(); for(long i = 0; i < (memReps+fileReps); i++) { // fill buffer (repeat MULTIPLIER times) and // overflow to disk (also MULTIPLIER times) ros.write(content); } ros.close(); return ros; } /** * Fill a buffer w/ regular progression of single-byte * (and <= 127) characters. * @param buffer Buffer to fill. * @return The buffer we filled. */ private byte [] fillBufferWithRegularContent(byte [] buffer) { int index = 0; for (int i = 0; i < buffer.length; i++) { buffer[i] = (byte) (index & 0x00ff); index++; if (index >= SEQUENCE_LENGTH) { // Reset the index. index = 0; } } return buffer; } public void testCheckParameters() { // TODO. } }