/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.util.sort; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Writer; import java.util.Comparator; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.Resource; import org.apache.metamodel.util.ToStringComparator; import junit.framework.TestCase; public class SortMergeWriterTest extends TestCase { public void testSimpleSort() throws Exception { doSortTests(2); doSortTests(5); doSortTests(10); doSortTests(10000); } public void testSimpleDedup() throws Exception { doDedupTests(2); doDedupTests(5); doDedupTests(10); doDedupTests(10000); } // test that the comparator is being used. Here we dedup string arrays but // only based on the string at index 0. public void testDedupArray() throws Exception { final Comparator<String[]> comparator = (o1, o2) -> o1[0].compareTo(o2[0]); final SortMergeWriter<String[], Writer> sorter = new SortMergeWriter<String[], Writer>(2, comparator) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String[] row, final int count) throws IOException { if (row == null) { writer.write("<null>," + count + "\n"); } else { writer.write(row[0] + "," + count + "\n"); } } protected void writeHeader(final Writer writer) throws IOException { writer.write("text,count\n"); } }; sorter.append(new String[] { "foo", "foo" }); sorter.append(new String[] { "bar", "foobar" }); sorter.append(new String[] { "foobar", "bar" }); sorter.append(new String[] { "barfoo", "foobar" }); sorter.append(new String[] { "foo", "foo" }); sorter.append(new String[] { "foobar", "bar" }); sorter.append(new String[] { "barfoo", "foobar" }); sorter.append(new String[] { "bar", "foo" }); sorter.append(new String[] { "foobar", "bar" }); sorter.append(new String[] { "barfoo", "foobar" }); sorter.append(new String[] { "bar", "foo" }); sorter.append(new String[] { "foobar", "bar" }); final File file = sorter.write("target/sort_merge_arrays-deduped.csv"); assertTrue(file.exists()); try (BufferedReader br = FileHelper.getBufferedReader(file)) { assertEquals("text,count", br.readLine()); assertEquals("bar,3", br.readLine()); assertEquals("barfoo,3", br.readLine()); assertEquals("foo,2", br.readLine()); assertEquals("foobar,4", br.readLine()); assertNull(br.readLine()); br.close(); } } public void testUseAsUniquenessChecker() throws Exception { final SortMergeWriter<String, Writer> sorter = new SortMergeWriter<String, Writer>(2, ToStringComparator.getComparator()) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String row, final int count) throws IOException { if (count > 1) { writer.write(row + "," + count + "\n"); } } protected void writeHeader(final Writer writer) throws IOException { writer.write("text,count\n"); } @Override protected void writeNull(final Writer writer, final int nullCount) throws IOException { if (nullCount > 1) { writeRow(writer, "<null>", nullCount); } } }; sorter.append("foo"); sorter.append("bar"); sorter.append("baz"); sorter.append("hello"); sorter.append("world"); for (int i = 0; i < 100; i++) { sorter.append("unique" + i); } sorter.append("bar"); sorter.append("foo"); final File file = sorter.write("target/sort_merge_uniqueness.txt"); final String str = FileHelper.readFileAsString(file); assertEquals("text,count\n" + "bar,2\n" + "foo,2", str); } public void testNullSafety() throws Exception { final SortMergeWriter<String, Writer> sorter = new SortMergeWriter<String, Writer>(2, ToStringComparator.getComparator()) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String row, final int count) throws IOException { writer.write(row + "," + count + "\n"); } protected void writeHeader(final Writer writer) throws IOException { writer.write("text,count\n"); } @Override protected void writeNull(final Writer writer, final int nullCount) throws IOException { writeRow(writer, "<null>", nullCount); } }; sorter.append("1234"); sorter.append("acb"); sorter.append(null); sorter.append("5678"); sorter.append("1234"); sorter.append("acb", 3); sorter.append("acb"); sorter.append("5678"); sorter.append("1234"); final File file = sorter.write("target/sort_merge_null_safety.txt"); assertTrue(file.exists()); final BufferedReader br = FileHelper.getBufferedReader(file); assertEquals("text,count", br.readLine()); assertEquals("<null>,1", br.readLine()); assertEquals("1234,3", br.readLine()); assertEquals("5678,2", br.readLine()); assertEquals("acb,5", br.readLine()); assertNull(br.readLine()); } public void testNoUnnecessaryTempFiles() throws Exception { final SortMergeWriter<String, Writer> sorter = new SortMergeWriter<String, Writer>(10, ToStringComparator.getComparator()) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String row, final int count) throws IOException { writer.write(row + "," + count + "\n"); } protected void writeHeader(final Writer writer) throws IOException { writer.write("text,count\n"); } @Override protected File createTempFile() throws IOException { throw new IllegalStateException("This test is not supposed to require temp files!"); } }; sorter.append("1234"); sorter.append("acb"); sorter.append("abc"); sorter.append("acb"); sorter.append("5678"); final File file = sorter.write("target/sort_merge_no_temp_file.txt"); assertTrue(file.exists()); try (BufferedReader br = FileHelper.getBufferedReader(file)) { assertEquals("text,count", br.readLine()); assertEquals("1234,1", br.readLine()); assertEquals("5678,1", br.readLine()); assertEquals("abc,1", br.readLine()); assertEquals("acb,2", br.readLine()); assertNull(br.readLine()); } } private void doSortTests(final int threshold) throws Exception { final SortMergeWriter<String, Writer> sorter = new SortMergeWriter<String, Writer>(threshold, ToStringComparator.getComparator()) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String row, final int count) throws IOException { writer.write(row + "," + count + "\n"); } protected void writeHeader(final Writer writer) throws IOException { writer.write("number,count\n"); } }; sorter.append("02"); sorter.append("01"); sorter.append("04"); sorter.append("03"); sorter.append("06"); sorter.append("07"); sorter.append("08"); sorter.append("05"); sorter.append("09"); sorter.append("10"); sorter.append("13"); sorter.append("12"); sorter.append("11"); sorter.append("14"); final File file = sorter.write("target/sort_merge_sort_" + threshold + ".txt"); assertTrue(file.exists()); try (BufferedReader br = FileHelper.getBufferedReader(file)) { assertEquals("number,count", br.readLine()); assertEquals("01,1", br.readLine()); assertEquals("02,1", br.readLine()); assertEquals("03,1", br.readLine()); assertEquals("04,1", br.readLine()); assertEquals("05,1", br.readLine()); assertEquals("06,1", br.readLine()); assertEquals("07,1", br.readLine()); assertEquals("08,1", br.readLine()); assertEquals("09,1", br.readLine()); assertEquals("10,1", br.readLine()); assertEquals("11,1", br.readLine()); assertEquals("12,1", br.readLine()); assertEquals("13,1", br.readLine()); assertEquals("14,1", br.readLine()); assertNull(br.readLine()); } } private void doDedupTests(final int threshold) throws Exception { final SortMergeWriter<String, Writer> sorter = new SortMergeWriter<String, Writer>(threshold, ToStringComparator.getComparator()) { @Override protected Writer createWriter(final Resource file) { return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING); } @Override protected void writeRow(final Writer writer, final String row, final int count) throws IOException { writer.write(row + "," + count + "\n"); } @Override protected void writeHeader(final Writer writer) throws IOException { // do nothing } }; sorter.append("02"); sorter.append("01"); sorter.append("04"); sorter.append("03"); sorter.append("06"); sorter.append("07"); sorter.append("08"); sorter.append("05"); sorter.append("09"); sorter.append("10"); sorter.append("13"); sorter.append("12"); sorter.append("11"); sorter.append("14"); sorter.append("02"); sorter.append("01"); sorter.append("01"); sorter.append("14"); sorter.append("10"); sorter.append("10"); sorter.append("10"); final File file = sorter.write("target/sort_merge_dedup_" + threshold + ".txt"); assertTrue(file.exists()); try (BufferedReader br = FileHelper.getBufferedReader(file)) { assertEquals("01,3", br.readLine()); assertEquals("02,2", br.readLine()); assertEquals("03,1", br.readLine()); assertEquals("04,1", br.readLine()); assertEquals("05,1", br.readLine()); assertEquals("06,1", br.readLine()); assertEquals("07,1", br.readLine()); assertEquals("08,1", br.readLine()); assertEquals("09,1", br.readLine()); assertEquals("10,4", br.readLine()); assertEquals("11,1", br.readLine()); assertEquals("12,1", br.readLine()); assertEquals("13,1", br.readLine()); assertEquals("14,2", br.readLine()); assertNull(br.readLine()); } } }