/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.manual;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.typeutils.TypeComparator;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.java.typeutils.TypeInfoParser;
import org.apache.flink.api.java.typeutils.runtime.CopyableValueComparator;
import org.apache.flink.api.java.typeutils.runtime.CopyableValueSerializer;
import org.apache.flink.api.java.typeutils.runtime.RuntimeSerializerFactory;
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync;
import org.apache.flink.runtime.memory.MemoryManager;
import org.apache.flink.runtime.operators.sort.UnilateralSortMerger;
import org.apache.flink.runtime.operators.testutils.DummyInvokable;
import org.apache.flink.types.StringValue;
import org.apache.flink.util.MutableObjectIterator;
import org.junit.Assert;
public class MassiveStringValueSorting {
private static final long SEED = 347569784659278346L;
public void testStringValueSorting() {
File input = null;
File sorted = null;
try {
// the source file
input = generateFileWithStrings(300000, "http://some-uri.com/that/is/a/common/prefix/to/all");
// the sorted file
sorted = File.createTempFile("sorted_strings", "txt");
String[] command = {"/bin/bash","-c","export LC_ALL=\"C\" && cat \"" + input.getAbsolutePath() + "\" | sort > \"" + sorted.getAbsolutePath() + "\""};
Process p = null;
try {
p = Runtime.getRuntime().exec(command);
int retCode = p.waitFor();
if (retCode != 0) {
throw new Exception("Command failed with return code " + retCode);
}
p = null;
} finally {
if (p != null) {
p.destroy();
}
}
// sort the data
UnilateralSortMerger<StringValue> sorter = null;
BufferedReader reader = null;
BufferedReader verifyReader = null;
try {
MemoryManager mm = new MemoryManager(1024 * 1024, 1);
IOManager ioMan = new IOManagerAsync();
TypeSerializer<StringValue> serializer = new CopyableValueSerializer<StringValue>(StringValue.class);
TypeComparator<StringValue> comparator = new CopyableValueComparator<StringValue>(true, StringValue.class);
reader = new BufferedReader(new FileReader(input));
MutableObjectIterator<StringValue> inputIterator = new StringValueReaderMutableObjectIterator(reader);
sorter = new UnilateralSortMerger<StringValue>(mm, ioMan, inputIterator, new DummyInvokable(),
new RuntimeSerializerFactory<StringValue>(serializer, StringValue.class), comparator, 1.0, 4, 0.8f,
true /* use large record handler */, true);
MutableObjectIterator<StringValue> sortedData = sorter.getIterator();
reader.close();
// verify
verifyReader = new BufferedReader(new FileReader(sorted));
String nextVerify;
StringValue nextFromFlinkSort = new StringValue();
while ((nextVerify = verifyReader.readLine()) != null) {
nextFromFlinkSort = sortedData.next(nextFromFlinkSort);
Assert.assertNotNull(nextFromFlinkSort);
Assert.assertEquals(nextVerify, nextFromFlinkSort.getValue());
}
}
finally {
if (reader != null) {
reader.close();
}
if (verifyReader != null) {
verifyReader.close();
}
if (sorter != null) {
sorter.close();
}
}
}
catch (Exception e) {
System.err.println(e.getMessage());
e.printStackTrace();
Assert.fail(e.getMessage());
}
finally {
if (input != null) {
//noinspection ResultOfMethodCallIgnored
input.delete();
}
if (sorted != null) {
//noinspection ResultOfMethodCallIgnored
sorted.delete();
}
}
}
@SuppressWarnings("unchecked")
public void testStringValueTuplesSorting() {
final int NUM_STRINGS = 300000;
File input = null;
File sorted = null;
try {
// the source file
input = generateFileWithStringTuples(NUM_STRINGS, "http://some-uri.com/that/is/a/common/prefix/to/all");
// the sorted file
sorted = File.createTempFile("sorted_strings", "txt");
String[] command = {"/bin/bash","-c","export LC_ALL=\"C\" && cat \"" + input.getAbsolutePath() + "\" | sort > \"" + sorted.getAbsolutePath() + "\""};
Process p = null;
try {
p = Runtime.getRuntime().exec(command);
int retCode = p.waitFor();
if (retCode != 0) {
throw new Exception("Command failed with return code " + retCode);
}
p = null;
} finally {
if (p != null) {
p.destroy();
}
}
// sort the data
UnilateralSortMerger<Tuple2<StringValue, StringValue[]>> sorter = null;
BufferedReader reader = null;
BufferedReader verifyReader = null;
try {
MemoryManager mm = new MemoryManager(1024 * 1024, 1);
IOManager ioMan = new IOManagerAsync();
TupleTypeInfo<Tuple2<StringValue, StringValue[]>> typeInfo = (TupleTypeInfo<Tuple2<StringValue, StringValue[]>>)
TypeInfoParser.<Tuple2<StringValue, StringValue[]>>parse("Tuple2<org.apache.flink.types.StringValue, org.apache.flink.types.StringValue[]>");
TypeSerializer<Tuple2<StringValue, StringValue[]>> serializer = typeInfo.createSerializer(new ExecutionConfig());
TypeComparator<Tuple2<StringValue, StringValue[]>> comparator = typeInfo.createComparator(new int[] { 0 }, new boolean[] { true }, 0, new ExecutionConfig());
reader = new BufferedReader(new FileReader(input));
MutableObjectIterator<Tuple2<StringValue, StringValue[]>> inputIterator = new StringValueTupleReaderMutableObjectIterator(reader);
sorter = new UnilateralSortMerger<Tuple2<StringValue, StringValue[]>>(mm, ioMan, inputIterator, new DummyInvokable(),
new RuntimeSerializerFactory<Tuple2<StringValue, StringValue[]>>(serializer, (Class<Tuple2<StringValue, StringValue[]>>) (Class<?>) Tuple2.class), comparator, 1.0, 4, 0.8f,
true /* use large record handler */, false);
// use this part to verify that all if good when sorting in memory
// List<MemorySegment> memory = mm.allocatePages(new DummyInvokable(), mm.computeNumberOfPages(1024*1024*1024));
// NormalizedKeySorter<Tuple2<String, String[]>> nks = new NormalizedKeySorter<Tuple2<String,String[]>>(serializer, comparator, memory);
//
// {
// Tuple2<String, String[]> wi = new Tuple2<String, String[]>("", new String[0]);
// while ((wi = inputIterator.next(wi)) != null) {
// Assert.assertTrue(nks.write(wi));
// }
//
// new QuickSort().sort(nks);
// }
//
// MutableObjectIterator<Tuple2<String, String[]>> sortedData = nks.getIterator();
MutableObjectIterator<Tuple2<StringValue, StringValue[]>> sortedData = sorter.getIterator();
reader.close();
// verify
verifyReader = new BufferedReader(new FileReader(sorted));
MutableObjectIterator<Tuple2<StringValue, StringValue[]>> verifyIterator = new StringValueTupleReaderMutableObjectIterator(verifyReader);
Tuple2<StringValue, StringValue[]> nextVerify = new Tuple2<StringValue, StringValue[]>(new StringValue(), new StringValue[0]);
Tuple2<StringValue, StringValue[]> nextFromFlinkSort = new Tuple2<StringValue, StringValue[]>(new StringValue(), new StringValue[0]);
int num = 0;
while ((nextVerify = verifyIterator.next(nextVerify)) != null) {
num++;
nextFromFlinkSort = sortedData.next(nextFromFlinkSort);
Assert.assertNotNull(nextFromFlinkSort);
Assert.assertEquals(nextVerify.f0, nextFromFlinkSort.f0);
Assert.assertArrayEquals(nextVerify.f1, nextFromFlinkSort.f1);
}
Assert.assertNull(sortedData.next(nextFromFlinkSort));
Assert.assertEquals(NUM_STRINGS, num);
}
finally {
if (reader != null) {
reader.close();
}
if (verifyReader != null) {
verifyReader.close();
}
if (sorter != null) {
sorter.close();
}
}
}
catch (Exception e) {
System.err.println(e.getMessage());
e.printStackTrace();
Assert.fail(e.getMessage());
}
finally {
if (input != null) {
//noinspection ResultOfMethodCallIgnored
input.delete();
}
if (sorted != null) {
//noinspection ResultOfMethodCallIgnored
sorted.delete();
}
}
}
// --------------------------------------------------------------------------------------------
private static final class StringValueReaderMutableObjectIterator implements MutableObjectIterator<StringValue> {
private final BufferedReader reader;
public StringValueReaderMutableObjectIterator(BufferedReader reader) {
this.reader = reader;
}
@Override
public StringValue next(StringValue reuse) throws IOException {
String line = reader.readLine();
if (line == null) {
return null;
}
reuse.setValue(line);
return reuse;
}
@Override
public StringValue next() throws IOException {
return next(new StringValue());
}
}
private static final class StringValueTupleReaderMutableObjectIterator implements MutableObjectIterator<Tuple2<StringValue, StringValue[]>> {
private final BufferedReader reader;
public StringValueTupleReaderMutableObjectIterator(BufferedReader reader) {
this.reader = reader;
}
@Override
public Tuple2<StringValue, StringValue[]> next(Tuple2<StringValue, StringValue[]> reuse) throws IOException {
String line = reader.readLine();
if (line == null) {
return null;
}
String[] parts = line.split(" ");
reuse.f0.setValue(parts[0]);
reuse.f1 = new StringValue[parts.length];
for (int i = 0; i < parts.length; i++) {
reuse.f1[i] = new StringValue(parts[i]);
}
return reuse;
}
@Override
public Tuple2<StringValue, StringValue[]> next() throws IOException {
return next(new Tuple2<StringValue, StringValue[]>(new StringValue(), new StringValue[0]));
}
}
// --------------------------------------------------------------------------------------------
private File generateFileWithStrings(int numStrings, String prefix) throws IOException {
final Random rnd = new Random(SEED);
final StringBuilder bld = new StringBuilder();
final int resetValue = prefix.length();
bld.append(prefix);
File f = File.createTempFile("strings", "txt");
BufferedWriter wrt = null;
try {
wrt = new BufferedWriter(new FileWriter(f));
for (int i = 0 ; i < numStrings; i++) {
bld.setLength(resetValue);
int len = rnd.nextInt(20) + 300;
for (int k = 0; k < len; k++) {
char c = (char) (rnd.nextInt(80) + 40);
bld.append(c);
}
String str = bld.toString();
wrt.write(str);
wrt.newLine();
}
} finally {
if (wrt != null) {
wrt.close();
}
}
return f;
}
private File generateFileWithStringTuples(int numStrings, String prefix) throws IOException {
final Random rnd = new Random(SEED);
final StringBuilder bld = new StringBuilder();
File f = File.createTempFile("strings", "txt");
BufferedWriter wrt = null;
try {
wrt = new BufferedWriter(new FileWriter(f));
for (int i = 0 ; i < numStrings; i++) {
bld.setLength(0);
int numComps = rnd.nextInt(5) + 1;
for (int z = 0; z < numComps; z++) {
if (z > 0) {
bld.append(' ');
}
bld.append(prefix);
int len = rnd.nextInt(20) + 10;
for (int k = 0; k < len; k++) {
char c = (char) (rnd.nextInt(80) + 40);
bld.append(c);
}
}
String str = bld.toString();
wrt.write(str);
wrt.newLine();
}
} finally {
if (wrt != null) {
wrt.close();
}
}
return f;
}
// --------------------------------------------------------------------------------------------
public static void main(String[] args) {
new MassiveStringValueSorting().testStringValueSorting();
new MassiveStringValueSorting().testStringValueTuplesSorting();
}
}