/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.fileset;
import co.cask.cdap.api.common.RuntimeArguments;
import co.cask.cdap.api.common.Scope;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.DataSetManager;
import co.cask.cdap.test.MapReduceManager;
import co.cask.cdap.test.ServiceManager;
import co.cask.cdap.test.TestBase;
import co.cask.cdap.test.TestConfiguration;
import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import org.apache.twill.filesystem.Location;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* A unit test for word count over file sets.
*/
public class FileSetWordCountTest extends TestBase {
@ClassRule
public static final TestConfiguration CONFIG = new TestConfiguration("explore.enabled", false);
@Test
public void testWordCountOnFileSet() throws Exception {
// deploy the application
ApplicationManager applicationManager = deployApplication(FileSetExample.class);
final String line1 = "a b a";
final String line2 = "b a b";
// discover the file set service
ServiceManager serviceManager = applicationManager.getServiceManager("FileSetService").start();
serviceManager.waitForStatus(true);
URL serviceURL = serviceManager.getServiceURL();
// write a file to the file set using the service
HttpURLConnection connection = (HttpURLConnection) new URL(serviceURL, "lines?path=nn.1").openConnection();
try {
connection.setDoOutput(true);
connection.setRequestMethod("PUT");
connection.getOutputStream().write(line1.getBytes(Charsets.UTF_8));
Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
} finally {
connection.disconnect();
}
// run word count over that file only
Map<String, String> runtimeArguments = Maps.newHashMap();
Map<String, String> inputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, "nn.1");
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setOutputPath(outputArgs, "out.1");
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
MapReduceManager mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
mapReduceManager.waitForFinish(5, TimeUnit.MINUTES);
// retrieve the counts through the service and verify
Map<String, Integer> counts = Maps.newHashMap();
connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.1/part-r-00000").openConnection();
try {
connection.setRequestMethod("GET");
Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
readCounts(connection.getInputStream(), counts);
} finally {
connection.disconnect();
}
// "a b a" should yield "a":2, "b":1
Assert.assertEquals(2, counts.size());
Assert.assertEquals(new Integer(2), counts.get("a"));
Assert.assertEquals(new Integer(1), counts.get("b"));
// write a file to the file set using the dataset directly
DataSetManager<FileSet> linesManager = getDataset("lines");
OutputStream output = linesManager.get().getLocation("nn.2").getOutputStream();
try {
output.write(line2.getBytes(Charsets.UTF_8));
} finally {
output.close();
}
// run word count over both files
FileSetArguments.setInputPath(inputArgs, "nn.1");
FileSetArguments.addInputPath(inputArgs, "nn.2");
FileSetArguments.setOutputPath(outputArgs, "out.2");
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
mapReduceManager.waitForFinish(5, TimeUnit.MINUTES);
// retrieve the counts through the dataset API and verify
// write a file to the file set using the dataset directly
DataSetManager<FileSet> countsManager = getDataset("counts");
counts.clear();
Location resultLocation = countsManager.get().getLocation("out.2");
Assert.assertTrue(resultLocation.isDirectory());
List<String> parts = new LinkedList<>();
for (Location child : resultLocation.list()) {
if (child.getName().startsWith("part-")) { // only read part files, no check sums or done files
parts.add(child.getName());
readCounts(child.getInputStream(), counts);
}
}
// "a b a" and "b a b" should yield "a":3, "b":3
Assert.assertEquals(2, counts.size());
Assert.assertEquals(new Integer(3), counts.get("a"));
Assert.assertEquals(new Integer(3), counts.get("b"));
// retrieve the counts through the service
counts.clear();
for (String part : parts) {
connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.2/" + part).openConnection();
try {
connection.setRequestMethod("GET");
Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
readCounts(connection.getInputStream(), counts);
} finally {
connection.disconnect();
}
}
// "a b a" and "b a b" should yield "a":3, "b":3
Assert.assertEquals(2, counts.size());
Assert.assertEquals(new Integer(3), counts.get("a"));
Assert.assertEquals(new Integer(3), counts.get("b"));
serviceManager.stop();
}
/**
* Helper to read an input stream, line by line, and parse each line in the format word:count,
* and add it as an entry to the map that is passed in.
*/
private static void readCounts(InputStream inputStream, Map<String, Integer> counts) throws IOException {
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
while (true) {
String line = reader.readLine();
if (line == null || line.isEmpty()) {
break;
}
String[] fields = line.split(":");
Assert.assertEquals(2, fields.length);
counts.put(fields[0], Integer.valueOf(fields[1]));
}
} finally {
inputStream.close();
}
}
}