package hadoop.convertFilesToSequenceFile;
import hadoop.extensions.WebTableInputFormat;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
/**
 * Recursively scans a directory tree for gzipped tar archives and writes every
 * {@code .csv} entry found inside them into a single Hadoop SequenceFile.
 * Keys are {@code Text} of the form {@code parentDirectory/entryName}; values
 * are the raw entry bytes as {@code BytesWritable}. Entries larger than
 * {@link #MAX_ENTRY_BYTES} are counted but skipped.
 */
public class ConvertWebTablesToSequenceFile {
    /** Read-buffer size for draining a tar entry. */
    private static final int BUFFER_SIZE = 8192;
    /** Upper bound on a buffered entry (100 * 100 * 8192 = ~78 MiB); larger entries are skipped. */
    private static final int MAX_ENTRY_BYTES = 100 * 100 * BUFFER_SIZE;

    private SequenceFile.Writer writer;
    // Number of .csv entries actually appended to the sequence file.
    static int counter = 0;
    // Number of .csv entries encountered, including oversized ones that were skipped.
    static int together = 0;
    // NOTE(review): never written or read in this file — kept for binary compatibility,
    // presumably referenced elsewhere; confirm before removing.
    static long size = 0;

    /**
     * Creates the SequenceFile at the given URI and immediately converts the
     * whole directory tree into it. The writer is closed even on failure.
     *
     * @param s                  URI of the SequenceFile to create
     * @param directoryStructure root directory to scan recursively for archives
     * @throws IOException if the sequence file cannot be created or an archive
     *                     cannot be read (unless lenient mode tolerates it)
     */
    public ConvertWebTablesToSequenceFile(String s, String directoryStructure)
            throws IOException {
        super();
        String uri = s;
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        try {
            this.writer = SequenceFile.createWriter(fs, conf, path, Text.class,
                    BytesWritable.class);
            processFiles(new File(directoryStructure));
        } finally {
            // Safe on null/partially-initialized writer; flushes and closes.
            IOUtils.closeStream(writer);
        }
    }

    /**
     * Depth-first walk: recurses into directories, converts every regular file.
     *
     * @param node file or directory to process
     * @throws IOException if an archive cannot be read or the writer fails
     */
    private void processFiles(File node) throws IOException {
        if (node.isDirectory()) {
            System.out.println("Directory: " + node.getAbsolutePath());
            String[] children = node.list();
            // File.list() returns null on I/O or permission failure — guard
            // against the NPE the original for-each would have thrown.
            if (children != null) {
                for (String filename : children) {
                    processFiles(new File(node, filename));
                }
            }
        } else {
            writeOut(node, node.getAbsoluteFile().getParentFile()
                    .getAbsolutePath());
        }
    }

    /**
     * Opens {@code file} as a {@code .tar.gz} archive and appends each
     * {@code .csv} entry to the sequence file.
     *
     * @param file          archive on the local filesystem
     * @param directoryName absolute path of the archive's parent directory,
     *                      used as the key prefix
     * @throws IOException if reading or appending fails
     */
    private void writeOut(File file, String directoryName) throws IOException {
        // try-with-resources fixes two leaks in the old finally block: a failed
        // GzipCompressorInputStream constructor left the FileInputStream open,
        // and a null tarIn caused a swallowed NPE that skipped gzIn.close().
        try (FileInputStream fin = new FileInputStream(file);
             GzipCompressorInputStream gzIn = new GzipCompressorInputStream(fin);
             TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn)) {
            TarArchiveEntry entry;
            while ((entry = tarIn.getNextTarEntry()) != null) {
                String key = entry.getName();
                if (!key.endsWith(".csv")) {
                    continue;
                }
                together++;
                // null means the entry exceeded MAX_ENTRY_BYTES — skip it;
                // getNextTarEntry() advances past the remaining bytes.
                ByteArrayOutputStream bos = readEntry(tarIn);
                if (bos != null) {
                    bos.flush();
                    BytesWritable currentValue = new BytesWritable(bos.toByteArray());
                    Text currentKey = new Text(directoryName + "/" + entry.getName());
                    this.writer.append(currentKey, currentValue);
                    counter++;
                }
            }
        }
    }

    /**
     * Buffers the current tar entry into memory.
     *
     * @param tarIn positioned at the start of the entry to read
     * @return the buffered bytes, or {@code null} if the entry exceeded
     *         {@link #MAX_ENTRY_BYTES}
     * @throws IOException on read failure; a truncated archive ({@code EOFException})
     *                     is rethrown only when lenient mode is off
     */
    private ByteArrayOutputStream readEntry(TarArchiveInputStream tarIn)
            throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[BUFFER_SIZE];
        while (true) {
            int bytesRead;
            try {
                bytesRead = tarIn.read(buffer, 0, BUFFER_SIZE);
            } catch (EOFException e) {
                // Corrupt/truncated archives are tolerated in lenient mode;
                // otherwise the caller must see the failure.
                if (!WebTableInputFormat.getLenient()) {
                    throw e;
                }
                break;
            }
            if (bytesRead <= 0) {
                break;
            }
            bos.write(buffer, 0, bytesRead);
            if (bos.size() > MAX_ENTRY_BYTES) {
                return null;
            }
        }
        return bos;
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} = input directory tree,
     *             {@code args[1]} = output SequenceFile URI
     */
    public static void main(String[] args) {
        // The original indexed args blindly and crashed with an unchecked
        // ArrayIndexOutOfBoundsException on missing arguments.
        if (args.length < 2) {
            System.err.println(
                    "Usage: ConvertWebTablesToSequenceFile <inputDirectory> <outputSequenceFileUri>");
            return;
        }
        try {
            new ConvertWebTablesToSequenceFile(args[1], args[0]);
            System.out.println(counter + " " + together);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}