/*
* Sun Public License
*
* The contents of this file are subject to the Sun Public License Version
* 1.0 (the "License"). You may not use this file except in compliance with
* the License. A copy of the License is available at http://www.sun.com/
*
* The Original Code is the SLAMD Distributed Load Generation Engine.
* The Initial Developer of the Original Code is Neil A. Wilson.
* Portions created by Neil A. Wilson are Copyright (C) 2004-2010.
* Some preexisting portions Copyright (C) 2002-2006 Sun Microsystems, Inc.
* All Rights Reserved.
*
* Contributor(s): Neil A. Wilson
*/
package com.slamd.misc;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
/**
 * This program provides a mechanism to easily split a text file into multiple
 * files based on a number of criteria (specific number of files, number of
 * lines per file, or number of bytes per file).
 * <BR><BR>
 * Output files are named by appending <CODE>.1</CODE>, <CODE>.2</CODE>, and so
 * on to the output base name (which defaults to the input file path).  Files
 * are read with <CODE>FileReader</CODE> and written with
 * <CODE>FileWriter</CODE>, so the platform default character set is used in
 * both directions.
 *
 *
 * @author   Neil A. Wilson
 */
public class SplitFile
{
  /**
   * The split type that indicates the file should be split up into a specific
   * number of files.  The order of the original file will not be preserved.
   */
  public static final int SPLIT_TYPE_NUM_FILES = 1;



  /**
   * The split type that indicates the file should be split up into a specific
   * number of files.  The order of the original file will be preserved.
   */
  public static final int SPLIT_TYPE_NUM_FILES_PRESERVE_ORDER = 2;



  /**
   * The split type that indicates the file should be split up so that there are
   * a maximum number of lines per file.
   */
  public static final int SPLIT_TYPE_NUM_LINES = 3;



  /**
   * The split type that indicates the file should be split up so that there are
   * a maximum number of bytes per file.
   */
  public static final int SPLIT_TYPE_NUM_BYTES = 4;



  // The number of lines to process between progress messages.
  private static final int PROGRESS_INTERVAL = 1000;



  // The maximum number of bytes to include in a file.
  private int bytesPerFile;

  // The maximum number of lines to include in a file.
  private int linesPerFile;

  // The number of files to create.
  private int numFiles;

  // The criteria to use when splitting the file.
  private int splitType;

  // The base path and name of the split files to create.
  private String outputBase;

  // The file to be split.
  private String inputFile;



  /**
   * The main method for this program, which simply invokes the constructor.
   *
   * @param  args  The command line arguments provided to the program.
   */
  public static void main(String[] args)
  {
    new SplitFile(args);
  }



  /**
   * Creates a new instance of this program and coordinates the process of
   * splitting the file.  Invalid or incomplete arguments are reported on
   * standard error together with the usage text, and no splitting is
   * performed.  An I/O failure while splitting is reported on standard error
   * rather than being thrown.
   *
   * @param  args  The command line arguments provided to the program.
   */
  public SplitFile(String[] args)
  {
    // Set default values for the arguments.
    bytesPerFile = -1;
    linesPerFile = -1;
    numFiles     = -1;
    splitType    = -1;
    inputFile    = null;
    outputBase   = null;

    // Parse the command-line arguments.  Any problem has already been reported
    // to the user if this returns false.
    if (! parseArguments(args))
    {
      return;
    }

    // Make sure that a split type was specified.
    if (splitType < 0)
    {
      System.err.println("ERROR: No split mechanism specified");
      displayUsage();
      return;
    }

    // Make sure that an input file was specified.
    if (inputFile == null)
    {
      System.err.println("ERROR: No input file specified");
      displayUsage();
      return;
    }

    // If no output base was specified, then use the input file as the base.
    if (outputBase == null)
    {
      outputBase = inputFile;
    }

    // Split the file as per the user's request.
    try
    {
      switch (splitType)
      {
        case SPLIT_TYPE_NUM_FILES:
          splitToNumFiles();
          break;
        case SPLIT_TYPE_NUM_FILES_PRESERVE_ORDER:
          splitToNumFilesPreservingOrder();
          break;
        case SPLIT_TYPE_NUM_LINES:
          splitByMaxLines();
          break;
        case SPLIT_TYPE_NUM_BYTES:
          splitByMaxBytes();
          break;
      }
    }
    catch (IOException ioe)
    {
      System.err.println("ERROR: Unable to split file -- " + ioe);
    }
  }



  /**
   * Parses the command-line arguments into the fields of this object.  If the
   * same kind of option is given more than once, the last occurrence wins.
   *
   * @param  args  The command line arguments provided to the program.
   *
   * @return  <CODE>true</CODE> if all arguments were accepted, or
   *          <CODE>false</CODE> if usage information was requested or an
   *          argument was invalid (in which case a message has already been
   *          printed).
   */
  private boolean parseArguments(String[] args)
  {
    for (int i=0; i < args.length; i++)
    {
      String arg = args[i];

      if (arg.equals("-H"))
      {
        displayUsage();
        return false;
      }

      boolean numericOption = arg.equals("-b") || arg.equals("-l") ||
                              arg.equals("-n") || arg.equals("-N");
      boolean pathOption    = arg.equals("-i") || arg.equals("-o");
      if (! (numericOption || pathOption))
      {
        System.err.println("ERROR: Unrecognized argument \"" + arg + '"');
        displayUsage();
        return false;
      }

      // Every remaining option requires a value in the next position.
      if ((i+1) >= args.length)
      {
        System.err.println("ERROR: No value provided for argument \"" + arg +
                           '"');
        displayUsage();
        return false;
      }

      String value = args[++i];
      if (arg.equals("-i"))
      {
        inputFile = value;
        continue;
      }
      else if (arg.equals("-o"))
      {
        outputBase = value;
        continue;
      }

      // The value must be a positive integer for all of the split options.  A
      // zero or negative count cannot be satisfied by any split mechanism.
      int intValue;
      try
      {
        intValue = Integer.parseInt(value);
      }
      catch (NumberFormatException nfe)
      {
        System.err.println("ERROR: Value \"" + value + "\" for argument \"" +
                           arg + "\" is not a valid integer");
        displayUsage();
        return false;
      }

      if (intValue <= 0)
      {
        System.err.println("ERROR: Value for argument \"" + arg +
                           "\" must be greater than zero");
        displayUsage();
        return false;
      }

      if (arg.equals("-b"))
      {
        splitType    = SPLIT_TYPE_NUM_BYTES;
        bytesPerFile = intValue;
      }
      else if (arg.equals("-l"))
      {
        splitType    = SPLIT_TYPE_NUM_LINES;
        linesPerFile = intValue;
      }
      else if (arg.equals("-n"))
      {
        splitType = SPLIT_TYPE_NUM_FILES;
        numFiles  = intValue;
      }
      else
      {
        splitType = SPLIT_TYPE_NUM_FILES_PRESERVE_ORDER;
        numFiles  = intValue;
      }
    }

    return true;
  }



  /**
   * Counts the number of lines in the input file.
   *
   * @return  The number of lines in the input file.
   *
   * @throws  IOException  If a problem occurs while counting the lines in the
   *                       file.
   */
  public int countLines()
         throws IOException
  {
    BufferedReader reader = new BufferedReader(new FileReader(inputFile));
    try
    {
      int numLines = 0;

      // readLine returns null only at end of file, which is the reliable way
      // to detect it (ready() merely reports whether a read would block).
      while (reader.readLine() != null)
      {
        numLines++;
      }

      return numLines;
    }
    finally
    {
      reader.close();
    }
  }



  /**
   * Splits the input file into the specified number of output files.  Lines
   * are dealt out to the output files in round-robin fashion, so the order of
   * the original file is not preserved within the set of output files.  All of
   * the output files are created, even if the input has fewer lines than
   * there are files.
   *
   * @throws  IOException  If a problem occurs while splitting the file.
   */
  public void splitToNumFiles()
         throws IOException
  {
    BufferedReader   reader  = new BufferedReader(new FileReader(inputFile));
    BufferedWriter[] writers = new BufferedWriter[numFiles];

    try
    {
      for (int i=0; i < writers.length; i++)
      {
        writers[i] = openOutputFile(i+1);
      }

      int    fileNum  = 0;
      int    numLines = 0;
      String line;
      while ((line = reader.readLine()) != null)
      {
        writers[fileNum].write(line);
        writers[fileNum].newLine();

        numLines++;
        reportProgress(numLines);

        fileNum++;
        if (fileNum >= writers.length)
        {
          fileNum = 0;
        }
      }

      // Close the writers here so that a failure to flush buffered data is
      // reported to the caller rather than being silently discarded.
      for (int i=0; i < writers.length; i++)
      {
        writers[i].close();
      }

      System.out.println("Processed a total of " + numLines + " lines");
    }
    finally
    {
      // Closing an already-closed stream has no effect, so this only matters
      // when an exception interrupted the processing above.
      for (int i=0; i < writers.length; i++)
      {
        closeQuietly(writers[i]);
      }

      closeQuietly(reader);
    }
  }



  /**
   * Splits the input file into exactly the specified number of output files
   * while preserving the order of the lines.  The lines are distributed as
   * evenly as possible:  the sizes of any two output files differ by at most
   * one line, with the larger files coming first.  If the input has fewer lines
   * than the requested number of files, then the trailing files are created
   * empty.
   *
   * @throws  IOException  If a problem occurs while splitting the file.
   */
  private void splitToNumFilesPreservingOrder()
          throws IOException
  {
    int totalLines = countLines();
    int baseLines  = totalLines / numFiles;

    // This many of the leading files need one extra line to use up the
    // remainder of the division.
    int largerFiles = totalLines % numFiles;

    BufferedReader reader = new BufferedReader(new FileReader(inputFile));
    try
    {
      int numLines = 0;
      for (int fileNum=1; fileNum <= numFiles; fileNum++)
      {
        int linesForFile = baseLines;
        if (fileNum <= largerFiles)
        {
          linesForFile++;
        }

        // The last file takes everything that remains so that no line is lost
        // if the input grew after it was counted.
        boolean lastFile = (fileNum == numFiles);

        BufferedWriter writer = openOutputFile(fileNum);
        try
        {
          int    written = 0;
          String line;
          while ((lastFile || (written < linesForFile)) &&
                 ((line = reader.readLine()) != null))
          {
            writer.write(line);
            writer.newLine();
            written++;

            numLines++;
            reportProgress(numLines);
          }

          writer.close();
        }
        finally
        {
          closeQuietly(writer);
        }
      }

      System.out.println("Processed a total of " + numLines + " lines");
    }
    finally
    {
      closeQuietly(reader);
    }
  }



  /**
   * Splits the input file into a number of output files based on the number of
   * lines per file.  The first output file is always created, even if the
   * input file is empty.
   *
   * @throws  IOException  If a problem occurs while splitting the file.
   */
  public void splitByMaxLines()
         throws IOException
  {
    BufferedReader reader = new BufferedReader(new FileReader(inputFile));
    BufferedWriter writer = null;

    try
    {
      int fileNum      = 1;
      int currentLines = 0;
      int numLines     = 0;
      writer = openOutputFile(fileNum);

      String line;
      while ((line = reader.readLine()) != null)
      {
        currentLines++;
        if (currentLines > linesPerFile)
        {
          // The current file is full, so move on to the next one.
          writer.close();

          fileNum++;
          writer       = openOutputFile(fileNum);
          currentLines = 1;
        }

        writer.write(line);
        writer.newLine();

        numLines++;
        reportProgress(numLines);
      }

      writer.close();
      System.out.println("Processed a total of " + numLines + " lines");
    }
    finally
    {
      closeQuietly(writer);
      closeQuietly(reader);
    }
  }



  /**
   * Splits the input file into a number of output files based on the number of
   * bytes per file.  Lines are never broken across files, so a single line
   * that is larger than the limit is written to a file of its own (which will
   * then exceed the limit).  Sizes are measured as the number of bytes each
   * line and its line separator occupy in the platform default character set,
   * which is the encoding used to write the output files.
   *
   * @throws  IOException  If a problem occurs while splitting the file.
   */
  public void splitByMaxBytes()
         throws IOException
  {
    String eol = System.getProperty("line.separator");
    if (eol == null)
    {
      eol = "\n";
    }
    int eolBytes = eol.getBytes().length;

    BufferedReader reader = new BufferedReader(new FileReader(inputFile));
    BufferedWriter writer = null;

    try
    {
      int  fileNum      = 1;
      long currentBytes = 0;
      int  numLines     = 0;
      writer = openOutputFile(fileNum);

      String line;
      while ((line = reader.readLine()) != null)
      {
        // Count encoded bytes rather than characters so that text containing
        // multi-byte characters does not overshoot the limit.
        long lineBytes = ((long) line.getBytes().length) + eolBytes;

        // Only start a new file if the current one already holds something;
        // otherwise an oversized line would leave an empty file behind.
        if ((currentBytes > 0) && ((currentBytes + lineBytes) > bytesPerFile))
        {
          writer.close();

          fileNum++;
          writer       = openOutputFile(fileNum);
          currentBytes = 0;
        }

        writer.write(line);
        writer.write(eol);
        currentBytes += lineBytes;

        numLines++;
        reportProgress(numLines);
      }

      writer.close();
      System.out.println("Processed a total of " + numLines + " lines");
    }
    finally
    {
      closeQuietly(writer);
      closeQuietly(reader);
    }
  }



  /**
   * Opens the output file with the given number for writing, replacing any
   * existing file with the same name.
   *
   * @param  fileNum  The number to append to the output base name.
   *
   * @return  A writer for the output file.
   *
   * @throws  IOException  If the output file cannot be opened.
   */
  private BufferedWriter openOutputFile(int fileNum)
          throws IOException
  {
    return new BufferedWriter(new FileWriter(outputBase + '.' + fileNum));
  }



  /**
   * Writes a progress message to standard output each time another
   * {@link #PROGRESS_INTERVAL} lines have been processed.
   *
   * @param  numLines  The total number of lines processed so far.
   */
  private static void reportProgress(int numLines)
  {
    if ((numLines % PROGRESS_INTERVAL) == 0)
    {
      System.out.println("Processed " + numLines + " lines");
    }
  }



  /**
   * Closes the provided stream, ignoring any error.  This is intended for
   * cleanup in <CODE>finally</CODE> blocks, where a failure to close must not
   * hide the exception that is already propagating.
   *
   * @param  stream  The stream to close.  It may be <CODE>null</CODE>.
   */
  private static void closeQuietly(java.io.Closeable stream)
  {
    if (stream == null)
    {
      return;
    }

    try
    {
      stream.close();
    }
    catch (IOException ignored)
    {
      // Nothing useful can be done here; the caller is already either
      // finished with the stream or reporting an earlier failure.
    }
  }



  /**
   * Prints usage information for this program to standard error.
   */
  public void displayUsage()
  {
    System.err.println("Available Options:");
    System.err.println("-i {path} -- The path to the input file");
    System.err.println("-o {path} -- The path to the base name of the output " +
                       "file");
    System.err.println("-n {num}  -- The number of files to create");
    System.err.println("-N {num}  -- The number of files to create (preserve " +
                       "line order)");
    System.err.println("-l {num}  -- Splits into files of at most {num} lines");
    System.err.println("-b {num}  -- Splits into files of at most {num} bytes");
    System.err.println("-H        -- Displays usage information for this " +
                       "program");
  }
}