package edu.stanford.nlp.parser.ensemble.utils;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class FileUtils {
public static BufferedWriter openForWriting(String filename)
throws IOException {
OutputStream stream = new FileOutputStream(filename);
if (filename.endsWith(".gz")) {
stream = new GZIPOutputStream(stream);
}
return new BufferedWriter(new OutputStreamWriter(stream));
}
public static BufferedReader openForReading(String filename)
throws IOException {
InputStream stream = new FileInputStream(filename);
if (filename.endsWith(".gz")) {
stream = new GZIPInputStream(stream);
}
return new BufferedReader(new InputStreamReader(stream));
}
/**
* Represents a basic CoNLL file (just words and parts of speech) along with
* the corresponding parses by different parsers.
*
* @author dmcc
*/
public static class BaseFilenameAndParses {
private String baseFilename;
private String[] parses;
public BaseFilenameAndParses(String baseFilename, String[] parses) {
this.baseFilename = baseFilename;
this.parses = parses;
}
public String getBaseFilename() {
return baseFilename;
}
public void setBaseFilename(String baseFilename) {
this.baseFilename = baseFilename;
}
public String[] getParses() {
return parses;
}
public void setParses(String[] parses) {
this.parses = parses;
}
@Override
public String toString() {
return "BaseFilenameAndParses [baseFilename=" + baseFilename
+ ", parses=" + Arrays.toString(parses) + "]";
}
}
public static List<BaseFilenameAndParses> getParseFilesFromDir(String input) {
List<BaseFilenameAndParses> results = new ArrayList<BaseFilenameAndParses>();
File inputDirectory = new File(input);
// first, find all the simple filenames (ones with words and tags only)
String[] wordsAndTagsFilenames = inputDirectory.list(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(".parse.gz");
}
});
/*
* given the simple filenames, we find out which filenames start with
* them since parses using the words and tags filenames use them as a
* prefix.
*/
for (final String wordsAndTagsFilename : wordsAndTagsFilenames) {
String[] allParses = inputDirectory.list(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.startsWith(wordsAndTagsFilename)
&& !name.equals(wordsAndTagsFilename);
}
});
BaseFilenameAndParses singleEntry = new BaseFilenameAndParses(
input + "/" + wordsAndTagsFilename, allParses);
results.add(singleEntry);
}
return results;
}
/**
* Split the *.parse.gz files in one directory evenly among output
* directories. We create the output directories (output/[division number])
* and print commands which create softlinks from the input files to the
* output files.
*
* @param input base directory for input (should contain *.parse.gz files)
* @param output base directory for output (subdirectories for each division
* will be made inside this)
* @param divisions number of divisions
* @throws IOException
*/
public static void evenlySplitFilesInDirectory(String input, String output,
int divisions) throws IOException {
// make output directories
File[] outputSubDirs = new File[divisions];
for (int i = 0; i < divisions; i++) {
File outputDir = new File(output, Integer.toString(i));
outputDir.mkdirs();
outputSubDirs[i] = outputDir;
}
int currentOutputSubDir = 0;
for (BaseFilenameAndParses bfap : getParseFilesFromDir(input)) {
File inputFile = new File(bfap.baseFilename);
File outputFile = new File(outputSubDirs[currentOutputSubDir],
inputFile.getName());
System.out.format("ln -s %s %s\n", inputFile.getAbsolutePath(),
outputFile.getAbsolutePath());
currentOutputSubDir++;
currentOutputSubDir %= divisions;
}
}
public static void main(String[] args) throws IOException {
String baseDir = "/home/mcclosky/data/gigaword-selected/apw_eng/100";
String outputDir = "/home/mcclosky/data/gigaword-selected/apw_eng/100-split";
evenlySplitFilesInDirectory(baseDir, outputDir, 6);
}
}