package edu.umd.hooka;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
/**
 * Scans a bitext stored as Hadoop sequence files and writes a serialized
 * {@link Metadata} summary of it.
 */
public class CreateMetadata {

  /**
   * Reads every (IntWritable, PhrasePair) record found under {@code bitextPath},
   * collects three statistics and writes them, wrapped in a {@link Metadata}
   * object, to {@code resultPath} using Java object serialization.
   *
   * <p>The statistics passed to {@code new Metadata(sc, ec, fc)} are:
   * <ul>
   *   <li>{@code sc} - the number of records read;</li>
   *   <li>{@code ec} - the largest value seen in {@code getE().getWords()};</li>
   *   <li>{@code fc} - the largest value seen in {@code getF().getWords()}.</li>
   * </ul>
   * Both maxima start at 0, so they stay 0 when no positive word id is seen.
   *
   * @param bitextPath location handed to
   *        {@code SequenceFileOutputFormat.getReaders} to open the bitext
   * @param resultPath file that is created to receive the serialized metadata
   * @throws IOException if the file system or readers cannot be opened, or if
   *         writing the metadata fails
   * @throws RuntimeException if an {@link IOException} occurs while iterating
   *         over the records (the original exception is attached as the cause)
   */
  public static void GenerateMetadata(Path bitextPath, Path resultPath) throws IOException
  {
    System.out.println(bitextPath.toString());
    JobConf conf = new JobConf(CreateMetadata.class);
    FileSystem fileSys = FileSystem.get(conf);

    // Read from the caller-supplied location rather than a fixed debug path.
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, bitextPath);

    IntWritable key = new IntWritable();
    PhrasePair value = new PhrasePair();
    int sc = 0;
    int ec = 0;
    int fc = 0;
    try {
      for (SequenceFile.Reader reader : readers) {
        while (reader.next(key, value)) {
          sc = sc + 1;
          for (int word : value.getE().getWords()) {
            if (word > ec) {
              ec = word;
            }
          }
          for (int word : value.getF().getWords()) {
            if (word > fc) {
              fc = word;
            }
          }
        }
      }
    } catch (IOException e) {
      // Kept as an unchecked exception for existing callers, but chain the
      // cause so the original stack trace is not lost.
      throw new RuntimeException("IO exception: " + e.getMessage(), e);
    } finally {
      // Release every reader, even when iteration failed part-way through.
      for (SequenceFile.Reader reader : readers) {
        try {
          reader.close();
        } catch (IOException ignored) {
          // Best-effort cleanup: a failure to close a reader we only read
          // from must not mask the primary result or exception.
        }
      }
    }

    Metadata theMetadata = new Metadata(sc, ec, fc);
    ObjectOutputStream mdstream =
        new ObjectOutputStream(new BufferedOutputStream(fileSys.create(resultPath)));
    try {
      mdstream.writeObject(theMetadata);
    } finally {
      // Always close so the output file handle is released even if
      // serialization throws.
      mdstream.close();
    }
  }
}