package com.antbrains.crf.hadoop; import java.io.BufferedReader; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import com.antbrains.crf.CompactedTroveFeatureDict; import com.antbrains.crf.DATrieFeatureDict; import com.antbrains.crf.FeatureDict; import com.antbrains.crf.TroveFeatureDict; import de.ruedigermoeller.serialization.FSTObjectOutput; public class FeatureDictReader { public static void main(String[] args) throws IOException { if (args.length < 2) { System.out.println("Usage FeatureDictReader inDir outDict"); System.exit(-1); } Configuration conf = new Configuration(); for (int i = 2; i < args.length; i++) { System.out.println("add resource: " + args[i]); conf.addResource(new Path(args[i])); } CompressionCodecFactory factory = new CompressionCodecFactory(conf); // conf.addResource(new // Path("/home/lili/soft/hadoop/conf/core-site.xml")); FileSystem fs = FileSystem.get(conf); Path inFile = new Path(args[0]); FileStatus[] status = fs.listStatus(inFile); // FeatureDict fd=new TroveFeatureDict(102400); FeatureDict fd = new CompactedTroveFeatureDict(102400); int lineNum = 0; for (FileStatus stat : status) { if (stat.isDir()) { System.out.println("ignore subdir: " + stat.getPath().toString()); } else { Path f = stat.getPath(); if (f.getName().startsWith("_")) { System.out.println("ignore file: " + f.toString()); } else { System.out.println(new java.util.Date() + " process: " + f.toString()); CompressionCodec codec = factory.getCodec(f); InputStream stream = null; // check if we have a compression codec we need to use if (codec != null) { stream = codec.createInputStream(fs.open(f)); } else { stream = fs.open(f); } BufferedReader br = null; try { br = new BufferedReader(new InputStreamReader(stream, "UTF8")); String line; while ((line = br.readLine()) != null) { String[] arr = line.split("\t"); fd.get(arr[0], true); lineNum++; } } finally { if (br != null) { br.close(); } } } } } System.out.println(lineNum + "=?=" + fd.size()); FSTObjectOutput foo = null; try { foo = new FSTObjectOutput(new FileOutputStream(args[1])); foo.writeObject(fd); } finally { if (foo != null) { foo.close(); } } } }