package joshua.discriminative.feature_related;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import joshua.corpus.vocab.BuildinSymbol;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.hypergraph.DiskHyperGraph;
import joshua.decoder.hypergraph.HyperGraph;
import joshua.discriminative.DiscriminativeSupport;
import joshua.discriminative.FileUtilityOld;
import joshua.discriminative.feature_related.feature_template.EdgeBigramFT;
import joshua.discriminative.feature_related.feature_template.FeatureTemplate;
import joshua.discriminative.feature_related.feature_template.NgramFT;
import joshua.discriminative.feature_related.feature_template.TMFT;

/**
 * Command-line driver that reads hypergraphs from disk, runs
 * {@code FeatureExtractionHG.featureExtractionOnHG} on each of them with a
 * restricted feature set loaded from a file, and finally writes the
 * accumulated feature table to {@code <featureFile>.intersection}.
 *
 * <p>Expected arguments (all are trimmed before use):
 * <ol start="0">
 *   <li>file listing the hypergraph "items" files</li>
 *   <li>file listing the hypergraph "rules" files</li>
 *   <li>file listing the number of sentences stored in each hypergraph file</li>
 *   <li>data-selection file, passed to {@code DiscriminativeSupport.setupDataSelTbl}</li>
 *   <li>boolean: add a {@code TMFT} feature template</li>
 *   <li>boolean: add an {@code NgramFT} feature template</li>
 *   <li>boolean: add an {@code EdgeBigramFT} template (only consulted when argument 5 is false)</li>
 *   <li>feature-set file; also the prefix of the output file name</li>
 * </ol>
 */
public class FeatureIntersection {

	/** Usage text printed when too few arguments are given; mirrors the order in which {@link #main} reads them. */
	private static final String USAGE =
		"wrong command, correct command should be: java joshua.discriminative.feature_related.FeatureIntersection"
		+ " lf_train_items lf_train_rules f_l_num_sents f_data_sel"
		+ " use_tm_feat use_lm_feat use_edge_bigram_feat_only f_feature_set";

	/**
	 * Entry point; see the class comment for the meaning of each argument.
	 *
	 * @param args command-line arguments, at least eight are required
	 * @throws NumberFormatException if an entry of the sentence-count list is not an integer
	 */
	public static void main(String[] args) {

		//##read configuration information
		if (args.length < 8) {
			System.out.println(USAGE);
			System.out.println("num of args is "+ args.length);
			for (String arg : args) {
				System.out.println("arg is: " + arg);
			}
			// NOTE(review): exit status 0 on a usage error is kept as-is so that
			// existing wrapper scripts are unaffected; a non-zero status would be more conventional.
			System.exit(0);
		}

		String f_l_train_items = args[0].trim();
		String f_l_train_rules = args[1].trim();
		String f_l_num_sents = args[2].trim();
		String f_data_sel = args[3].trim();
		// parseBoolean yields true only for "true" (case-insensitive), exactly like the
		// deprecated Boolean(String) constructor it replaces.
		boolean useTMFeat = Boolean.parseBoolean(args[4].trim());
		boolean useLMFeat = Boolean.parseBoolean(args[5].trim());
		boolean useEdgeNgramOnly = Boolean.parseBoolean(args[6].trim());
		String featureFile = args[7].trim();

		// NOTE(review): the original author marked the next two values as uncertain; confirm
		// that they match the settings used when the hypergraphs were written to disk.
		boolean saveModelCosts = false;
		int ngramStateID = 0;

		SymbolTable p_symbol = new BuildinSymbol(null);

		//##setup feature templates list
		// The concrete collection types below are kept as declared originally because the
		// signatures of the project helpers receiving them are not visible from this file.
		ArrayList<FeatureTemplate> featTemplates = new ArrayList<FeatureTemplate>();
		boolean useIntegerString = false;
		boolean useRuleIDName = false;

		if (useTMFeat) {
			FeatureTemplate ft = new TMFT(p_symbol, useIntegerString, useRuleIDName);
			featTemplates.add(ft);
		}

		int baseline_lm_order = 3;//TODO
		if (useLMFeat) {
			FeatureTemplate ft = new NgramFT(p_symbol, false, ngramStateID, baseline_lm_order, 1, 2);//TODO: unigram and bi gram
			featTemplates.add(ft);
		} else if (useEdgeNgramOnly) {//exclusive with use_lm_feat
			FeatureTemplate ft = new EdgeBigramFT(p_symbol, ngramStateID, baseline_lm_order, useIntegerString);
			featTemplates.add(ft);
		}
		System.out.println("feature template are " + featTemplates.toString());

		List<String> l_file_train_items = DiscriminativeSupport.readFileList(f_l_train_items);
		List<String> l_file_train_rules = DiscriminativeSupport.readFileList(f_l_train_rules);
		List<String> l_num_sents = DiscriminativeSupport.readFileList(f_l_num_sents);
		HashMap<Integer, Boolean> tbl_sent_selected = DiscriminativeSupport.setupDataSelTbl(f_data_sel);//for data selection

		HashSet<String> restrictedFeatureSet = new HashSet<String>();
		HashMap<String,Double> featureIntersectionSet = new HashMap<String,Double>();

		// args[7].trim() can never be null, so the original null test made this guard
		// unreachable; an empty name is the only way the feature set can be "unspecified".
		if (featureFile.isEmpty()) {
			System.out.println("Must specify feature set");
			System.exit(0);
		}
		DiscriminativeSupport.loadFeatureSet(featureFile, restrictedFeatureSet);

		//#####begin to process the hypergraph files
		int sentID = 0; // running sentence counter across all files, used for progress output
		for (int fid = 0; fid < l_file_train_items.size(); fid++) {
			System.out.println("############Process file id " + fid);
			DiskHyperGraph diskHG = new DiskHyperGraph(p_symbol, ngramStateID, saveModelCosts, null);
			diskHG.initRead(l_file_train_items.get(fid), l_file_train_rules.get(fid), tbl_sent_selected);

			int total_num_sent = Integer.parseInt(l_num_sents.get(fid));
			for (int i = 0; i < total_num_sent; i++) {
				System.out.println("#Process sentence " + sentID);
				HyperGraph hg = diskHG.readHyperGraph();
				if (hg != null) {//sent is not skipped
					FeatureExtractionHG.featureExtractionOnHG(hg, featureIntersectionSet, restrictedFeatureSet, featTemplates);
				}
				// counted even when the hypergraph was skipped
				sentID++;
			}
		}

		FileUtilityOld.printHashTblAboveThreshold(featureIntersectionSet, featureFile+".intersection", false, 0, false, false, null);
	}
}