package eairoldi.utils;

import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.transform.*;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;

import org.apache.log4j.Logger;

/**
 * Utilities to convert Minorthird Datasets into the file formats
 * expected by external tools (LDA, Tetrad).
 *
 * @author Edoardo Airoldi
 * Date: Dec 20, 2004
 */
public class ConversionUtilities
{
    static private Logger log = Logger.getLogger(ConversionUtilities.class);

    /** Converts a Minorthird Dataset into LDA format. */
    static public void saveLdaFile(Dataset data, String filename, boolean makeBinary)
    {
        if (makeBinary) {
            System.out.println("Transforming :: Counts to 0/1");
            MakeBinaryTransform filter = new MakeBinaryTransform();
            InstanceTransform t = filter.batchTrain(data);
            data = t.transform(data);
        }
        BasicFeatureIndex fidx = new BasicFeatureIndex(data);
        try {
            System.out.println("Printing :: Data to file");
            File outFILE = new File(filename+".data-lda.txt");
            File wordFILE = new File(filename+".words-lda.txt");
            PrintStream out = new PrintStream(new FileOutputStream(outFILE));
            PrintStream word = new PrintStream(new FileOutputStream(wordFILE));

            int len = fidx.featureIterator().estimatedSize();
            System.out.println("Total words :: "+len);

            // assign a stable integer index to each feature, and record the mapping
            ArrayList num2word = new ArrayList();
            for (Feature.Looper fl=fidx.featureIterator(); fl.hasNext();) {
                Feature ft = fl.nextFeature();
                num2word.add(ft);
                int idx = num2word.indexOf(ft);
                word.println(idx+" "+ft);
            }

            // one line per example: <#features> <index>:<count> ...
            for (Example.Looper el=data.iterator(); el.hasNext();) {
                Example ex = el.nextExample();
                len = ex.featureIterator().estimatedSize();
                StringBuffer line = new StringBuffer(len+"");
                for (Feature.Looper fl=ex.featureIterator(); fl.hasNext();) {
                    Feature ft = fl.nextFeature();
                    int idx = num2word.indexOf(ft);
                    int wgt = (int) ex.getWeight(ft);
                    line.append(" "+idx+":"+wgt);
                }
                out.println(line);
            }
            out.close();
            word.close();
        }
        catch (Exception e) {
            log.error(e, e);
            System.exit(1);
        }
    }
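    // Illustrative shape of the two files written by saveLdaFile above
    // (a sketch reconstructed from the code, not from external LDA docs):
    // each line of <filename>.data-lda.txt is
    //   <numDistinctFeatures> <featureIndex>:<count> <featureIndex>:<count> ...
    // e.g. "3 0:1 5:2 12:1", while <filename>.words-lda.txt maps each
    // integer index back to its feature, one "<index> <feature>" per line.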
    /** Saves train/test versions of a Dataset in Tetrad format. */
    static public void saveTetradFile(Dataset data, String filename, boolean makeBinary)
    {
        Dataset[] d = new Dataset[]{data};
        String[] fn = new String[]{filename};
        saveTetradFile(d, fn, makeBinary);
    }

    /** Merges Dataset[], and then saves train/test versions in Tetrad format. */
    static public void saveTetradFile(Dataset[] data, String[] filename, boolean makeBinary)
    {
        if (makeBinary) {
            MakeBinaryTransform filter = new MakeBinaryTransform();
            for (int i=0; i<data.length; i++) {
                InstanceTransform t = filter.batchTrain(data[i]);
                data[i] = t.transform(data[i]);
            }
            System.out.println("Transform :: Counts to 0/1");
        }
        Dataset allData = ConversionUtilities.composeDatasets(data);
        ExampleSchema schema = allData.getSchema();
        int numberOfClasses = schema.getNumberOfClasses();
        //String[] classLabels = new String[numberOfClasses];
        //for (int i=0; i<numberOfClasses; i++) { classLabels[i]=schema.getClassName(i); }

        // train version
        BasicFeatureIndex fidx = new BasicFeatureIndex(allData);
        try {
            for (int k=0; k<data.length; k++) {
                System.out.println(filename[k]+".train.ttd");
                File outFILE = new File(filename[k]+".train.ttd");
                PrintStream out = new PrintStream(new FileOutputStream(outFILE));

                out.println("/discretevars");
                StringBuffer secondLine = new StringBuffer("ClassificationLabel:");
                for (int i=0; i<(numberOfClasses-1); i++) {
                    secondLine.append(i+"="+i+",");
                }
                secondLine.append((numberOfClasses-1)+"="+(numberOfClasses-1));
                out.println(secondLine);

                int ftCnt = 0;
                for (Feature.Looper fl=fidx.featureIterator(); fl.hasNext();) {
                    fl.nextFeature();
                    ftCnt++;
                    if (makeBinary) {
                        out.println(" V"+ftCnt+":0=0,1=1");
                    } else {
                        // only binary features are supported by the Tetrad export
                        System.out.println("Not implemented yet!");
                        System.exit(1);
                    }
                }

                out.println("/discretedata");
                StringBuffer line = new StringBuffer("ClassificationLabel");
                ftCnt = 0;
                for (Feature.Looper fl=fidx.featureIterator(); fl.hasNext();) {
                    fl.nextFeature();
                    ftCnt++;
                    line.append(" V"+ftCnt);
                    //line.append(" "+fl.nextFeature());
                }
                out.println(line);

                ExampleSchema s = allData.getSchema();
                for (Example.Looper el=data[k].iterator(); el.hasNext();) {
                    Example ex = el.nextExample();
                    int label = s.getClassIndex(ex.getLabel().bestClassName());
                    //System.out.println( ex.getLabel().bestClassName()+" "+label+" "+s.getClassIndex( ex.getLabel().bestClassName() ));
                    //System.out.println( ex.getSource()+", "+ex.getSubpopulationId() );
                    StringBuffer lineBuffer = new StringBuffer(""+label);
                    for (Feature.Looper i=fidx.featureIterator(); i.hasNext();) {
                        int wgt = (int) ex.getWeight(i.nextFeature());
                        lineBuffer.append(" "+wgt);
                    }
                    out.println(lineBuffer);
                }
                out.close();
            }
        }
        catch (Exception e) {
            log.error(e, e);
            System.exit(1);
        }

        // test version
        try {
            for (int k=0; k<data.length; k++) {
                System.out.println(filename[k]+".test.ttd");
                File outFILE = new File(filename[k]+".test.ttd");
                PrintStream out = new PrintStream(new FileOutputStream(outFILE));

                out.println("/discretedata");
                StringBuffer line = new StringBuffer("ClassificationLabel");
                int ftCnt = 0;
                for (Feature.Looper fl=fidx.featureIterator(); fl.hasNext();) {
                    fl.nextFeature();
                    ftCnt++;
                    line.append(" V"+ftCnt);
                    //line.append(" "+fl.nextFeature());
                }
                out.println(line);

                ExampleSchema s = allData.getSchema();
                for (Example.Looper el=data[k].iterator(); el.hasNext();) {
                    Example ex = el.nextExample();
                    int label = s.getClassIndex(ex.getLabel().bestClassName());
                    //System.out.println( ex.getSource()+", "+ex.getSubpopulationId() );
                    StringBuffer lineBuffer = new StringBuffer(""+label);
                    for (Feature.Looper i=fidx.featureIterator(); i.hasNext();) {
                        int wgt = (int) ex.getWeight(i.nextFeature());
                        lineBuffer.append(" "+wgt);
                    }
                    out.println(lineBuffer);
                }
                out.close();
            }

            // print the V<n>-to-Feature mapping to screen
            int ftCnt = 0;
            for (Feature.Looper fl=fidx.featureIterator(); fl.hasNext();) {
                Feature ft = fl.nextFeature();
                ftCnt++;
                System.out.println(" V"+ftCnt+" = "+ft);
            }
        }
        catch (Exception e) {
            log.error(e, e);
            System.exit(1);
        }
    }

    /** Makes a Dataset by unioning those passed in as arguments. */
    private static Dataset composeDatasets(Dataset[] data)
    {
        Dataset d = new BasicDataset();
        for (int i=0; i<data.length; i++) {
            BasicFeatureIndex fidx = new BasicFeatureIndex(data[i]);
            System.out.println("Dataset n."+i+" :: examples="+data[i].size()+", features="+fidx.numberOfFeatures());
            for (Example.Looper j=data[i].iterator(); j.hasNext();) {
                d.add(j.nextExample());
            }
        }
        BasicFeatureIndex fidx = new BasicFeatureIndex(d);
        System.out.println("Dataset all :: examples="+d.size()+", features="+fidx.numberOfFeatures());
        return d;
    }
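    // Illustrative layout of a .ttd file emitted by saveTetradFile above
    // with makeBinary=true (a sketch reconstructed from the code, not from
    // the Tetrad documentation); the test version omits the /discretevars
    // block:
    //   /discretevars
    //   ClassificationLabel:0=0,1=1
    //    V1:0=0,1=1
    //    V2:0=0,1=1
    //   /discretedata
    //   ClassificationLabel V1 V2
    //   0 1 0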
    //
    // Test Conversion Utilities
    //
    static public void main(String[] args)
    {
        String PATH = "C:\\Archive-Projects\\PNAS\\pnas\\plaintext\\";
        File fin = new File(PATH+"pnas.abs.data.3rd");
        Dataset din = new BasicDataset();
        try {
            din = DatasetLoader.loadFile(fin);
        } catch (Exception x) {
            log.error(x, x);
            System.exit(1);
        }
        String filename = PATH+"pnas.abs";
        ConversionUtilities.saveLdaFile(din, filename, false);

        /*try {
            Dataset d = DatasetLoader.loadFile( new File("/Users/eairoldi/cmu.research/8.Text.Learning.Group/src.MISC/movie-data.3rd") );
            System.out.println("applying frequency filter ...");
            OrderBasedTransformLearner f1 = new OrderBasedTransformLearner( "document" );
            InstanceTransform t1 = f1.batchTrain( d );
            //((OrderBasedInstanceTransform)t1).setNumberOfFeatures( 100 ); // 100 is default value
            d = t1.transform( d );

            System.out.println("applying binary filter ...");
            MakeBinaryTransform f2 = new MakeBinaryTransform();
            InstanceTransform t2 = f2.batchTrain(d);
            d = t2.transform(d);
            //System.out.println("binary data ::\n"+d);

            System.out.println("applying tfidf filter ...");
            TFIDFTransformLearner f3 = new TFIDFTransformLearner();
            InstanceTransform t3 = f3.batchTrain(d);
            d = t3.transform(d);
            System.out.println("tfidf data ::\n"+d);

            BasicFeatureIndex fi = new BasicFeatureIndex(d);
            System.out.println("#ft = "+fi.numberOfFeatures() );
        } catch (Exception x) {;}
        System.exit(0); */

        /*String PATH = "/Users/eairoldi/cmu.research/Xue/xue.ROY/";
        File fin = new File(PATH+"roy-data-fin.3rd");
        File ma = new File(PATH+"roy-data-ma.3rd");
        File mix = new File(PATH+"roy-data-mix.3rd");
        Dataset[] data = new Dataset[3];
        try {
            data[0] = DatasetLoader.loadFile(fin);
            data[1] = DatasetLoader.loadFile(ma);
            data[2] = DatasetLoader.loadFile(mix);
        } catch (Exception x) { log.error(x,x); System.exit(1); }
        PATH = PATH + "roy-cross-topic/";
        String[] filenames = new String[]{PATH+"Vs.roy-fin",PATH+"Vs.roy-ma",PATH+"Vs.roy-mix"};
        ConversionUtilities.saveTetradFile(data,filenames,true);*/

        //Dataset allData = Utilities.composeDatasets(data); // DEBUG

        /*String PATH = "/Users/eairoldi/cmu.research/8.Text.Learning.Group/src.MISC/";
        File d1 = new File(PATH+"data1.m3rd");
        File d2 = new File(PATH+"data2.m3rd");
        File d3 = new File(PATH+"data3.m3rd");
        Dataset[] data = new Dataset[3];
        try {
            data[0] = DatasetLoader.loadFile(d1);
            data[1] = DatasetLoader.loadFile(d2);
            data[2] = DatasetLoader.loadFile(d3);
        } catch (Exception x) { log.error(x,x); System.exit(1); }
        String[] filenames = new String[]{PATH+"data1-out",PATH+"data2-out",PATH+"data3-out"};
        Utilities.saveTetradFile(data,filenames,true);*/
    }
}