package eairoldi.utils;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.transform.*;
import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.text.learn.SpanFeatureExtractor;
import edu.cmu.minorthird.text.learn.FeatureBuffer;
import edu.cmu.minorthird.text.learn.SpanFE;
import java.io.File;
import java.io.PrintStream;
import java.io.FileOutputStream;
import java.util.Set;
import java.util.TreeSet;
import org.apache.log4j.Logger;
/**
 * Builds a MinorThird {@link Dataset} from a directory of PNAS abstracts,
 * optionally filters its features, and writes the dataset and its feature
 * list to disk. Also provides a pairwise similarity computation between
 * examples.
 *
 * User: Edoardo M. Airoldi (eairoldi@cs.cmu.edu)
 * Date: Feb 16, 2005
 */
public class MakeData
{
    private static final Logger log = Logger.getLogger(MakeData.class);

    /** Directory used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_PATH = "C:\\Archive-Projects\\PNAS\\pnas\\plaintext\\";

    /**
     * Span types that are looked up on every document; each one found becomes
     * the class label of an example.
     *
     * NOTE(review): "Geophyics" looks like a misspelling of "Geophysics" (which
     * is also listed); presumably it is kept to match label names present in
     * the labels file -- confirm before removing.
     */
    private static final String[] PNAS_TOPICS = new String[] {
        "AppliedBiologicalSciences", "SocialSciences",
        "Pharmacology", "Geophysics", "Psychology",
        "Astronomy", "EconomicSciences", "AppliedMathematics",
        "PoliticalSciences", "Engineering", "Anthropology-BS",
        "U.S.FrontiersOfScienceSymposium",
        "ChineseamericanFrontiersOfScienceSymposium",
        "MedicalSciences", "Mathematics", "Immunology",
        "Microbiology", "Ecology",
        "German-americanFrontiersOfScienceSymposium",
        "ComputerSciences", "ResearchArticles", "AgriculturalSciences",
        "Neurobiology", "AppliedPhysicalSciences",
        "FrontiersOfScienceSymposium", "Anthropology",
        "Psychology-PS", "Psychology-BS", "Evolution",
        "PlantBiology", "Physiology", "Physics",
        "PopulationBiology", "Statistics", "CellBiology",
        "JapaneseamericanFrontiersOfScienceSymposium",
        "Biochemistry", "Geophyics", "Chemistry", "Biophysics",
        "Genetics", "DevelopmentalBiology", "Introduction", "Geology"};

    /**
     * Loads one document per file from <code>dirFile</code>, imports labels from
     * <code>envFile</code>, and turns every document span into one example per
     * topic type it carries (a span with several topic types yields several
     * examples).
     *
     * The JVM is terminated with status 1 if a document carries none of the
     * known topic types, or if any exception is raised while loading.
     *
     * @param dirFile  directory holding the documents, one per file
     * @param envFile  file with the label operations to import
     * @param dataFile not used by this method; kept for interface compatibility
     * @param wordFile not used by this method; kept for interface compatibility
     * @return the dataset of labeled bag-of-words examples
     */
    public static Dataset make(File dirFile, File envFile, File dataFile, File wordFile)
    {
        Dataset d = new BasicDataset();
        try {
            // load the documents and labels
            System.out.println("Load Texts");
            TextBaseLoader loader = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE, false);
            TextBase base = loader.load( dirFile );
            MutableTextLabels labels = new BasicTextLabels(base);
            new TextLabelsLoader().importOps(labels, base, envFile);

            // set up a simple bag-of-words feature extractor
            SpanFeatureExtractor fe = new SpanFeatureExtractor()
            {
                public Instance extractInstance(TextLabels labels, Span s) {
                    FeatureBuffer buf = new FeatureBuffer(labels, s);
                    SpanFE.from(s,buf).tokens().eq().lc().punk().emit();
                    return buf.getInstance();
                }
                public Instance extractInstance(Span s) {
                    return extractInstance(null,s);
                }
            };

            // extract features and create a dataset
            System.out.println("Extract Features");
            for (Span.Looper i = base.documentSpanIterator(); i.hasNext();) {
                Span s = i.nextSpan();
                boolean found = false;
                for (int t = 0; t < PNAS_TOPICS.length; t++) {
                    if ( labels.hasType(s, PNAS_TOPICS[t]) ) {
                        d.add(new Example(fe.extractInstance(labels,s), new ClassLabel(PNAS_TOPICS[t])));
                        // a binary variant would use a +1/-1 label with BinaryExample instead
                        found = true;
                    }
                }
                if (!found) {
                    // every document is expected to carry at least one known topic
                    System.out.println("error: missing label!");
                    System.out.print(s.toString());
                    System.exit(1);
                }
            }
        } catch (Exception e) {
            log.error(e, e);
            System.exit(1);
        }
        return d;
    }

    /**
     * Applies the feature filter named by <code>f</code> to <code>data</code>.
     *
     * Recognized names are "T1", "Freq", "Info-Gain" and "Top"; any other value
     * (including <code>null</code> or the empty string) leaves the dataset
     * untouched and prints "No Filter was used".
     *
     * @param data the dataset to filter
     * @param f    the filter name
     * @return the transformed dataset, or <code>data</code> itself when no filter applies
     */
    private static Dataset filterDataset(Dataset data, String f)
    {
        // constant-first equals() so that a null filter name selects "no filter"
        // instead of throwing a NullPointerException
        if ("T1".equals(f))
        {
            System.out.println("Filter Features with T1");
            T1InstanceTransformLearner filter = new T1InstanceTransformLearner();
            filter.setREF_LENGTH(660.0);
            //filter.setPDF("Negative-Binomial");
            T1InstanceTransform t1stat = (T1InstanceTransform)filter.batchTrain( data );
            t1stat.setALPHA(0.05);
            t1stat.setMIN_WORDS(50); //t1stat.setMAX_WORDS(10000);
            t1stat.setSAMPLE(2500);
            data = t1stat.transform( data );
        }
        else if ("Freq".equals(f))
        {
            int minFreq = 3;
            String model = "document"; // alternative: "word"
            System.out.println("Filter Features by Frequency");
            FrequencyBasedTransformLearner filter = new FrequencyBasedTransformLearner( minFreq,model );
            AbstractInstanceTransform ait = (AbstractInstanceTransform)filter.batchTrain( data );
            data = ait.transform( data );
        }
        else if ("Info-Gain".equals(f))
        {
            int featureToKeep = 10;
            String model = "document"; // alternative: "word"
            System.out.println("Filter Features with Info-Gain");
            InfoGainTransformLearner filter = new InfoGainTransformLearner( model );
            InfoGainInstanceTransform infoGain = (InfoGainInstanceTransform)filter.batchTrain( data );
            infoGain.setNumberOfFeatures( featureToKeep );
            data = infoGain.transform( data );
        }
        else if ("Top".equals(f))
        {
            int featureToKeep = 50000;
            String model = "word"; // alternative: "document"
            System.out.println("Filter Features with Top="+featureToKeep);
            OrderBasedTransformLearner filter = new OrderBasedTransformLearner( model );
            OrderBasedInstanceTransform orderBased = (OrderBasedInstanceTransform)filter.batchTrain( data );
            orderBased.setNumberOfFeatures( featureToKeep );
            data = orderBased.transform( data );
        }
        else
        {
            System.out.println("No Filter was used");
        }
        return data;
    }

    /**
     * Fills a square matrix with {@link #distance(Example, Example)} for every
     * ordered pair of examples in <code>data</code>, in iteration order. Prints
     * a progress line ("ex = row") for each row.
     *
     * (The method name is misspelled but kept for compatibility with callers.)
     *
     * @param data the dataset whose examples are compared
     * @return a <code>data.size()</code> by <code>data.size()</code> matrix where
     *         entry [i][j] compares the i-th and j-th examples
     */
    public static double[][] computeDisctance(Dataset data)
    {
        int n = data.size();
        double[][] d = new double[n][n];
        int row = 0;
        for ( Example.Looper i = data.iterator(); i.hasNext(); row++ )
        {
            System.out.println("ex = " + row);
            Example exi = i.nextExample();
            // restart the column index for each row rather than relying on
            // modular wrap-around of a shared counter
            int col = 0;
            for ( Example.Looper j = data.iterator(); j.hasNext(); col++ )
            {
                Example exj = j.nextExample();
                d[row][col] = distance(exi, exj);
            }
        }
        return d;
    }

    /**
     * Computes the normalized dot product (cosine similarity) of the feature
     * weights of two examples, taken over the union of their features.
     *
     * Despite the method name, larger values mean the examples are more alike.
     * When either example has no non-zero weight the result is 0.0 (instead of
     * the NaN a 0/0 division would give).
     *
     * @param exi first example
     * @param exj second example
     * @return sum(wi*wj) / (sqrt(sum(wi^2)) * sqrt(sum(wj^2))), or 0.0 when the
     *         denominator is zero
     */
    public static double distance(Example exi, Example exj)
    {
        Instance ini = exi.asInstance();
        Instance inj = exj.asInstance();

        // union of the features present in either instance
        Set features = new TreeSet();
        for (Feature.Looper i = ini.featureIterator(); i.hasNext();)
        {
            features.add( i.nextFeature() );
        }
        for (Feature.Looper j = inj.featureIterator(); j.hasNext();)
        {
            features.add( j.nextFeature() );
        }

        double num = 0.0;
        double sumSqI = 0.0;
        double sumSqJ = 0.0;
        for ( Feature.Looper k = new Feature.Looper( features.iterator() ); k.hasNext(); )
        {
            Feature f = k.nextFeature();
            double wi = weightOrZero(exi, f);
            double wj = weightOrZero(exj, f);
            num += wi * wj;
            sumSqI += wi * wi;
            sumSqJ += wj * wj;
        }

        double denom = Math.sqrt(sumSqI) * Math.sqrt(sumSqJ);
        if (denom == 0.0) {
            // at least one all-zero vector: similarity is undefined, report 0
            return 0.0;
        }
        return num / denom;
    }

    /** Returns the weight of <code>f</code> in <code>ex</code>, or 0.0 if the lookup fails. */
    private static double weightOrZero(Example ex, Feature f)
    {
        try {
            return ex.getWeight(f);
        } catch (Exception ignored) {
            // deliberate best effort: a feature that cannot be looked up
            // contributes nothing to the similarity
            return 0.0;
        }
    }

    /**
     * Builds the dataset from the PNAS abstracts, optionally filters it, saves
     * it, and writes one feature per line to the word file.
     *
     * Exits with status 0 on success and status 1 if any step throws.
     *
     * @param argv optional: <code>argv[0]</code> is the directory containing
     *             "abstracts" and "plaintext.abs.env"; when absent the built-in
     *             default directory is used
     */
    public static void main(String[] argv) {
        int exitCode = 0;
        try {
            String path = DEFAULT_PATH;
            if (argv != null && argv.length > 0 && argv[0].length() > 0) {
                path = argv[0];
                // file names are appended directly, so make sure a separator is present
                if (!path.endsWith(File.separator) && !path.endsWith("/")) {
                    path = path + File.separator;
                }
            }
            File dirFile = new File(path+"abstracts");
            File envFile = new File(path+"plaintext.abs.env");
            File dataFile = new File(path+"pnas.abs.data.3rd");
            File wordFile = new File(path+"pnas.abs.words.txt");

            Dataset d = MakeData.make(dirFile,envFile,dataFile,wordFile);
            d = filterDataset(d,""); // Filter can be "T1", "Freq", "Info-Gain", or "Top"

            BasicFeatureIndex fidx = new BasicFeatureIndex(d);
            System.out.println( "Dataset:\n # examples = "+d.size() );
            System.out.println( " # features = "+fidx.numberOfFeatures() );
            DatasetLoader.save( d,dataFile );

            PrintStream out = new PrintStream(new FileOutputStream(wordFile));
            try {
                for (Feature.Looper i = fidx.featureIterator(); i.hasNext(); )
                {
                    Feature f = i.nextFeature();
                    out.println( f );
                }
            } finally {
                // release the file handle even if writing fails part-way
                out.close();
            }
        } catch (Exception x) {
            // report the failure instead of exiting silently with a success code
            log.error(x, x);
            exitCode = 1;
        }
        System.exit(exitCode);
    }
}