import edu.cmu.minorthird.util.*;
import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.text.gui.*;
import edu.cmu.minorthird.text.learn.*;
import edu.cmu.minorthird.text.learn.experiments.*;
import edu.cmu.minorthird.text.mixup.*;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.experiments.*;
import edu.cmu.minorthird.classify.algorithms.linear.*;
import edu.cmu.minorthird.classify.algorithms.trees.*;
import java.util.*;
import java.util.regex.*;
import java.io.*;
import org.apache.log4j.*;
// 'from' => ["Y-caption.txt"],
// 'produces' => ["proteinInPanelLabeled Y","cellInPanelLabeled Y", "Y-label.txt"],
/**
 * Processes figure captions: runs the learned image-pointer annotators and the
 * cell / protein / scoping mixup programs over a caption, associates each
 * protein or cell span with the image pointers whose scope contains it, and
 * writes the resulting facts and the set of panel labels to files.
 *
 * <p>Class initialization loads <code>scope.mixup</code>, <code>cell.mixup</code>
 * and <code>protein.mixup</code> from the working directory and the serialized
 * filters <code>lib/regionalFilter.ser</code> and <code>lib/localFilter.ser</code>;
 * if any of these fail to load, an IllegalStateException is thrown from the
 * static initializer.
 *
 * <p>An instance keeps the intermediate results of the most recent
 * {@link #processLabels} call in its fields, which are reassigned on each call.
 */
public class CaptionProcessor
{
    static final boolean DEBUG = true;

    static { Mixup.maxNumberOfMatchesPerToken = 20; }

    // Number of times main() re-runs processLabels on each input line.
    // NOTE(review): this looks like a leftover benchmarking loop (it only
    // prints timings) -- confirm whether a single pass is what is wanted.
    private static final int TIMING_REPETITIONS = 100;

    // span types produced by the learned annotators; "<type>Scope" names the matching scope type
    private static final String[] PTR_TYPES = { "local", "regional" };

    // span types of the entities that get associated with image pointers
    private static final String[] ENTITY_TYPES = { "protein", "cell" };

    // A range of panel labels such as 'A-C' or 'b - d', and a single-letter label.
    // These are deliberately NOT wrapped in ".*": they are applied with
    // Matcher.find(), and a greedy ".*" prefix would swallow the line so that
    // only the last occurrence was ever reported.
    private static final Pattern UPPER_RANGE = Pattern.compile("\\b([A-Z])\\s*-\\s*([A-Z])\\b");
    private static final Pattern LOWER_RANGE = Pattern.compile("\\b([a-z])\\s*-\\s*([a-z])\\b");
    private static final Pattern SINGLE_LETTER = Pattern.compile("\\b([A-Za-z])\\b");

    private static Annotator regionalAnnotator,localAnnotator;
    private static MixupProgram scopingProgram, cellTypeProgram, proteinProgram;

    static {
        try {
            System.out.println("load scope.mixup..");
            scopingProgram = new MixupProgram(new File("scope.mixup"));
            System.out.println("load cell.mixup...");
            cellTypeProgram = new MixupProgram(new File("cell.mixup"));
            System.out.println("load protein.mixup...");
            proteinProgram = new MixupProgram(new File("protein.mixup"));
            System.out.println("load learned annotators...");
            regionalAnnotator = loadAnnotator("regional");
            localAnnotator = loadAnnotator("local");
        } catch (Exception e) {
            // keep the original exception as the cause so its stack trace is not lost
            IllegalStateException wrapped = new IllegalStateException("mixup or io error: "+e);
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    //
    // private data (all reassigned by each processLabels call)
    //

    // scope span -> image pointer span found inside that scope
    private Map imgPtrForScope;
    // every image pointer span, in the order encountered
    private List imagePtrList;
    // image pointer span -> TreeSet of the one-letter labels it denotes
    private Map imagePtrDefinition;
    // every label denoted by any image pointer
    private Set allLabels;
    // list of 3-element lists: (entity type, image pointer span, entity span)
    private List imagePtrEntityPairs;

    /**
     * Processes a single caption given as a string. The caption is loaded
     * as a document with id "theCaption" and handed to {@link #processLabels}.
     *
     * @param caption text of the caption
     * @param proteinFile file to which protein facts are written
     * @param cellFile file to which cell facts are written
     * @param labelFile file to which the panel labels are written
     */
    public void processCaption(String caption,String proteinFile,String cellFile, String labelFile)
    {
        BasicTextBase base = new BasicTextBase();
        base.loadDocument("theCaption",caption);
        MonotonicTextLabels labels = new BasicTextLabels(base);
        processLabels(labels,proteinFile,cellFile,labelFile);
    }

    /**
     * Annotates the given labels, associates entities with image pointers,
     * and writes the three output files. Failures while writing the output
     * files are reported on stdout/stderr and do not propagate.
     *
     * @param labels labels to annotate; modified in place
     * @param proteinFile file to which protein facts are written
     * @param cellFile file to which cell facts are written
     * @param labelFile file to which the panel labels are written, one per line
     */
    public void processLabels(MonotonicTextLabels labels,String proteinFile,String cellFile, String labelFile)
    {
        annotate(labels);
        collectImagePointers(labels);
        associateEntities(labels);
        expandImagePointers();
        try {
            writeFacts( "protein", proteinFile );
            writeFacts( "cell", cellFile );
            writeLabels( labelFile );
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Error: "+e.toString());
        }
    }

    // run feature construction, the learned annotators, and the mixup programs, in that order
    private void annotate(MonotonicTextLabels labels)
    {
        MixupInterpreter interp = new MixupInterpreter();
        if (DEBUG) System.out.println("feature construction...");
        interp.setProgram(LearnImagePtrExtractor.featureProgram);
        interp.eval(labels);
        if (DEBUG) System.out.println("regional annotator...");
        regionalAnnotator.annotate(labels);
        if (DEBUG) System.out.println("local annotator...");
        localAnnotator.annotate(labels);
        if (DEBUG) System.out.println("finding cells...");
        interp.setProgram(cellTypeProgram);
        interp.eval(labels);
        if (DEBUG) System.out.println("finding proteins...");
        interp.setProgram(proteinProgram);
        interp.eval(labels);
        if (DEBUG) System.out.println("scoping...");
        interp.setProgram(scopingProgram);
        interp.eval(labels);
    }

    // figure out which image pointer 'owns' which scope
    private void collectImagePointers(TextLabels labels)
    {
        imgPtrForScope = new TreeMap();
        imagePtrList = new ArrayList();
        for (int i=0; i<PTR_TYPES.length; i++) {
            for (Span.Looper j=labels.instanceIterator(PTR_TYPES[i]); j.hasNext(); ) {
                Span imgPtrSpan = j.nextSpan();
                imagePtrList.add(imgPtrSpan);
                Span scopeSpan = findContainingSpan(imgPtrSpan, labels, PTR_TYPES[i]+"Scope");
                if (scopeSpan!=null) imgPtrForScope.put( scopeSpan, imgPtrSpan );
            }
        }
    }

    // figure out which entities belong to which scopes
    private void associateEntities(TextLabels labels)
    {
        imagePtrEntityPairs = new ArrayList();
        for (int i=0; i<ENTITY_TYPES.length; i++) {
            for (Span.Looper j=labels.instanceIterator(ENTITY_TYPES[i]); j.hasNext(); ) {
                Span entitySpan = j.nextSpan();
                if (DEBUG) System.out.println("associating "+entitySpan);
                // find scope of each type containing span and associate imgPtrForScope(scope) & span
                for (int k=0; k<PTR_TYPES.length; k++) {
                    Span containingScope = findContainingSpan(entitySpan,labels,PTR_TYPES[k]+"Scope");
                    Span owner = containingScope==null ? null : (Span)imgPtrForScope.get(containingScope);
                    if (owner!=null) {
                        associate(owner, ENTITY_TYPES[i], entitySpan);
                    } else {
                        if (DEBUG) System.out.println(" - not in "+PTR_TYPES[k]+" scope");
                    }
                }
                // stuff in global scope is associated with all img ptrs
                Span globalScope = findContainingSpan(entitySpan,labels,"globalScope");
                if (globalScope!=null) {
                    if (DEBUG) System.out.println(" - in global scope");
                    String id = globalScope.getDocumentId();
                    for (int k=0; k<PTR_TYPES.length; k++) {
                        for (Span.Looper el=labels.instanceIterator(PTR_TYPES[k],id); el.hasNext(); ) {
                            associate(el.nextSpan(), ENTITY_TYPES[i], entitySpan);
                        }
                    }
                }
            }
        }
    }

    // expand out the 'definition' (set of one-letter labels) of every image pointer
    private void expandImagePointers()
    {
        imagePtrDefinition = new TreeMap();
        allLabels = new TreeSet();
        for (Iterator i=imagePtrList.iterator(); i.hasNext(); ) {
            Span span = (Span)i.next();
            String string = span.asString();
            Matcher upper = UPPER_RANGE.matcher(string);
            while (upper.find()) defineRange(span,string,upper);
            Matcher lower = LOWER_RANGE.matcher(string);
            while (lower.find()) defineRange(span,string,lower);
            Matcher letter = SINGLE_LETTER.matcher(string);
            while (letter.find()) defineLetter(span,string,letter);
        }
    }

    // write every label in allLabels to the label file, one per line
    private void writeLabels(String labelFile) throws IOException
    {
        PrintStream s = new PrintStream(new FileOutputStream(new File(labelFile)));
        try {
            for (Iterator i=allLabels.iterator(); i.hasNext(); ) {
                s.println( (String) i.next() );
            }
        } finally {
            s.close();
        }
    }

    // write out entity-related facts: one tab-separated line per
    // (entity, label) combination for the recorded pairs of the given entity type
    private void writeFacts(String entityType, String fileName) throws IOException
    {
        // provenance string emitted as a column of every fact line
        String myself = "caption/parseCaption.pl";
        PrintStream s = new PrintStream(new FileOutputStream(new File(fileName)));
        try {
            for (Iterator i=imagePtrEntityPairs.iterator(); i.hasNext(); ) {
                List pair = (List)i.next();
                String type = (String)pair.get(0);
                if (!entityType.equals(type)) continue;
                Span imgPtrSpan = (Span)pair.get(1);
                Span entitySpan = (Span)pair.get(2);
                Set labelsOfPtr = (Set)imagePtrDefinition.get(imgPtrSpan);
                // an image pointer whose text contained no recognizable label has no definition
                if (labelsOfPtr==null) continue;
                String figureId = imgPtrSpan.getDocumentId(); // set by caller!
                String entityName = entitySpan.asString();
                for (Iterator j=labelsOfPtr.iterator(); j.hasNext(); ) {
                    String label = (String)j.next();
                    s.println(entityType+"InPanelLabeled\t"+figureId+"\tstring://"+entityName+"\tstring://"+label
                              +"\t"+myself+"\t1");
                }
            }
        } finally {
            s.close();
        }
    }

    // look up, creating if necessary, the label set that defines an image pointer
    private Set labelSetFor(Span span)
    {
        Set set = (Set)imagePtrDefinition.get(span);
        if (set==null) imagePtrDefinition.put( span, (set=new TreeSet()) );
        return set;
    }

    // define the semantics of an imgPtrSpan of the form 'b-c':
    // every letter from group 1 through group 2 of the match, inclusive
    private void defineRange(Span span, String string, Matcher matcher)
    {
        char lo = string.charAt(matcher.start(1));
        char hi = string.charAt(matcher.start(2));
        Set set = labelSetFor(span);
        // a reversed range such as 'c-a' contributes no labels
        for (char ch=lo; ch<=hi; ch++) {
            String label = String.valueOf(ch);
            set.add( label );
            allLabels.add( label );
        }
    }

    // define the semantics of an imgPtrSpan of the form 'a'
    private void defineLetter(Span span, String string, Matcher matcher)
    {
        String label = String.valueOf( string.charAt(matcher.start(1)) );
        labelSetFor(span).add( label );
        // record it globally too, as defineRange does, so the label file lists it
        allLabels.add( label );
    }

    // associate an imgPtrSpan with an entity
    private void associate(Span imgPtrSpan,String entityType,Span entitySpan)
    {
        if (DEBUG) System.out.println("img ptr: "+imgPtrSpan+" entity: "+entitySpan);
        List pair = new ArrayList(3);
        pair.add(entityType);
        pair.add(imgPtrSpan);
        pair.add(entitySpan);
        imagePtrEntityPairs.add(pair);
    }

    // find a span of given type, in the same document as s, containing s;
    // returns the first one the iterator yields, or null if there is none
    private Span findContainingSpan(Span s,TextLabels labels,String type)
    {
        String id = s.getDocumentId();
        for (Span.Looper j=labels.instanceIterator(type,id); j.hasNext(); ) {
            Span t = j.nextSpan();
            if (t.contains(s)) return t;
        }
        return null;
    }

    // load classifier learned by LearnImagePtrExtractor (from lib/<className>Filter.ser)
    // and make it into an annotator that outputs spans of type <className>
    private static Annotator loadAnnotator(String className) throws IOException
    {
        BinaryClassifier filter = (BinaryClassifier)IOUtil.loadSerialized(new File("lib/"+className+"Filter.ser"));
        SpanFeatureExtractor fe = new ImgPtrFE();
        SpanFinder candidateFinder = LearnImagePtrExtractor.candidateFinder;
        return new FinderAnnotator( new FilteredFinder(filter,fe,candidateFinder), className );
    }

    /**
     * Main program.  Reads lines from standard input of the form
     * <pre>
     * X X-captionContent.txt X-proteinInPanelLabeled.FACTS X-cellInPanelLabeled.FACTS X-label.txt
     *</pre>
     * where X is used as the document id, each remaining word is a file name,
     * and the last three files are created from the caption file.
     * Lines that do not have exactly five fields, or whose caption file cannot
     * be read, are reported and skipped.  If the first command-line argument
     * starts with "-interact", a viewer is launched after each line.
     */
    static public void main(String argv[]) throws IOException,Mixup.ParseException
    {
        boolean interactive = argv.length>0 && argv[0].startsWith("-interact");
        System.out.println("creating caption processor...");
        CaptionProcessor cp = new CaptionProcessor();
        System.out.println("caption processor created, interactive="+interactive);
        String line = null;
        LineNumberReader reader = new LineNumberReader(new BufferedReader(new InputStreamReader(System.in)));
        while ((line = reader.readLine())!=null) {
            String[] fileNames = line.split("\\s+");
            if (fileNames.length!=5) {
                System.out.println("Error: expected two input, three output files");
                continue;
            }
            String caption = loadFileContent( fileNames[1]) ;
            if (caption==null) {
                // loadFileContent has already reported the underlying exception
                System.out.println("Error: could not load caption file "+fileNames[1]);
                continue;
            }
            BasicTextBase base = new BasicTextBase();
            base.loadDocument( fileNames[0], caption );
            MutableTextLabels labels = new BasicTextLabels(base);
            for (int i=0; i<TIMING_REPETITIONS; i++) {
                long start = System.currentTimeMillis();
                System.out.println("processing labels with "+base.size()+" docs");
                cp.processLabels(labels,fileNames[2],fileNames[3],fileNames[4]);
                long end = System.currentTimeMillis();
                System.out.println("processing time was "+((end-start)/1000.0)+" sec");
            }
            if (interactive) {
                System.out.println("launching viewer...");
                TextBaseEditor.edit( labels, null );
            }
        }
    }

    // read a whole file into a string, terminating every line with "\n";
    // on any failure the error is printed and null is returned
    static private String loadFileContent(String fileName) {
        LineNumberReader bReader = null;
        try {
            bReader = new LineNumberReader(new BufferedReader(new FileReader(fileName)));
            StringBuffer buf = new StringBuffer("");
            String line = null;
            while ((line = bReader.readLine()) != null) {
                buf.append(line);
                buf.append("\n");
            }
            return buf.toString();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Error: "+e.toString());
            return null;
        } finally {
            if (bReader!=null) {
                // a failure to close must not mask the result or the original error
                try { bReader.close(); } catch (IOException ignored) { }
            }
        }
    }
}