package edu.cmu.minorthird.text; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.StringTokenizer; import java.util.TreeSet; import org.apache.log4j.Logger; import edu.cmu.minorthird.util.ProgressCounter; import edu.cmu.minorthird.util.StringUtil; /** * Loads and saves the contents of a TextLabels into a file. * * Labels can be loaded from operations (see importOps) or from a serialized * TextLabels object. Labels can be serialized or types can be saved as * operations, xml, or plain lists. * * @author William Cohen */ public class TextLabelsLoader{ private static Logger log=Logger.getLogger(TextLabelsLoader.class); /** * Spans in labels are a complete list of all spans. */ static final public int CLOSE_ALL_TYPES=1; /** * If a document has been labeled for a type, assume all spans of that type * are there. */ static final public int CLOSE_TYPES_IN_LABELED_DOCS=2; /** Make no assumptions about closure. */ static final public int DONT_CLOSE_TYPES=3; static final public int CLOSE_BY_OPERATION=4; public static final String[] CLOSURE_NAMES= {"CLOSE_ALL_TYPES","CLOSE_TYPES_IN_LABELED_DOCS","DONT_CLOSE_TYPES", "CLOSE_BY_OPERATION"}; private int closurePolicy=CLOSE_BY_OPERATION; private int warnings=0; static private final int MAX_WARNINGS=10; /** * Set the closure policy. * * @param policy * one of CLOSE_ALL_TYPES, CLOSE_TYPES_IN_LABELED_DOCS, * DONT_CLOSE_TYPES */ public void setClosurePolicy(int policy){ this.closurePolicy=policy; } /** * Create a new labeling by importing from a file with importOps. */ public MutableTextLabels loadOps(TextBase base,File file) throws IOException, FileNotFoundException{ MutableTextLabels labels=new BasicTextLabels(base); importOps(labels,base,file); return labels; } /** * Load lines modifying a TextLabels from a file. There are four allowed * operations: addToType, closeType, closeAllTypes, setClosure * * For addToType: The lines must be of the form: * <code>addToType ID LOW LENGTH TYPE</code> where ID is a documentID in the * given TextBase, LOW is a character index into that document, and LENGTH is * the length in characters of the span that will be created as given type * TYPE. If LENGTH==-1, then the created span will go to the end of the * document. * * For closeType: Lines must be <code>closeType ID TYPE</code> where ID is a * documentID in the given TextBase and TYPE is the label type to close over * that document. * * For closeAllTypes: Lines must be <code>closeAllType ID</code> where ID is * a documentID in the given TextBase. The document will be closed for all * types present in the TextLabels <em>after all operations</em> are * performed. * * For setClosure: Lines must be <code>setClosure POLICY</code> where POLICY * is one of the policy types defined in this class. It will immediately * change the closure policy for the loader. This is best used at the * beginning of the file to indicate one of the generic policies or the * CLOSE_BY_OPERATION (default) policy. */ public void importOps(MutableTextLabels labels,TextBase base,File file) throws IOException,FileNotFoundException{ base=labels.getTextBase(); if(base==null) throw new IllegalStateException( "TextBase attached to labels must not be null"); LineNumberReader in=new LineNumberReader(new FileReader(file)); String line=null; List<String> docList=new ArrayList<String>(); try{ while((line=in.readLine())!=null){ if(line.trim().length()==0) continue; if(line.startsWith("#")) continue; log.debug("read line #"+in.getLineNumber()+": "+line); StringTokenizer tok=new StringTokenizer(line); String op; try{ op=advance(tok,in,file); }catch(IllegalArgumentException e){ throw getNewException(e,", failed to find operation."); } if("addToType".equals(op)){ addToType(tok,in,file,base,labels); }else if("setSpanProp".equals(op)){ setSpanProp(tok,in,file,base,labels); }else if("closeType".equals(op)){ String docId=advance(tok,in,file); String type=advance(tok,in,file); Span span=base.documentSpan(docId); if(span!=null){ labels.closeTypeInside(type,span); log.debug("closed "+type+" on "+docId); }else{ warnings++; if(warnings<MAX_WARNINGS){ log.warn("unknown id '"+docId+"' in closeType"); }else if(warnings==MAX_WARNINGS){ log.warn("there will be no more warnings of this sort given"); } } }else if("closeAllTypes".equalsIgnoreCase(op)){ String docId=advance(tok,in,file); docList.add(docId); }else{ throw new IllegalArgumentException("error on line "+ in.getLineNumber()+" of "+file.getName()); } } // close over the doc list for all types seen for(int i=0;i<docList.size();i++){ String docId=(String)docList.get(i); Span span=base.documentSpan(docId); closeLabels(labels.getTypes(),labels,span); } }catch(IllegalArgumentException e){ throw getNewException(e," on line: "+line); } in.close(); closeLabels(labels,closurePolicy); } private void addToType(StringTokenizer tok,LineNumberReader in,File file, TextBase base,MutableTextLabels labels){ String id=advance(tok,in,file); String loStr=advance(tok,in,file); String lenStr=advance(tok,in,file); String type=advance(tok,in,file); String confidence=tok.hasMoreTokens()?advance(tok,in,file):null; int lo,len; try{ lo=Integer.parseInt(loStr); len=Integer.parseInt(lenStr); Span span=base.documentSpan(id); if(span==null){ warnings++; if(warnings<MAX_WARNINGS){ log.warn("unknown id '"+id+"' in addToType "+lo+" "+len); }else if(warnings==MAX_WARNINGS){ log.warn("there will be no more warnings of this sort given"); } }else{ Details details=null; if(confidence!=null) details=new Details(StringUtil.atof(confidence)); if(lo==0&&len<0) labels.addToType(span,type,details); else{ // shortcut: char offsets "0 -1" means the whole document if(len<0) len=span.asString().length()-lo; labels.addToType(span.charIndexSubSpan(lo,lo+len),type,details); } } }catch(NumberFormatException e){ throw new IllegalArgumentException("bad number on line "+ in.getLineNumber()+" of "+file.getName()); } } private void setSpanProp(StringTokenizer tok,LineNumberReader in,File file, TextBase base,MutableTextLabels labels){ String id=advance(tok,in,file); String loStr=advance(tok,in,file); String lenStr=advance(tok,in,file); String prop=advance(tok,in,file); String value=advance(tok,in,file); int lo,len; try{ lo=Integer.parseInt(loStr); len=Integer.parseInt(lenStr); Span span=base.documentSpan(id); if(span==null){ warnings++; if(warnings<MAX_WARNINGS){ log.warn("unknown id '"+id+"'"); }else if(warnings==MAX_WARNINGS){ log.warn("there will be no more warnings of this sort given"); } }else{ if(lo==0&&len<0) labels.setProperty(span,prop,value); else{ if(len<0) len=span.asString().length()-lo; labels.setProperty(span.charIndexSubSpan(lo,lo+len),prop,value); } } }catch(NumberFormatException e){ throw new IllegalArgumentException("bad number on line "+ in.getLineNumber()+" of "+file.getName()); } } private static IllegalArgumentException getNewException( IllegalArgumentException e,String addToMsg){ String msg=e.getMessage()+addToMsg; StackTraceElement[] trace=e.getStackTrace(); IllegalArgumentException exception=new IllegalArgumentException(msg); exception.setStackTrace(trace); return exception; } private String advance(StringTokenizer tok,LineNumberReader in,File file){ if(!tok.hasMoreTokens()) throw new IllegalArgumentException("error on line "+in.getLineNumber()+ " of "+file.getName()+" failed to find token"); return tok.nextToken(); } /** * Close labels on the labels according to the policy. This applies the same * policy to all documents and types in the labels. To get finer control of * closure use closeLabels(Set, MutableTextLabels, Span) or * MutableTextLabels.closeTypeInside(...) * * @param labels * @param policy */ public void closeLabels(MutableTextLabels labels,int policy){ Set<String> types=labels.getTypes(); TextBase base=labels.getTextBase(); switch(policy){ case CLOSE_ALL_TYPES: for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){ Span document=i.next(); closeLabels(types,labels,document); } break; case CLOSE_TYPES_IN_LABELED_DOCS: Set<Span> labeledDocs=new TreeSet<Span>(); for(Iterator<String> j=types.iterator();j.hasNext();){ String type=j.next(); for(Iterator<Span> i=labels.instanceIterator(type);i.hasNext();){ Span span=i.next(); labeledDocs.add(span.documentSpan()); } } for(Iterator<Span> i=labeledDocs.iterator();i.hasNext();){ Span document=i.next(); closeLabels(types,labels,document); } break; case DONT_CLOSE_TYPES: // do nothing for this break; case CLOSE_BY_OPERATION: // already closed in theory break; default: log.warn("closure policy("+policy+") not recognized"); } } /** * Close all types in the typeSet on the given document * * @param typeSet * set of types to close for this document * @param labels * TextLabels holding the types * @param document * Span to close types over */ private void closeLabels(Set<String> types,MutableTextLabels labels, Span document){ for(Iterator<String> j=types.iterator();j.hasNext();){ String type=j.next(); labels.closeTypeInside(type,document); } } /** Read in a serialized TextLabels. */ public MutableTextLabels loadSerialized(File file,TextBase base) throws IOException,FileNotFoundException{ try{ ObjectInputStream in= new ObjectInputStream(new BufferedInputStream(new FileInputStream( file))); MutableTextLabels labels=(MutableTextLabels)in.readObject(); labels.setTextBase(base); in.close(); return labels; }catch(ClassNotFoundException e){ throw new IllegalArgumentException("can't read TextLabels from "+file+ ": "+e); } } /** Serialize a TextLabels. */ public void saveSerialized(MutableTextLabels labels,File file) throws IOException{ ObjectOutputStream out= new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream( file))); out.writeObject(labels); out.flush(); out.close(); } /** Save extracted data in a format readable with loadOps. */ public String printTypesAsOps(TextLabels labels){ StringBuffer out=new StringBuffer(); ProgressCounter pc= new ProgressCounter("saving labels","type",labels.getTypes().size()); for(Iterator<String> i=labels.getTypes().iterator();i.hasNext();){ String type=i.next(); ProgressCounter pc2=new ProgressCounter("saving type "+type,"span"); for(Iterator<Span> j=labels.instanceIterator(type);j.hasNext();){ Span s=j.next(); if(s.size()>0){ int lo=s.getTextToken(0).getLo(); int hi=s.getTextToken(s.size()-1).getHi(); Details details=labels.getDetails(s,type); if(details==null||details==Details.DEFAULT){ out.append("addToType "+s.getDocumentId()+" "+lo+" "+(hi-lo)+" "+ type+"\n"); }else{ out.append("addToType "+s.getDocumentId()+" "+lo+" "+(hi-lo)+" "+ type+" "+details.getConfidence()+"\n"); } }else{ warnings++; if(warnings<MAX_WARNINGS){ log.warn("forgetting label on empty span type "+type+": "+s); }else if(warnings==MAX_WARNINGS){ log.warn("there will be no more warnings of this sort given"); } } pc2.progress(); } pc2.finished(); Iterator<Span> it=labels.closureIterator(type); while(it.hasNext()){ Span s=it.next(); Span doc=s.documentSpan(); if(s.size()!=doc.size()){ throw new UnsupportedOperationException( "can't save environment with closureSpans!=docSpans"); } out.append("closeType "+s.getDocumentId()+" "+type+"\n"); } pc.progress(); } pc.finished(); ProgressCounter pc3= new ProgressCounter("saving labels","property",labels .getSpanProperties().size()); for(Iterator<String> i=labels.getSpanProperties().iterator();i.hasNext();){ String prop=i.next(); for(Iterator<Span> j=labels.getSpansWithProperty(prop);j.hasNext();){ Span s=j.next(); if(s.size()>0){ String val=labels.getProperty(s,prop); int lo=s.getTextToken(0).getLo(); int hi=s.getTextToken(s.size()-1).getHi(); out.append("setSpanProp "+s.getDocumentId()+" "+lo+" "+(hi-lo)+" "+ prop+" "+val+"\n"); } } pc3.progress(); } pc3.finished(); return out.toString(); } /** Save extracted data in a format readable with loadOps. */ public void saveTypesAsOps(TextLabels labels,File file) throws IOException{ PrintStream out=new PrintStream(new FileOutputStream(file)); out.println(printTypesAsOps(labels)); out.close(); } /** * Save spans of given type into the file, one per line. Linefeeds in strings * are replaced with spaces. */ public void saveTypesAsStrings(TextLabels labels,File file, boolean includeOffset) throws IOException{ PrintStream out=new PrintStream(new FileOutputStream(file)); // Do types for(Iterator<String> j=labels.getTypes().iterator();j.hasNext();){ String type=j.next(); for(Iterator<Span> i=labels.instanceIterator(type);i.hasNext();){ Span span=i.next(); if(span.size()>0){ out.print(type); if(includeOffset){ out.print(":"+span.getDocumentId()+":"+span.getTextToken(0).getLo()+ ":"+span.getTextToken(span.size()-1).getHi()); } out.println("\t"+span.asString().replace('\n',' ')); } } } // Do props for(Iterator<String> i=labels.getSpanProperties().iterator();i.hasNext();){ String prop=i.next(); for(Iterator<Span> j=labels.getSpansWithProperty(prop);j.hasNext();){ Span span=j.next(); if(span.size()>0){ String val=labels.getProperty(span,prop); if(!prop.equals("_prediction")){ out.print(prop); out.print("="); } out.print(val); if(includeOffset){ out.print(":"+span.getDocumentId()+":"+span.getTextToken(0).getLo()+ ":"+span.getTextToken(span.size()-1).getHi()); } out.println("\t"+span.asString().replace('\n',' ')); } } } out.close(); } /** * Save documents to specified directory with extracted types embedded as xml. */ public void saveDocsWithEmbeddedTypes(TextLabels labels,File dir) throws IOException{ Span currDoc; Iterator<Span> looper=labels.getTextBase().documentSpanIterator(); PrintStream out; if(dir.exists()){ log.warn(dir+" already exists, some files may be overwritten."); } else if(!dir.mkdir()){ throw new IOException("Could not create directory named: "+dir); } while(looper.hasNext()){ // this call returns the entire document with all labels embedded as xml currDoc=looper.next(); out= new PrintStream(new FileOutputStream(new File(dir+"/"+ currDoc.getDocumentId()))); out.println(createXMLmarkup(currDoc.getDocumentId(),labels)); out.close(); } } /** * Save extracted data in an XML format. Convert to string * <root>..<type>...</type>..</root>. <br> * <br> * In the even that labels overlap such as [A (B C] D)E an * IllegalArgumentException is thrown because a well-formed XML document * cannot be created. */ public String createXMLmarkup(String documentId,TextLabels labels){ Span docSpan=labels.getTextBase().documentSpan(documentId); String docString= labels.getTextBase().documentSpan(documentId).getDocumentContents(); // Put all labels and their info in a list List<LabelInfo> unsortedLabels=new ArrayList<LabelInfo>(); // Do types for(Iterator<String> i=labels.getTypes().iterator();i.hasNext();){ String type=i.next(); for(Iterator<Span> j=labels.instanceIterator(type,documentId);j.hasNext();){ Span s=j.next(); int start=s.documentSpanStartIndex(); int end=start+s.size()-1; unsortedLabels.add(new LabelInfo(s,type,start,end)); } } // Do props for(Iterator<String> i=labels.getSpanProperties().iterator();i.hasNext();){ String prop=i.next(); for(Iterator<Span> j=labels.getSpansWithProperty(prop,documentId);j.hasNext();){ Span s=j.next(); String val=labels.getProperty(s,prop); int start=s.documentSpanStartIndex(); int end=start+s.size()-1; if(prop.equals("_prediction")){ unsortedLabels.add(new LabelInfo(s,val,start,end)); } else{ unsortedLabels.add(new LabelInfo(s,prop+"."+val,start,end)); } } } // Sort the labels. If two spans are overlapping then throw an exception List<LabelInfo> sortedLabels= new ArrayList<LabelInfo>(unsortedLabels.size()); while(unsortedLabels.size()>0){ LabelInfo curLabel=unsortedLabels.remove(0); int position=-1; boolean overlap=false; // Iterate through sortedLabels for(int j=0;j<sortedLabels.size();j++){ LabelInfo compLabel=(LabelInfo)sortedLabels.get(j); // Find if there is an overlap if((curLabel.start<compLabel.start&&curLabel.end>compLabel.start)&& (curLabel.end<compLabel.end)) overlap=true; else if((curLabel.start>compLabel.start&&curLabel.start<compLabel.end)&& (curLabel.end>compLabel.end)) overlap=true; // Find position if((curLabel.start<compLabel.start)|| ((curLabel.start==compLabel.start)&&(curLabel.end>=compLabel.end))){ position=j; break; } } // If the label overlapped with another label, then throw an exception if(overlap) throw new IllegalArgumentException( "Labels contain overalpping spans, cannot save as XML format."); // Otherwise add the label to the proper position in the sorted list. if(position>-1) sortedLabels.add(position,curLabel); else sortedLabels.add(curLabel); } // Create sorted list of tags List<TagInfo> sortedTags=new ArrayList<TagInfo>(sortedLabels.size()*2); for(int i=0;i<sortedLabels.size();i++){ LabelInfo label=sortedLabels.get(i); sortedTags.add(new TagInfo(label.start,"<"+label.type+">",true)); } boolean added=false; while(sortedLabels.size()>0){ LabelInfo label=sortedLabels.remove(0); added=false; for(int y=0;y<sortedTags.size();y++){ TagInfo tag=(TagInfo)sortedTags.get(y); if(label.end<tag.pos){ sortedTags.add(y,new TagInfo(label.end,"</"+label.type+">",false)); added=true; break; } } if(!added){ sortedTags.add(new TagInfo(label.end,"</"+label.type+">",false)); } } // Create markedup StringBuffer StringBuffer buffer=new StringBuffer(); buffer.append("<root>"); int docPos=0,pos=0; while(sortedTags.size()>0){ TagInfo curTag=sortedTags.remove(0); if(curTag.pos<docSpan.size()){ if(curTag.isOpenTag) pos=docSpan.subSpan(curTag.pos,1).getTextToken(0).getLo(); else pos=docSpan.subSpan(curTag.pos,1).getTextToken(0).getHi(); }else pos=docString.length(); buffer.append(docString.substring(docPos,pos)); buffer.append(curTag.tag); docPos=pos; } buffer.append(docString.substring(docPos,docString.length())); buffer.append("</root>"); return buffer.toString(); } private class TagInfo{ public int pos; public String tag; public boolean isOpenTag; public TagInfo(int pos,String tag,boolean isOpenTag){ this.pos=pos; this.tag=tag; this.isOpenTag=isOpenTag; } } private class LabelInfo{ // public Span span; public String type; public int start; public int end; public LabelInfo(Span span,String type,int start,int end){ // this.span=span; this.type=type; this.start=start; this.end=end; } } // // Helper method used to maintain a set of tag boundaries // private void setBoundary(SortedMap<Span,Set<String[]>> boundaries, // String beginOrEnd,String type,Span s){ // Set<String[]> ops=boundaries.get(s); // if(ops==null) // boundaries.put(s,(ops=new HashSet<String[]>())); // ops.add(new String[]{beginOrEnd,type}); // } /** Save extracted data in an XML format */ public String saveTypesAsXML(TextLabels labels){ StringBuffer buf=new StringBuffer("<extractions>\n"); for(Iterator<String> i=labels.getTypes().iterator();i.hasNext();){ String type=i.next(); for(Iterator<Span> j=labels.instanceIterator(type);j.hasNext();){ Span s=j.next(); int lo=s.getTextToken(0).getLo(); int hi=s.getTextToken(s.size()-1).getHi(); buf.append(" <"+type+" lo="+lo+" hi="+hi+">"+s.asString()+"</"+type+ ">\n"); } } buf.append("</extractions>\n"); return buf.toString(); } }