package edu.cmu.minorthird.text; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; /** * Manages the mappings between TextBases. * * This class maintains a mapping of names to instances of TextBase. All of the TextBases in the * mapping are derived from the "root" level TextBase that was added first. Currently there are * two ways to derive a new TextBase from an existing one: {@link #filter(String, TextLabels, String, String) filter} * and {@link #retokenize(Tokenizer, String, String) retokenize}. * * * @author Quinten Mercer */ public class TextBaseManager{ private Map<String,TextBaseEntry> textBases=new HashMap<String,TextBaseEntry>(); private Map<String,TextBaseMapper> textBaseMappers=new HashMap<String,TextBaseMapper>(); /** * Creates a new TextBaseManager using the specified textbase as the root textbase * and "root" as the name to identify it. */ public TextBaseManager(TextBase rootBase){ textBases.put("root",new TextBaseEntry("root",rootBase,0,null)); } /** * Creates a new TextBaseManager using the specified textbase as the root textbase and * the specified name is used in place of "root" to identify it. */ public TextBaseManager(String rootBaseName,TextBase rootBase){ textBases.put(rootBaseName,new TextBaseEntry(rootBaseName,rootBase,0,null)); } /** Returns a boolean indicating whether or not this manager has a level with the specified name */ public boolean containsLevel(String levelName){ return textBases.containsKey(levelName); } /** Returns the textbase identified by name. */ public TextBase getTextBase(String name){ TextBaseEntry entry=textBases.get(name); return entry.getTextBase(); } /** * Adds a textbase to the manager that is a child of parentName. Null parentName * creates a new root textbase. Note that a single manager can maintain multiple * sets of textbases by adding multiple root textbases. */ private void addTextBase(String parentName,String childName, TextBase childTextBase,TextBaseMapper mapper){ TextBaseEntry parentEntry=null; // Make sure that there is not a textbase being managed with the desired child name. if(textBases.get(childName)!=null) throw new IllegalArgumentException("TextBase already exists with name: "+ childName); // Get the entry of the parent if(parentName!=null) parentEntry=textBases.get(parentName); // Add the new text base and it's mapper to the store of text bases and mappers if(parentEntry!=null){ // There's a parent so create a child level int parentLevel=parentEntry.getLevel(); textBases.put(childName,new TextBaseEntry(childName,childTextBase, parentLevel+1,parentEntry)); textBaseMappers.put(childName,mapper); }else{ // There is no parent so create a root level. textBases .put(childName,new TextBaseEntry(childName,childTextBase,0,null)); textBaseMappers.put(childName,mapper); } } /** * Sometimes you may not have a source span, but rather only have a char offset in the source doc. There * are two scenarios where this could happen. First, it may be the case that you really just want to map * some char offset of an existing document. In this case this method will simply get the documentSpan * for the doc, use Span.charIndexSubSpan to create a span to map, and then forward the call to the * getMatchingSpan method that takes a source Span instance. The other situation is where you may need to * map sequences of chars before the document is actually in a TextBase. For instance, FilterTokenizer * needs to map char sequences in order to tokenize a document. This works because you can create maps * between documents in two text bases even if the destination document doesn't yet exist in the TextBase. * To make it happed, this method first maps the char offset to a span in it's parent, then calls * getMatchingSpan to propagate the mapping down to the destination textbase. */ public Span getMatchingSpan(String srcName,String srcDocId,int srcOffset, int length,String dstName){ TextBaseEntry srcEntry=textBases.get(srcName); if(srcEntry==null) throw new IllegalArgumentException("There is no text base named: "+ srcName+" in this manager."); // First try to get the document span for the source document Span srcDocSpan=srcEntry.getTextBase().documentSpan(srcDocId); if(srcDocSpan!=null){ return this.getMatchingSpan(srcDocSpan.charIndexSubSpan(srcOffset, srcOffset+length),srcName,dstName); } // If the document is unavailable then get the mapper that maps between the source and dest text bases. TextBaseMapper mapper= textBaseMappers.get(srcEntry.getName()); if(mapper==null) return null; // Get the mapping for the char index sequence in the source document to its parent. TextBaseMapper.MapEntry mapping= mapper.getChildMapping(srcDocId,srcOffset,length); // If no mapping could be found just return null if(mapping==null) return null; // Extract the info from the mapping to create a span in the parent document String parentDocId=mapping.dstDocId; int parentOffset=mapping.dstOffset+(srcOffset-mapping.srcOffset); // Get the span in the parent document that corresponds to this char index sequence Span parentSpan= srcEntry.getParent().getTextBase().documentSpan(parentDocId) .charIndexSubSpan(parentOffset,parentOffset+length); // Finally, map this span to the destination level using the normal mechanisms. return this.getMatchingSpan(parentSpan,srcEntry.getParent().getName(), dstName); //Span srcSpan = srcEntry.getTextBase().documentSpan(srcDocId).charIndexSubSpan(srcOffset, srcOffset+length); //return this.getMatchingSpan(srcSpan, srcName, dstName); } /** * Finds a mapping path from the source text base to the destination textbase and translates * the specified span through each successive mapping until the coresponding span in the * destination text base is located. */ public Span getMatchingSpan(Span span,String srcName,String dstName){ TextBaseEntry srcEntry=textBases.get(srcName); TextBaseEntry dstEntry=textBases.get(dstName); if(srcEntry==null) throw new IllegalArgumentException("There is no text base named: "+ srcName+" in this manager."); if(dstEntry==null) throw new IllegalArgumentException("There is no text base named: "+ dstName+" in this manager."); if(srcEntry.getTextBase().getDocument(span.getDocumentId())==null) throw new IllegalArgumentException( "The document that the specified span refers to is not in the source text base."); // Lists to store the path from both text bases to one that is common between them List<TextBaseMapper> srcMapperList=new ArrayList<TextBaseMapper>(); List<TextBaseMapper> dstMapperList=new ArrayList<TextBaseMapper>(); // Generate a path of mappers that links from the src text base to the dst text base TextBaseEntry currSrcEntry=textBases.get(srcName); TextBaseEntry currDstEntry=textBases.get(dstName); while(currSrcEntry.getLevel()!=currDstEntry.getLevel()){ if(currSrcEntry.getLevel()>currDstEntry.getLevel()){ srcMapperList.add(textBaseMappers.get(currSrcEntry.getName())); currSrcEntry=currSrcEntry.getParent(); }else{ dstMapperList.add(textBaseMappers.get(currDstEntry.getName())); currDstEntry=currDstEntry.getParent(); } } while(currSrcEntry!=currDstEntry){ srcMapperList.add(textBaseMappers.get(currSrcEntry.getName())); currSrcEntry=currSrcEntry.getParent(); dstMapperList.add(textBaseMappers.get(currDstEntry.getName())); currDstEntry=currDstEntry.getParent(); } // Now follow that path from src to dst mapping the span to each intermediate text base // until we ultimately end up with the span in the dst text base. If at anytime we // encounter a null value for a mapped span, this indicates that there is no mapping // for this span between the source and destination text bases so return null Span matchingSpan=span; Iterator<TextBaseMapper> srcIterator=srcMapperList.iterator(); while(srcIterator.hasNext()){ TextBaseMapper currMapper=srcIterator.next(); matchingSpan=currMapper.getMappedParentSpan(matchingSpan); if(matchingSpan==null) return null; } Iterator<TextBaseMapper> dstIterator=dstMapperList.iterator(); while(dstIterator.hasNext()){ TextBaseMapper currMapper=dstIterator.next(); matchingSpan=currMapper.getMappedChildSpan(matchingSpan); if(matchingSpan==null) return null; } return matchingSpan; } /** * Creates a new TextBase named newLevelName from an existing TextBase named parentLevelName. This * new TextBase has the exact same document set as the parent, but all the docs will be retokenized * using the specified Tokenizer. */ public MutableTextBase retokenize(Tokenizer newTokenizer, String parentLevelName,String newLevelName){ TextBaseEntry parentEntry=textBases.get(parentLevelName); if(parentEntry==null) throw new IllegalArgumentException("There is no text base named: "+ parentLevelName+" in this manager."); BasicTextBase newTextBase=new BasicTextBase(newTokenizer); TextBaseMapper newMapper= new TextBaseMapper(parentEntry.getTextBase(),newTextBase); addTextBase(parentLevelName,newLevelName,newTextBase,newMapper); Iterator<Span> docsLooper=textBases.get(parentLevelName).getTextBase().documentSpanIterator(); while(docsLooper.hasNext()){ Span currDocSpan=docsLooper.next(); newTextBase.loadDocument(currDocSpan.getDocumentId(),currDocSpan .getDocumentContents()); // Retokenizing does NOT change the underlying document structure so all we need to do is add a single // map entry that maps position 0 from the parent text base to position 0 in the child text base. Also // the documentIds don't change in the new textbase. newMapper.mapPlace(currDocSpan.getDocumentId(),0,currDocSpan .getDocumentId(),0); } return newTextBase; } /** * Creates a new TextBase named newLevelName from an existing TextBase named parentLevelName. This * new TextBase will contain a document for each instance of the provided spanType in the parent * TextBase (specified by parentLabels). For example if a document in the parent TextBase has 3 * instances of the specified spanType, then the new TextBase will have 3 separate documents. All * text that is not part of the specified spanType is filtered out and does not appear in the * new TextBase anywhere. */ public TextBase filter(String parentLevelName,TextLabels parentLabels, String newLevelName,String spanType){ BasicTextBase newTextBase= new BasicTextBase( new FilterTokenizer(this,newLevelName,parentLevelName)); TextBaseMapper newMapper= new TextBaseMapper(parentLabels.getTextBase(),newTextBase); addTextBase(parentLevelName,newLevelName,newTextBase,newMapper); Iterator<Span> typeInstances=parentLabels.instanceIterator(spanType); String prevDocId=""; //useful for checking whether the next span is in the same doc int docNum=0; //counts how many spans have the type in each document while(typeInstances.hasNext()){ Span currInstance=typeInstances.next(); String curDocId=currInstance.getDocumentId(); // This code assumes that the TextBase.instanceIterator method returns the spans ordered // by document ID. This method makes NO guarantee that this will be true. if(curDocId.equals(prevDocId)) docNum++; else docNum=0; String newDocID="childTB"+docNum+"-"+curDocId; // Map the doc span in the old text base to the correct document in the new text base. No offset // is required in the new doc since it we are just chopping up the original doc into pieces. newMapper.mapPlace(curDocId,currInstance.getLoChar(),newDocID,0); prevDocId=curDocId; String newDocText=currInstance.asString(); int startIndex=currInstance.getLoChar(); newTextBase.loadDocument(newDocID,newDocText,startIndex); } return newTextBase; } // // Used internally to help manage the set of TextBases // private class TextBaseEntry{ private String entryName; private TextBase textBase; private TextBaseEntry parent; private int level; public TextBaseEntry(String newEntryName,TextBase newTextBase,int newLevel, TextBaseEntry newParent){ entryName=newEntryName; textBase=newTextBase; level=newLevel; parent=newParent; } public String getName(){ return entryName; } public TextBase getTextBase(){ return textBase; } public int getLevel(){ return level; } public TextBaseEntry getParent(){ return parent; } } // // Used internally to create the map between two textBases. // private class TextBaseMapper{ private TextBase parent; private TextBase child; private Map<String,SortedSet<MapEntry>> parentToChildMap; private Map<String,SortedSet<MapEntry>> childToParentMap; public TextBaseMapper(TextBase parent,TextBase child){ this.parent=parent; this.child=child; this.parentToChildMap=new HashMap<String,SortedSet<MapEntry>>(); this.childToParentMap=new HashMap<String,SortedSet<MapEntry>>(); } /** * Adds a mapping between two documents. This has the effect of mapping a point in the parent * document to a point in the child document (and vice versa). However, it is assumed that all * following characters up to the next mapped point are also mapped in order. * * For instance: Say the parent document is 20 characters long and there are two children docs * each of which is 10 characters long. If there are mappings from parent:0 to child1:0 and * from parent:11 to child2:0, then what we really have is a mapping of the first 10 chars of the * parent to the first 10 chars in child1 and a mapping of the last 10 chars in parent to the * first 10 chars in child2. */ public void mapPlace(String parentDocId,int parentOffset,String childDocId, int childOffset){ SortedSet<MapEntry> parentEntry=parentToChildMap.get(parentDocId); if(parentEntry==null){ parentEntry=new TreeSet<MapEntry>(); parentToChildMap.put(parentDocId,parentEntry); } parentEntry.add(new MapEntry(parentDocId,parentOffset,childDocId, childOffset)); SortedSet<MapEntry> childEntry=childToParentMap.get(childDocId); if(childEntry==null){ childEntry=new TreeSet<MapEntry>(); childToParentMap.put(childDocId,childEntry); } childEntry.add(new MapEntry(childDocId,childOffset,parentDocId, parentOffset)); } /** * Gets the MapEntry for the parent TextBase that includes the position listed in parentOffset */ public MapEntry getParentMapping(String parentDocId,int parentOffset, int length){ SortedSet<MapEntry> parentDocMap=parentToChildMap.get(parentDocId); if(parentDocMap==null) throw new IllegalArgumentException( "Document containing parent char sequence has no mappings."); // Iterate through this document's map entries until we find the entry that contains the entire parent span. // If there is no entry that contains the parent span, then give an error. The entry is found by finding the // first entry whose offset is greater than both the start and end of the parent, then the previous entry has // the info we need. Iterator<MapEntry> it=parentDocMap.iterator(); MapEntry curr=null,parentEntry=null; while(it.hasNext()){ curr=it.next(); // If the current entry is before the start of the parent span update the parentEntry if(curr.srcOffset<=parentOffset){ parentEntry=curr; }else if(curr.srcOffset<(parentOffset+length)){ return null; } } return parentEntry; } /** * Gets the MapEntry for the child TextBase that includes the position listed in childOffset */ public MapEntry getChildMapping(String childDocId,int childOffset,int length){ SortedSet<MapEntry> childDocMap=childToParentMap.get(childDocId); if(childDocMap==null) throw new IllegalArgumentException( "Document containing child char sequence has no mappings."); // Iterate through this document's map entries until we find the entry that contains the entire parent span. // If there is no entry that contains the parent span, then give an error. The entry is found by finding the // first entry whose offset is greater than both the start and end of the parent, then the previous entry has // the info we need. Iterator<MapEntry> it=childDocMap.iterator(); MapEntry curr=null,childEntry=null; while(it.hasNext()){ curr=it.next(); // If the current entry is before the start of the parent span update the childEntry if(curr.srcOffset<=childOffset){ childEntry=curr; }else if(curr.srcOffset<(childOffset+length)){ return null; } } return childEntry; } /** * Finds the span in the child TextBase that corresponds to the provided span in the parent TextBase. */ public Span getMappedChildSpan(Span parentSpan){ if(parent.getDocument(parentSpan.getDocumentId())==null) throw new IllegalArgumentException( "Document containing parent span not in the child text base of this mapper."); int parentLo=parentSpan.getTextToken(0).getLo(); int parentHi=parentSpan.getTextToken(parentSpan.size()-1).getHi(); MapEntry parentEntry= this.getParentMapping(parentSpan.getDocumentId(),parentLo,parentHi- parentLo); // If no approptiate entry was found that maps the parent span, then there is no mapping for this // span between these two text bases so just return null. if(parentEntry==null) return null; // Otherwise compute the index offsets for the new (mapperd) span as follows: // lo index: the mapped offset (destination) from the entry return child.documentSpan(parentEntry.dstDocId).charIndexSubSpan( parentEntry.dstOffset+(parentLo-parentEntry.srcOffset), parentEntry.dstOffset+(parentHi-parentEntry.srcOffset)); } /** * Finds the span in the parent TextBase that corresponds to the provided span in the child TextBase. */ public Span getMappedParentSpan(Span childSpan){ if(child.getDocument(childSpan.getDocumentId())==null) throw new IllegalArgumentException( "Document containing child span not in the parent text base of this mapper."); int childLo=childSpan.getTextToken(0).getLo(); int childHi=childSpan.getTextToken(childSpan.size()-1).getHi(); MapEntry childEntry= this.getChildMapping(childSpan.getDocumentId(),childLo,childHi- childLo); // If no approptiate entry was found that maps the parent span, then there is no mapping for this // span between these two text bases so just return null. if(childEntry==null) return null; // Otherwise compute the index offsets for the new (mapped) span as follows: // lo index: the mapped offset (destination) from the entry return parent.documentSpan(childEntry.dstDocId).charIndexSubSpan( childEntry.dstOffset+(childLo-childEntry.srcOffset), childEntry.dstOffset+(childHi-childEntry.srcOffset)); } /** * Used for debugging purposes. */ // public void printMap(){ // System.out // .println("****************************************************"); // System.out.println("*** Mapper Between Parent: "+parent+" and Child: "+ // child+" ***"); // System.out // .println("*** ***"); // System.out // .println("*** Parent To Child mappings: ***"); // // Iterator<String> keyIterator=parentToChildMap.keySet().iterator(); // while(keyIterator.hasNext()){ // String currKey=keyIterator.next(); // SortedSet<MapEntry> currDocMapings=parentToChildMap.get(currKey); // Iterator<MapEntry> mappingsIterator=currDocMapings.iterator(); // while(mappingsIterator.hasNext()){ // System.out.println("*** "+mappingsIterator.next()+" ***"); // } // } // System.out // .println("*** ***"); // System.out // .println("*** Child To Parent mappings: ***"); // // keyIterator=childToParentMap.keySet().iterator(); // while(keyIterator.hasNext()){ // String currKey=keyIterator.next(); // SortedSet<MapEntry> currDocMapings=childToParentMap.get(currKey); // Iterator<MapEntry> mappingsIterator=currDocMapings.iterator(); // while(mappingsIterator.hasNext()){ // System.out.println("*** "+mappingsIterator.next()+" ***"); // } // } // System.out // .println("****************************************************\n\n"); // } /** * A mapping of an offset between documents. This is used by {@link edu.cmu.minorthird.text.TextBaseManager TextBaseManager} * to map spans from one TextBase to one that was derived from it. */ public class MapEntry implements Comparable<MapEntry>{ public String srcDocId; public int srcOffset; public String dstDocId; public int dstOffset; public MapEntry(String sid,int sos,String did,int dos){ srcDocId=sid; srcOffset=sos; dstDocId=did; dstOffset=dos; } @Override public int compareTo(MapEntry o){ int res=srcDocId.compareTo(o.srcDocId); if(res==0) res=srcOffset-o.srcOffset; return res; } @Override public String toString(){ return srcDocId+":"+srcOffset+" -> "+dstDocId+":"+dstOffset; } } } }