package edu.cmu.minorthird.text; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import edu.cmu.minorthird.util.ProgressCounter; /** * Configurable Text Loader. * <p> * Usage: Configure a loader object using the constructors. Call .load(File) * with the file object to your data (could be a directory) load(File) returns * the TextBase object for the data. * <p> * * <pre> * Default: * TextBaseLoader tbl = new TextBaseLoader(); * Loads One Document per File and uses embedded labels * ------------------------------------------------------ * Specify Document Style * TextBaseLoader tbl = new TextBaseLoader(TextBaseLoader.DOC_PER_LINE); // Loads One document per line * TextBaseLoader tbl = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE); // Loads One document per file * ------------------------------------------------------ * Specify document type and whether to use embedded Labels * // ex: Loads one doc per line and ignores embedded labels * TextBaseLoader tbl = new TextBaseLoader(TextBaseLoader.DOC_PER_LINE, false); * ------------------------------------------------------ * Specify document type and whether to use embedded Labels * // ex: Loads one doc per file, uses embedded labels, and recurses directories * TextBaseLoader tbl = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE, true, true); * <p> * In ALL cases use: * tbl.load(FILE); * </pre> * * @author William Cohen * @author Kevin Steppe * @author Cameron Williams * @author Quinten Mercer */ public class TextBaseLoader{ // style/location for IDs, groupID, Category of doc // Kept to support the old TextBaseLoader api public static final int NONE=0; // could be given as a param at some point public static final int DIRECTORY_NAME=1; public static final int FILE_NAME=2; public static final int IN_FILE=3; // document style public static final int DOC_PER_LINE=0; public static final int DOC_PER_FILE=1; // XML tags public static final boolean USE_XML=true; public static final boolean IGNORE_XML=false; // Parameters for loading // One document per line in a file or One document per file private int documentStyle=DOC_PER_FILE; // tagging -- whether to use embedded XML tags private boolean use_markup=USE_XML; // recursion -- if loading from a directory should subdirectories be loaded // too? private boolean recurseDirectories=false; // internal structure private static Logger log=Logger.getLogger(TextBaseLoader.class); private int closurePolicy=TextLabelsLoader.CLOSE_ALL_TYPES; private List<StackEntry> stack; // xml tag stack // saves labels associated with last set of files loaded private MutableTextLabels labels; private MutableTextBase textBase; // --------------------- Constructors // ----------------------------------------------------- /** * Default constructor. It will load each file as a single document, use XML * markup, and NOT recurse recurse. */ public TextBaseLoader(){ } /** * Specifies the document style to use, but leaves all other properties to * their defaults. */ public TextBaseLoader(int documentStyle){ this.documentStyle=documentStyle; } public TextBaseLoader(int documentStyle,boolean use_markup){ this.documentStyle=documentStyle; this.use_markup=use_markup; } public TextBaseLoader(int documentStyle,boolean use_markup, boolean recurseDirectories){ this.documentStyle=documentStyle; this.use_markup=use_markup; this.recurseDirectories=recurseDirectories; } // --------------------- Constructors // ----------------------------------------------------- // --------------------- Public methods // --------------------------------------------------- /** * Load data from the given location according to configuration and whether * location is a directory or not * * Calling load a second time will load into the same text base (thus the * second call returns documents from both the first and second locations). * Use setTextBase(null) to reset the text base. * * * @param dataLocation * File representation of location (single file or directory) * @return the loaded TextBase * @throws IOException - * problem reading the file * @throws ParseException - * problem with xml of internal tagging */ public MutableTextBase load(File dataLocation) throws IOException, ParseException{ // Create new TextBase and TextLabels to hold the data this.textBase=new BasicTextBase(); this.labels=new BasicTextLabels(this.textBase); // check whether it's a dir or single dataLocation if(dataLocation.isDirectory()) loadDirectory(dataLocation); else loadFile(dataLocation); return textBase; } /** * Load data from the given location according to configuration and whether * location is a directory or not * * Calling load a second time will load into the same text base (thus the * second call returns documents from both the first and second locations). * Use setTextBase(null) to reset the text base. * * * @param dataLocation * File representation of location (single file or directory) * @return the loaded TextBase * @throws IOException - * problem reading the file * @throws ParseException - * problem with xml of internal tagging */ public MutableTextBase load(File dataLocation,Tokenizer tok) throws IOException,ParseException{ // Create new TextBase and TextLabels to hold the data this.textBase=new BasicTextBase(tok); this.labels=new BasicTextLabels(this.textBase); // check whether it's a dir or single dataLocation if(dataLocation.isDirectory()) loadDirectory(dataLocation); else loadFile(dataLocation); return textBase; } /** * Load a document where each word has it's own line and is follwed by three * desscriptor words. The first item on each line is a word, the second a * part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth * the named entity tag. */ public MutableTextBase loadWordPerLineFile(File file) throws IOException, FileNotFoundException{ // Create the new TextBase and TextLabels that will contain this data. this.textBase=new BasicTextBase(new SplitTokenizer(" ")); labels=new BasicTextLabels(this.textBase); // Buffer to temporarily hold the contents of each doc read in. StringBuffer buf=new StringBuffer(""); // Each doc in the file needs a unique documentId String id=file.getName(); int docNum=1; String curDocID=id+"-"+docNum; // Lists of spans and properties that are included in the data file List<CharSpan> spanList=new ArrayList<CharSpan>(); List<String> tokenPropList=new ArrayList<String>(); // Read in the file line by line String line; LineNumberReader in=new LineNumberReader(new FileReader(file)); int start=0,end=0; while((line=in.readLine())!=null){ String[] words=line.split("\\s"); // If we're in the middle of a doc, just keep adding to its buffer if(!(words[0].equals("-DOCSTART-"))){ if(words.length>2){ start=buf.length(); buf.append(words[0]+" "); end=buf.length()-1; tokenPropList.add(words[1]); if(!words[3].equals("O")) spanList.add(new CharSpan(start,end,words[3],curDocID)); } } // Otherwise we're at the end of a doc, so add it to the TextBase and // continue else{ // Add the finished doc to the TextBase addDocument(buf.toString(),curDocID,spanList,tokenPropList); // Clear out the doc info variables spanList.clear(); tokenPropList.clear(); buf=new StringBuffer(""); // Increment the document id. docNum++; curDocID=id+"-"+docNum; } } in.close(); return this.textBase; } /** * Sets whether the loader should use or ignore XML markup in the files. <br> * <br> * Valid values are: TextBaseLoader.IGNORE_XML and TextBaseLoader.USE_XML */ public void setLabelsInFile(boolean b){ this.use_markup=b; } /** * Sets the document style for loaded documents. <br> * <br> * Valid styles are: TextBaseLoader.DOC_PER_LINE and * TextBaseLoader.DOC_PER_FILE */ public void setDocumentStyle(int style){ this.documentStyle=style; } /** Sets whether the loader should recurse directories when loading docs. */ public void setRecurseDirectories(boolean rec){ this.recurseDirectories=rec; } /** get labeling generated by tags in data file */ public MutableTextLabels getLabels(){ return labels; } // ---------------Old Methods kept to support old // api------------------------------- // WARNING: These are all deprecated. How long have they been this way, should // we delete them? // /** // * One document per file in a directory, labels are embedded in the data as // * xml tags NB: Don't use this if the data isn't labbed - it will remove // * things that look like <just a note> which could cause problems. // * // * Returns the TextLabels object, the textbase is embedded // * // * @deprecated // */ // public static MutableTextLabels loadDirOfTaggedFiles(File dir) // throws ParseException,IOException{ // TextBaseLoader loader=new TextBaseLoader(DOC_PER_FILE,true); // loader.load(dir); // // return loader.getLabels(); // } // // /** @deprecated */ // public void loadTaggedFiles(TextBase base,File dir) throws IOException, // FileNotFoundException{ // try{ // TextBaseLoader loader=new TextBaseLoader(DOC_PER_FILE,true); // loader.load(dir); // }catch(Exception e){ // e.printStackTrace(); // } // } // // /** @deprecated */ // public static TextBase loadDocPerLine(File file,boolean hasGroupID) // throws ParseException,IOException{ // try{ // TextBaseLoader loader=new TextBaseLoader(DOC_PER_LINE); // return loader.load(file); // }catch(Exception e){ // e.printStackTrace(); // } // return null; // } // --------------------- Public methods // --------------------------------------------------- // --------------------- Private methods // -------------------------------------------------- private void loadDirectory(File directory) throws IOException,ParseException{ // loop on files in directory or loop on directories? File[] files=directory.listFiles(); Arrays.sort(files); if(files==null) throw new IllegalArgumentException("can't list directory "+ directory.getName()); ProgressCounter pc= new ProgressCounter("loading directory "+directory.getName(),"file", files.length); for(int i=0;i<files.length;i++){ // skip CVS directories if("CVS".equals(files[i].getName())) continue; if(files[i].isDirectory()&&this.recurseDirectories) loadDirectory(files[i]); if(files[i].isFile()) loadFile(files[i]); pc.progress(); } pc.finished(); } /** * Load the given single file according the current settings * * @param file * @throws IOException */ private void loadFile(File file) throws IOException,ParseException{ log.debug("loadFile: "+file.getName()); // build the correct reader BufferedReader in; if(documentStyle==DOC_PER_LINE){ in=new LineNumberReader(new FileReader(file)); }else{ in=new BufferedReader(new FileReader(file)); } // set the docid String curDocID=file.getName(); // list of labeled spans if internally tagged List<CharSpan> spanList=new ArrayList<CharSpan>(); // Clear the xml tag stack stack=new ArrayList<StackEntry>(); // loop through the file StringBuffer buf=new StringBuffer(); while(in.ready()){ // in.ready may cause problems on Macintosh String line=in.readLine(); // BUG: THIS METHOD ADDS BLANK LINES AS DOCS FOR DOC_PER_LINE STYLE FILES // appends to the buffer internally if(this.use_markup){ line=labelLine(line,buf,curDocID,spanList); } // If this reader is set to create a doc for each line then add the doc // now if(this.documentStyle==DOC_PER_LINE){ if(line.trim().length()>0){ curDocID= file.getName()+"@line:"+((LineNumberReader)in).getLineNumber(); addDocument(line,curDocID,spanList,null); buf=new StringBuffer(); spanList.clear(); } } // Otherwise add the line to the buffer and continue reading else{ if(!this.use_markup){ buf.append(line); buf.append("\n"); // need line feed } } } if(this.documentStyle==DOC_PER_FILE) addDocument(buf.toString(),curDocID,spanList,null); in.close(); } /** * Add this text to the textBase as a new document, including group id and * categorization * * @param docText * String version of text */ private void addDocument(String docText,String documentId,List<CharSpan> spans, List<String> tokenProps){ // Blank documents are dropped if(docText.length()==0){ log .warn("Text for document "+documentId+ " is length zero or all white space, it will not be added to the text base."); return; } if(log.isDebugEnabled()) log.debug("add document "+documentId); // Add the document to the TextBase textBase.loadDocument(documentId,docText); // Now add all of the extracted spans to the labels set for(Iterator<CharSpan> j=spans.iterator();j.hasNext();){ CharSpan charSpan=j.next(); Span approxSpan; // = // textBase.documentSpan(documentId).subSpan(charSpan.lo, // charSpan.hi-charSpan.lo-1); boolean flag=false; for(int i=charSpan.lo;i<charSpan.hi;i++){ if(docText.charAt(i)!=' '&&docText.charAt(i)!='\n') flag=true; } if(flag) approxSpan= textBase.documentSpan(documentId).charIndexSubSpan(charSpan.lo, charSpan.hi); else approxSpan= textBase.documentSpan(documentId).charIndexSubSpan(charSpan.lo, charSpan.hi).getLeftBoundary(); if(log.isDebugEnabled()){ int hi=charSpan.hi; if(hi>docText.length()) hi=docText.length(); log.debug("approximating "+charSpan.type+" span '"+ docText.substring(charSpan.lo,hi)+"' with token span '"+approxSpan); } labels.addToType(approxSpan,charSpan.type); } // Next add all extracted token properties to the labels set if(tokenProps!=null&&tokenProps.size()>0){ Document doc=textBase.getDocument(documentId); TextToken[] tokens=doc.getTokens(); Iterator<String> itr=tokenProps.iterator(); if(tokens.length>0){ for(int x=0;x<tokens.length;x++){ String nextPOS=itr.next(); if(nextPOS!=null&&tokens[x]!=null){ labels.setProperty(tokens[x],"POS",nextPOS); } } } } // Close the labels set new TextLabelsLoader().closeLabels(labels,closurePolicy); } /** * Takes a single line of text. Uses the markupPattern field to remove * labelings (must be xml styled). These labelling are added to the span list * * @param line - * String of a single line to have it's labels parsed * @param spanList - * List of span labelings * @return a String with the labelings removed * @throws ParseException * improper xml format will cause a parse exception */ protected String labelLine(String line,StringBuffer docBuffer,String docId, List<CharSpan> spanList) throws ParseException{ // stack of open tags if(stack==null) stack=new ArrayList<StackEntry>(); // Create the matcher to find any XML marked up tags Pattern markupPattern=Pattern.compile("</?([^ ><]+)( [^<>]+)?>"); Matcher matcher=markupPattern.matcher(line); int currentChar=0; while(matcher.find()){ String tag=matcher.group(1); boolean isOpenTag=!matcher.group().startsWith("</"); if(log.isDebugEnabled()){ log.debug("matcher.group='"+matcher.group()+"'"); log.debug("found '"+tag+"' tag ,open="+isOpenTag+", at "+ matcher.start()+" in:\n"+line); } // copy stuff up to tag into buffer docBuffer.append(line.substring(currentChar,matcher.start())); currentChar=matcher.end(); if(isOpenTag){ stack.add(new StackEntry(docBuffer.length(),tag)); }else{ // pop the corresponding open off the stack StackEntry entry=null; for(int j=stack.size()-1;j>=0;j--){ entry=stack.get(j); if(tag.equals(entry.markupTag)){ stack.remove(j); break; } } if(entry==null) throw new ParseException( "close '"+tag+"' tag with no open in "+docId,0); if(!tag.equals(entry.markupTag)) throw new ParseException("close '"+tag+"' tag paired with open '"+ entry.markupTag+"'",entry.index); if(log.isDebugEnabled()){ log.debug("adding a "+tag+" span from "+entry.index+" to "+ docBuffer.length()+": '"+docBuffer.substring(entry.index)+"'"); } // spanList.add( new CharSpan(entry.index, docBuffer.length()-1, tag) ); spanList.add(new CharSpan(entry.index,docBuffer.length(),tag,docId)); } } // append stuff from end of last tag to end of line into the buffer docBuffer.append(line.substring(currentChar,line.length())); // BUG: THIS IS CAUSING BLANK LINES IN FILES TO BE ADDED AS DOCUMENTS WHEN // LOADED in DOC_PER_LINE FORMAT // HOWEVER, SIMPLY REMOVING IT BREAKS BASIC FUNCTIONALITY docBuffer.append("\n"); return docBuffer.toString(); } private class StackEntry{ public int index; public String markupTag; public StackEntry(int index,String markupTag){ this.index=index; this.markupTag=markupTag; } } private class CharSpan{ public int lo,hi; String type; // String docID; public CharSpan(int lo,int hi,String type,String docID){ this.lo=lo; this.hi=hi; this.type=type; // this.docID=docID; } } // --------------------- End Private methods // -------------------------------------------------- }