/** DataCruncher.java * * @author Sunita Sarawagi * @since 1.0 * @version 1.3 */ package iitb.Segment; import iitb.CRF.DataSequence; import iitb.CRF.Segmentation; import java.io.BufferedReader; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.StringTokenizer; import java.util.Vector; import java.util.regex.Pattern; class DCTrainRecord implements TrainRecord, Segmentation { private static final long serialVersionUID = -3412644222368304767L; int[] ls; String[][] _tokens; int[] labelsPerToken; int[] snum, spos; DCTrainRecord(int[] ts, String[][] toks) { ls = ts; _tokens = toks; spos = new int[ls.length]; int len = 0; for (int i = 0; i < numSegments(); i++) { spos[i] = len; len+= _tokens[i].length; } labelsPerToken = new int[len]; snum = new int[len]; int pos = 0; for (int i = 0; i < ls.length; i++) { for (int p = 0; p < _tokens[i].length; p++) { snum[pos] = i; labelsPerToken[pos++] = ls[i]; } } } public int[] labels() { return ls; } public void set_y(int i, int l) {labelsPerToken[i] = l;} // not applicable for training data. public int length() {return labelsPerToken.length;} public Object x(int i) {return _tokens[snum[i]][i - spos[snum[i]]];} public int y(int i) {return labelsPerToken[i];} public int numSegments() { return ls.length; } public int numSegments(int l) { int sz = 0; for (int i = 0; i < ls.length; i++) if (ls[i] == l) sz++; return sz; } public String[] tokens(int snum) { return _tokens[snum]; } public String[] tokens(int l, int p) { int pos = 0; for (int i = 0; i < ls.length; i++) if (ls[i] == l) { if (pos == p) return _tokens[i]; pos++; } return null; } /* (non-Javadoc) * @see iitb.CRF.SegmentDataSequence#getSegmentEnd(int) */ public int getSegmentEnd(int segmentStart) { for (int i = segmentStart+1; i < length(); i++) { if (y(i)!= y(segmentStart)) return i-1; } return length()-1; } /* (non-Javadoc) * @see iitb.CRF.SegmentDataSequence#setSegment(int, int, int) */ public void setSegment(int segmentStart, int segmentEnd, int y) { for (int i = segmentStart; i <= segmentEnd; i++) set_y(i,y); assert(false); } public int getSegmentId(int offset) { return snum[offset]; } public int segmentEnd(int segmentNum) { return segmentStart(segmentNum) + _tokens[segmentNum].length-1; } public int segmentLabel(int segmentNum) { return ls[segmentNum]; } public int segmentStart(int segmentNum) { return spos[segmentNum]; } }; class DCTrainData implements TrainData { ArrayList<DCTrainRecord> trainRecs; int pos; DCTrainData(ArrayList<DCTrainRecord> trs) { trainRecs = trs; } public int size() { return trainRecs.size(); } public void startScan() { pos = 0; } public TrainRecord nextRecord() { return (TrainRecord)trainRecs.get(pos++); } public boolean hasMoreRecords() { return (pos < size()); } public boolean hasNext() { return hasMoreRecords(); } public DataSequence next() { return nextRecord(); } }; class TestData { BufferedReader rin; String line; String seq[]; String fname; String delimit, impDelimit; TestData(String file,String delimitP,String impDelimitP, String grpDelimit) { try { fname = file; rin =new BufferedReader(new FileReader(file+".raw")); delimit = delimitP; impDelimit = impDelimitP; } catch(IOException e) { System.out.println("I/O Error"+e); System.exit(-1); } } void startScan() { try { rin =new BufferedReader(new FileReader(fname+".raw")); } catch(IOException e) { System.out.println("I/O Error"+e); System.exit(-1); } } int[] groupedTokens() { /* if (grp == null) return null; return grp.groupingArray(seq.length); */ return null; } String[] nextRecord() { try { if ((line=rin.readLine())!=null) { StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true); int len = tok.countTokens(); if ((seq == null) || (seq.length < len)) seq =new String[len]; int count=0; for(int i=0 ; i<len; i++) { String tokStr=tok.nextToken(); if (delimit.indexOf(tokStr)==-1 || impDelimit.indexOf(tokStr)!=-1) { seq[count++]=new String(tokStr); } } String aseq[]=new String[count]; for(int i=0 ; i<count ; i++) { aseq[i]=seq[i]; } return aseq; } else { rin.close(); return null; } } catch(IOException e) { System.out.println("I/O Error"+e); System.exit(-1); } return null; } }; class TestDataWrite { PrintWriter out; BufferedReader rin; String outputBuffer; String rawLine; String delimit, tagDelimit, impDelimit; LabelMap labelmap; TestDataWrite(String outfile,String rawfile,String delimitP,String tagDelimitP,String impDelimitP, LabelMap linfo) { try { labelmap = linfo; out=new PrintWriter(new FileOutputStream(outfile+".tagged")); rin=new BufferedReader(new FileReader(rawfile+".raw")); outputBuffer=new String(); delimit = delimitP; tagDelimit = tagDelimitP; impDelimit = impDelimitP; } catch(IOException e) { System.err.println("I/O Error"+e); System.exit(-1); } } void writeRecord(int[] tok, int tokLen) { try { rawLine=rin.readLine(); StringTokenizer rawTok=new StringTokenizer(rawLine,delimit,true); String tokArr[]=new String[rawTok.countTokens()]; for(int j=0 ; j<tokArr.length ; j++) { tokArr[j]=rawTok.nextToken(); } int ptr=0; int t=tok[0]; for(int j=0 ; j<=tokLen ; j++) { if ((j < tokLen) && (t==tok[j])) { while(ptr<tokArr.length && delimit.indexOf(tokArr[ptr])!=-1 && impDelimit.indexOf(tokArr[ptr])==-1) { outputBuffer=new String(outputBuffer+tokArr[ptr]); ptr++; } if (ptr<tokArr.length) { outputBuffer=new String(outputBuffer+tokArr[ptr]); ptr++; } while(ptr<tokArr.length && delimit.indexOf(tokArr[ptr])!=-1 && impDelimit.indexOf(tokArr[ptr])==-1) { outputBuffer=new String(outputBuffer+tokArr[ptr]); ptr++; } } else { int revScanPtr=outputBuffer.length()-1; int goBackPtr=0; boolean foundOpenChar=false; while((revScanPtr >= 0) && (outputBuffer.charAt(revScanPtr)==' ' || outputBuffer.charAt(revScanPtr)=='(' || outputBuffer.charAt(revScanPtr)=='{' || outputBuffer.charAt(revScanPtr)=='[')) { char currChar=outputBuffer.charAt(revScanPtr); if (impDelimit.indexOf(currChar)!=-1) { break; } if (currChar=='{' || currChar=='[' || currChar=='(') { foundOpenChar=true; } revScanPtr--; goBackPtr++; } if (foundOpenChar) { outputBuffer=outputBuffer.substring(0,revScanPtr+1); ptr-=goBackPtr; } outputBuffer=new String(outputBuffer+tagDelimit+labelmap.revMap(t)); out.println(outputBuffer); outputBuffer=new String(); // out.println(tagDelimit+t); // System.out.println(tagDelimit+t); if (j < tokLen) { t=tok[j]; j--; } } } out.println(); } catch(IOException e) { System.err.println("I/O Error"+e); System.exit(-1); } } void close() { try { rin.close(); out.close(); } catch(IOException e) { System.err.println("I/O Error"+e); System.exit(-1); } } }; public class DataCruncher { /** * * @param text * @param delimit A set of delimiters used by the Tokenizer. * @param impDelimit Delimiters to be retained for tagging. * @return an Array of tokens. */ protected static String[] getTokenList(String text, String delimit, String impDelimit) { text = text.toLowerCase(); StringTokenizer textTok = new StringTokenizer(text, delimit, true); //This allocates space for all tokens and delimiters, //but will make a second pass through the String unnecessary. ArrayList<String> tokenList = new ArrayList<String>(textTok.countTokens()); while (textTok.hasMoreTokens()) { String tokStr = textTok.nextToken(); if (!delimit.contains(tokStr) || impDelimit.contains(tokStr)) { tokenList.add(tokStr); } } //Finally, the storage is trimmed to the actual size. return tokenList.toArray(new String[tokenList.size()]); } /** * Reads a block of text ended by a blank line or the end of the file. * The block contains lines of tokens with a label. * @param numLabels The maximal number of labels expected * @param tin * @param tagDelimit Separator between tokens and tag number * @param delimit Used to define token boundaries * @param impDelimit Delimiters to be retained for tagging * @param t Stores the labels * @param cArray Stores the tokens * @return number of lines read * @throws IOException */ public static int readRowVarCol(int numLabels, BufferedReader tin, String tagDelimit, String delimit, String impDelimit, int[] t, String[][] cArray) throws IOException { int ptr = 0; String line; while(true) { line = tin.readLine(); StringTokenizer firstSplit=null; if (line!=null) { firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit); } if ((line==null) || (firstSplit.countTokens()<2)) { // Empty Line return ptr; } String w = firstSplit.nextToken(); int label=Integer.parseInt(firstSplit.nextToken()); t[ptr] = label; cArray[ptr++] = getTokenList(w,delimit,impDelimit); } } static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit, String delimit, String impDelimit, int[] t, String[][] cArray, int labels[]) throws IOException { String line=tin.readLine(); if (line == null) return 0; StringTokenizer firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit,true); int ptr = 0; for (int i = 0; (i < labels.length) && firstSplit.hasMoreTokens(); i++) { int label = labels[i]; String w = firstSplit.nextToken(); if (tagDelimit.indexOf(w)!=-1) { continue; } else { if (firstSplit.hasMoreTokens()) // read past the delimiter. firstSplit.nextToken(); } if ((label > 0) && (label <= numLabels)) { t[ptr] = label; cArray[ptr++] = getTokenList(w,delimit,impDelimit); } } return ptr; } /** * Checks, if the data are available in fixed column format, or variable * column format. * @param numLabels The maximal number of labels expected * @param tin * @param tagDelimit A character as String that acts as a delimiter between tokens and label. * @return An array with labels if the data are in fixed column format, null otherwise. * @throws IOException */ protected static int[] readHeaderInfo(int numLabels, BufferedReader tin, String tagDelimit) throws IOException { tin.mark(1000); String line = tin.readLine(); if (line == null) { throw new IOException("Header row not present in tagged file"); } if (!line.toLowerCase().startsWith("fixed-column-format")) { tin.reset(); return null; } line = tin.readLine(); Pattern delimitPattern = Pattern.compile(tagDelimit, Pattern.LITERAL); String[] parts = delimitPattern.split(line); int labels[] = new int[numLabels]; for (int i = 0, size = parts.length; i < size; ++i) { labels[i] = Integer.parseInt(parts[i]); } return labels; } public static TrainData readTagged(int numLabels, String tfile, String rfile, String delimit, String tagDelimit, String impDelimit, LabelMap labelMap) { try { ArrayList<DCTrainRecord> td = new ArrayList<DCTrainRecord>(); BufferedReader tin = new BufferedReader(new FileReader(tfile + ".tagged")); BufferedReader rin = new BufferedReader(new FileReader(rfile + ".raw")); boolean fixedColFormat = false; String rawLine; StringTokenizer rawTok; int t[] = new int[0]; String cArray[][] = new String[0][0]; int[] labels = null; // read list of columns in the header of the tag file labels = readHeaderInfo(numLabels, tin, tagDelimit); if (labels != null) { fixedColFormat = true; } while ((rawLine = rin.readLine()) != null) { rawTok = new StringTokenizer(rawLine, delimit, true); int len = rawTok.countTokens(); if (len > t.length) { t = new int[len]; cArray = new String[len][0]; } int ptr = 0; if (fixedColFormat) { ptr = readRowFixedCol(numLabels, tin, tagDelimit, delimit, impDelimit, t, cArray, labels); } else { ptr = readRowVarCol(numLabels, tin, tagDelimit, delimit, impDelimit, t, cArray); } if (ptr == 0) { break; } int at[] = new int[ptr]; String[][] c = new String[ptr][0]; for (int i = 0; i < ptr; i++) { at[i] = labelMap.map(t[i]); c[i] = cArray[i]; } td.add(new DCTrainRecord(at, c)); } return new DCTrainData(td); } catch (IOException e) { System.err.println("I/O Error" + e); System.exit(-1); } return null; } public static void readRaw(Vector<String[]> data,String file,String delimit,String impDelimit) { try { BufferedReader rin=new BufferedReader(new FileReader(file+".raw")); String line; while((line=rin.readLine())!=null) { StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true); String seq[]=new String[tok.countTokens()]; int count=0; for(int i=0 ; i<seq.length ; i++) { String tokStr=tok.nextToken(); if (delimit.indexOf(tokStr)==-1 || impDelimit.indexOf(tokStr)!=-1) { seq[count++]=new String(tokStr); } } String aseq[]=new String[count]; for(int i=0 ; i<count ; i++) { aseq[i]=seq[i]; } data.add(aseq); } rin.close(); } catch(IOException e) { System.out.println("I/O Error"+e); System.exit(-1); } } /** * * @param file * @param tagDelimit A character as String that acts as a delimiter between tokens and label. */ public static void createRaw(String file, String tagDelimit) { BufferedReader in = null; PrintWriter out = null; try { in = new BufferedReader(new FileReader(file + ".tagged")); out = new PrintWriter(new FileOutputStream(file + ".raw")); String line; StringBuilder rawLine; rawLine = new StringBuilder(200); while((line=in.readLine())!=null) { StringTokenizer t=new StringTokenizer(line,tagDelimit); if(t.countTokens()<2) { out.println(rawLine); rawLine.setLength(0); } else { rawLine.append(" "); rawLine.append(t.nextToken()); } } out.println(rawLine); } catch (IOException e) { System.out.println("I/O Error" + e); System.exit(-1); } finally { if (in != null) { try { in.close();} catch (IOException e) {} } if (out != null) { out.close(); } } } }