/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */ package cc.mallet.types; import java.io.*; import java.util.regex.*; import java.util.HashMap; import gnu.trove.TObjectIntHashMap; import java.util.Set; import java.util.Iterator; // xxx A not very space-efficient version. I'll compress it later. public class StringEditFeatureVectorSequence extends FeatureVectorSequence implements Serializable { private int string1Length, string2Length; private String string1, string2; private String[] string1Blocks, string2Blocks; private TObjectIntHashMap string1Present, string2Present; private TObjectIntHashMap lexicon; private int[] block1Indices, block2Indices; private char delim = ':'; private static final char defaultDelimiter = ':'; public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2) { this (featureVectors, s1, s2, defaultDelimiter); } public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, char delimiter) { this (featureVectors, s1, s2, delimiter, null); } public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, HashMap lexic) { this (featureVectors, s1, s2, defaultDelimiter, lexic); } public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, char delimiter, HashMap lexic) { super (featureVectors); this.delim = delimiter; this.lexicon = new TObjectIntHashMap(); if (lexic != null) { Set keys = lexic.keySet(); java.util.Iterator iter = keys.iterator(); while (iter.hasNext()) this.lexicon.put((String) iter.next(), 1); } this.string1 = s1; this.string2 = s2; this.string1Length = s1.length() + 2; this.string2Length = s2.length() + 2; string1Blocks = string1.split("" + delim); string2Blocks = string2.split("" + delim); string1Present = new TObjectIntHashMap(); string2Present = new TObjectIntHashMap(); block1Indices = new int[string1Length]; if (string1Blocks.length > 0) { int whichBlock = 0; block1Indices[0] = whichBlock++; for (int i = 0; i < string1Blocks.length; i++) string1Present.put(string1Blocks[i], 1); for (int i = 1; i < string1Length-1; i++) block1Indices[i] = ((string1.charAt(i-1) == delim) ? whichBlock++ : -1); block1Indices[string1Length-1] = -1; } block2Indices = new int[string2Length]; if (string2Blocks.length > 0) { int whichBlock = 0; block2Indices[0] = whichBlock++; for (int i = 0; i < string2Blocks.length; i++) string2Present.put(string2Blocks[i], 1); for (int i = 1; i < string2Length - 1; i++) block2Indices[i] = ((string2.charAt(i-1) == delim) ? whichBlock++ : -1); block2Indices[string2Length-1] = -1; } } public String getString1() { return string1; } public String getString2() { return string2; } public int getString1Length () { return string1Length; } public int getString2Length () { return string2Length; } // End of Block public int getString1EOBIndex(String delimiter) { return getString1EOBIndex(delimiter, 0); } public int getString1EOBIndex(String delimiter, int start) { return getString1IndexOf(delimiter, start); } public String getString1BlockAtIndex(int idx) { if (idx < 0 || idx >= block1Indices.length || block1Indices[idx] < 0 || block1Indices[idx] >= string1Blocks.length) return null; else return string1Blocks[block1Indices[idx]]; } public int getString1IndexOf(String str, int start) { int toret = string1.indexOf(str, start); if (toret == -1) toret = string1.length() - 1 - start; else toret = toret - start; if (toret < 1) return -1; return toret; } public boolean isPresent1(String patternStr) { Pattern p = Pattern.compile(patternStr); Matcher m = p.matcher(string1); boolean b = m.matches(); return b; } public boolean isPresentInString1(String str) { return string1Present.containsKey(str); } public char getString1Char(int index) { index = index - 1; if (index < 0 || index >= string1.length()) return (char) 0; else return string1.charAt(index); } public int getString2EOBIndex(String delimiter) { return getString2EOBIndex(delimiter, 0); } public int getString2EOBIndex(String delimiter, int start) { return getString2IndexOf(delimiter, start); } public String getString2BlockAtIndex(int idx) { if (idx < 0 || idx >= block2Indices.length || block2Indices[idx] < 0 || block2Indices[idx] >= string2Blocks.length) return null; else return string2Blocks[block2Indices[idx]]; } public boolean isPresentInString2(String str) { return string2Present.containsKey(str); } public int getString2IndexOf(String str, int start) { int toret = string2.indexOf(str, start); if (toret == -1) toret = string2.length() - 1 - start; else toret = toret - start; if (toret < 1) return -1; return toret; } public boolean isPresent2(String patternStr) { Pattern p = Pattern.compile(patternStr); Matcher m = p.matcher(string2); boolean b = m.matches(); return b; } public char getString2Char(int index) { index = index - 1; if (index < 0 || index >= string2.length()) return (char) 0; else return string2.charAt(index); } public boolean isInLexicon(String str) { if (lexicon == null || str == null) return false; return lexicon.containsKey(str); } public String toString () { StringBuffer sb = new StringBuffer (); sb.append (super.toString()); sb.append ('\n'); sb.append ("String 1: " + string1Length + " String 2: " + string2Length); return sb.toString(); } // Serialization of Instance private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 0; private static final int NULL_INTEGER = -1; private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt (CURRENT_SERIAL_VERSION); out.writeInt (string1Length); out.writeInt (string2Length); out.writeObject (string1); out.writeObject (string2); if (string1Blocks == null) { out.writeInt(NULL_INTEGER); } else { int size = string1Blocks.length; out.writeInt(size); for(int i=0; i<size; i++) { out.writeObject(string1Blocks[i]); } } if (string2Blocks == null) { out.writeInt(NULL_INTEGER); } else { int size = string2Blocks.length; out.writeInt(size); for(int i=0; i<size; i++) { out.writeObject(string2Blocks[i]); } } out.writeObject(string1Present); out.writeObject(string2Present); out.writeObject(lexicon); if (block1Indices == null) { out.writeInt(NULL_INTEGER); } else { int size = block1Indices.length; out.writeInt(size); for (int i=0; i<size; i++) { out.writeInt(block1Indices[i]); } } if (block2Indices == null) { out.writeInt(NULL_INTEGER); } else { int size = block2Indices.length; out.writeInt(size); for (int i=0; i<size; i++) { out.writeInt(block2Indices[i]); } } out.writeChar(delim); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { int version = in.readInt (); int string1Length = in.readInt(); int string2Length = in.readInt(); String string1 = (String) in.readObject(); String string2 = (String) in.readObject(); int size = in.readInt(); if (size == NULL_INTEGER) { string1Blocks = null; } else { string1Blocks = new String[size]; for (int i = 0; i<size; i++) { string1Blocks[i] = (String) in.readObject(); } } size = in.readInt(); if (size == NULL_INTEGER) { string2Blocks = null; } else { string2Blocks = new String[size]; for (int i = 0; i<size; i++) { string2Blocks[i] = (String) in.readObject(); } } TObjectIntHashMap string1Present = (TObjectIntHashMap) in.readObject(); TObjectIntHashMap string2Present = (TObjectIntHashMap) in.readObject(); TObjectIntHashMap lexicon = (TObjectIntHashMap) in.readObject(); size = in.readInt(); if (size == NULL_INTEGER) { block1Indices = null; } else { block1Indices = new int[size]; for (int i = 0; i<size; i++) { block1Indices[i] = in.readInt(); } } size = in.readInt(); if (size == NULL_INTEGER) { block2Indices = null; } else { block2Indices = new int[size]; for (int i = 0; i<size; i++) { block2Indices[i] = in.readInt(); } } delim = in.readChar(); } }