/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ package cc.mallet.pipe; import java.io.*; import java.net.URI; import java.util.regex.*; import java.util.Set; import cc.mallet.types.Instance; import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; import cc.mallet.util.CharSequenceLexer; import cc.mallet.util.Lexer; /** Similar to {@link SGML2TokenSequence}, except that only the tags listed in <code>allowedTags</code> are converted to {@link Label}s. @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */ public class SelectiveSGML2TokenSequence extends Pipe implements Serializable { Pattern sgmlPattern = Pattern.compile ("</?([^>]*)>"); CharSequenceLexer lexer; String backgroundTag; Set allowedTags; /** @param lexer to tokenize input @param backgroundTag default tag when not in any other tag @param allowed set of tags (Strings) that will be converted to labels */ public SelectiveSGML2TokenSequence (CharSequenceLexer lexer, String backgroundTag, Set allowed) { this.lexer = lexer; this.backgroundTag = backgroundTag; this.allowedTags = allowed; } public SelectiveSGML2TokenSequence (String regex, String backgroundTag, Set allowed) { this (new CharSequenceLexer (regex), backgroundTag, allowed); } public SelectiveSGML2TokenSequence (Set allowed) { this (new CharSequenceLexer(), "O", allowed); } public SelectiveSGML2TokenSequence (CharSequenceLexer lex, Set allowed) { this (lex, "O", allowed); } public Instance pipe (Instance carrier) { if (!(carrier.getData() instanceof CharSequence)) throw new ClassCastException ("carrier.data is a " + carrier.getData().getClass().getName() + " not a CharSequence"); TokenSequence dataTokens = new TokenSequence (); TokenSequence targetTokens = new TokenSequence (); CharSequence string = (CharSequence) carrier.getData(); String tag = backgroundTag; String nextTag = backgroundTag; Matcher m = sgmlPattern.matcher (string); int textStart = 0; int textEnd = 0; int nextStart = 0; boolean done = false; while (!done) { done = !findNextValidMatch (m); if (done) textEnd = string.length()-1; else { String sgml = m.group(); int groupCount = m.groupCount(); if (sgml.charAt(1) == '/') nextTag = backgroundTag; else{ nextTag = m.group(0); nextTag = sgml.substring(1, sgml.length()-1); } nextStart = m.end(); textEnd = m.start(); } if (textEnd - textStart > 0) { lexer.setCharSequence (string.subSequence (textStart, textEnd)); while (lexer.hasNext()) { dataTokens.add (new Token ((String) lexer.next())); targetTokens.add (new Token (tag)); } } textStart = nextStart; tag = nextTag; } carrier.setData(dataTokens); carrier.setTarget(targetTokens); carrier.setSource(dataTokens); return carrier; } /** Finds the next match contained in <code> allowedTags </code>. */ private boolean findNextValidMatch (Matcher m) { if (!m.find ()) return false; String sgml = m.group(); int start = m.start (); int first = 1; int last = sgml.length() - 1; if (sgml.charAt(1) == '/') first = 2; sgml = sgml.substring (first, last); if (allowedTags.contains (sgml)) { m.find (start); return true; } else return findNextValidMatch (m); } public String toString () { String ret = "sgml pattern: " + sgmlPattern.toString(); ret += "\nlexer: " + lexer.getPattern().toString(); ret += "\nbg tag: " + backgroundTag.toString(); ret += "\nallowedHash: " + allowedTags + "\n"; return ret; } // Serialization private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 0; private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt(CURRENT_SERIAL_VERSION); out.writeObject(sgmlPattern); out.writeObject(lexer); out.writeObject(backgroundTag); out.writeObject(allowedTags); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { int version = in.readInt (); sgmlPattern = (Pattern) in.readObject(); lexer = (CharSequenceLexer) in.readObject(); backgroundTag = (String) in.readObject(); allowedTags = (Set) in.readObject(); } }