/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ package cc.mallet.extract; import java.util.regex.Pattern; import java.util.*; import cc.mallet.types.Label; import cc.mallet.types.LabelAlphabet; import cc.mallet.types.Sequence; /** * Tokenization filter that will create nested spans based on a hierarchical labeling of the data. * The labels should be of the form <tt>LBL1[|LBLk]*</tt>. For example, * <pre> * A A|B A|B|C A|B|C A|B A A * w1 w2 w3 w4 w5 w6 w7 * </pre> * will result in LabeledSpans like * <tt><A>w1 <B>w2 <C>w3 w4</C> w5</B> w6 w7</A></tt> * * Also, labels of the form <tt><B-field></tt> will force a new instance of the field to begin, * even if it is already active. And prefixes of <tt>I-</tt> are ignored so you can use BIO labeling. * * Created: Nov 12, 2004 * * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: HierarchicalTokenizationFilter.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $ */ public class HierarchicalTokenizationFilter implements TokenizationFilter { Pattern ignorePattern = null; public HierarchicalTokenizationFilter () { } public HierarchicalTokenizationFilter (Pattern ignorePattern) { this.ignorePattern = ignorePattern; } public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq) { LabeledSpans labeled = new LabeledSpans (document); addSpansFromTags (labeled, input, seq, dict, backgroundTag); return labeled; } private static class TagStart { int start; Label label; public TagStart (int start, Label label) { this.start = start; this.label = label; } } private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag) { int i = 0; LinkedList openTags = new LinkedList(); String[] lastTagSplit = new String [0]; while (i < tags.size()) { Label thisTag = dict.lookupLabel (tags.get(i).toString()); String[] thisTagSplit = splitTag (thisTag); int numToClose = compareSplitTags (thisTagSplit, lastTagSplit); // close all that need to be closed while (numToClose > 0) { TagStart tagStart = (TagStart) openTags.removeLast (); addLabeledSpan (labeled, input, tagStart, i, backgroundTag); numToClose--; } // open all that need to be opened for (int tidx = openTags.size (); tidx < thisTagSplit.length; tidx++) { openTags.add (new TagStart (i, dict.lookupLabel (thisTagSplit [tidx]))); } lastTagSplit = thisTagSplit; i++; } // Close all remaining tags while (!openTags.isEmpty ()) { TagStart tagStart = (TagStart) openTags.removeLast (); addLabeledSpan (labeled, input, tagStart, i, backgroundTag); } } private void addLabeledSpan (LabeledSpans labeled, Tokenization input, TagStart tagStart, int end, Label backgroundTag) { Span span = input.subspan (tagStart.start, end); Label splitTag = tagStart.label; labeled.add (new LabeledSpan (span, splitTag, splitTag == backgroundTag)); } private int compareSplitTags (String[] thisTagSplit, String[] lastTagSplit) { int idx = lastTagSplit.length - 1; for (; idx >= 0; idx--) { if (idx >= thisTagSplit.length) continue; String thisTag = thisTagSplit [idx]; if (isBeginName (thisTag)) continue; if (matches (lastTagSplit [idx], thisTag)) break; } int numToClose = lastTagSplit.length - idx - 1; // sanity check while (idx >= 0) { if (!matches (thisTagSplit[idx], lastTagSplit [idx])) { throw new IllegalArgumentException ("Tags don't match."); } idx--; } return numToClose; } private boolean matches (String str1, String str2) { return trim (str1).equals (trim (str2)); } private String trim (String name) { if (isBeginName (name) || isInsideName (name)) return (name.substring (2)); else return name; } private String[] splitTag (Label tag) { String name = tag.toString (); List split1 = new ArrayList (Arrays.asList (name.split ("\\|"))); Iterator it = split1.iterator (); while (it.hasNext()) { String str = (String) it.next(); if (ignorePattern != null && ignorePattern.matcher (str).matches ()) it.remove (); } return (String[]) split1.toArray (new String[0]); } private boolean isBeginName (String name) { return name.startsWith ("B-"); } private boolean isInsideName (String name) { return name.startsWith ("I-"); } }