/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */ package edu.nd.nina.extract; import edu.nd.nina.types.TokenSequence; import edu.nd.nina.util.CharSequenceLexer; public class StringTokenization extends TokenSequence implements Tokenization { private static final long serialVersionUID = 1L; private CharSequence document; /** Create an empty StringTokenization */ public StringTokenization(CharSequence seq) { document = seq; } /** * Creates a tokenization of the given string. Tokens are added from all the * matches of the given lexer. */ public StringTokenization(CharSequence string, CharSequenceLexer lexer) { super(); this.document = string; lexer.setCharSequence(string); while (lexer.hasNext()) { lexer.next(); this.add(new StringSpan(string, lexer.getStartOffset(), lexer .getEndOffset())); } } // xxx Refactor into AbstractTokenization public Span subspan(int firstToken, int lastToken) { StringSpan firstSpan = (StringSpan) get(firstToken); int startIdx = firstSpan.getStartIdx(); int endIdx; if (lastToken > size()) { endIdx = document.length(); } else { StringSpan lastSpan = (StringSpan) get(lastToken - 1); endIdx = lastSpan.getEndIdx(); } return new StringSpan(document, startIdx, endIdx); } public Span getSpan(int i) { return (Span) get(i); } public Object getDocument() { return document; } }