/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.util;
import java.io.*;
import java.lang.CharSequence;
import java.util.Iterator;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.*;
import cc.mallet.util.Lexer;
/**
 * A {@link Lexer} that splits a {@link CharSequence} into tokens by repeatedly
 * searching it for matches of a regular expression.  Every non-empty match of
 * the pattern is one token; text between matches is skipped.
 *
 * <p>The lexer looks one token ahead: {@link #hasNext()} already advances the
 * underlying matcher, so {@link #getStartOffset()}, {@link #getEndOffset()} and
 * {@link #getTokenString()} always describe the most recently <em>found</em>
 * match, which may not have been handed out by {@link #next()} yet.
 *
 * <p>Only the pattern (string and flags) and the look-ahead flag are written
 * when an instance is serialized; the input and the match position are not.
 */
public class CharSequenceLexer implements Lexer, Serializable
{
    // Some predefined lexing rules

    /** Runs of ASCII alphabetic characters. */
    public static final Pattern LEX_ALPHA = Pattern.compile ("\\p{Alpha}+");

    /** Runs of word characters ({@code \w}). */
    public static final Pattern LEX_WORDS = Pattern.compile ("\\w+");

    /** Runs of non-whitespace characters, kept together as one token. */
    public static final Pattern LEX_NONWHITESPACE_TOGETHER = Pattern.compile ("\\S+");

    /** Runs of alphabetic characters or runs of digits. */
    public static final Pattern LEX_WORD_CLASSES =
        Pattern.compile ("\\p{Alpha}+|\\p{Digit}+");

    /** Runs of alphabetic characters, runs of digits, or a single punctuation character. */
    public static final Pattern LEX_NONWHITESPACE_CLASSES =
        Pattern.compile ("\\p{Alpha}+|\\p{Digit}+|\\p{Punct}");

    /**
     * Runs of Unicode lowercase and uppercase letters.
     * The two categories are combined as a union; an intersection ({@code &&})
     * would be empty because no character is both lowercase and uppercase.
     */
    public static final Pattern UNICODE_LETTERS =
        Pattern.compile ("[\\p{Ll}\\p{Lu}]+");

    /** Pattern whose matches are the tokens. */
    Pattern regex;
    /** Matcher over {@link #input}; stays null until an input has been set. */
    Matcher matcher = null;
    /** Text currently being lexed, or null. */
    CharSequence input;
    /** Text of the most recently found token, or null if there is none. */
    String matchText;
    /** True when {@link #matchText} has been looked up but not yet returned by {@link #next()}. */
    boolean matchTextFresh;

    /** Creates a lexer with no input that uses {@link #LEX_ALPHA}. */
    public CharSequenceLexer ()
    {
        this (LEX_ALPHA);
    }

    /**
     * Creates a lexer with no input.
     *
     * @param regex pattern whose matches are the tokens
     */
    public CharSequenceLexer (Pattern regex)
    {
        this.regex = regex;
        setCharSequence (null);
    }

    /**
     * Creates a lexer with no input.
     *
     * @param regex regular expression whose matches are the tokens
     */
    public CharSequenceLexer (String regex)
    {
        this (Pattern.compile (regex));
    }

    /**
     * Creates a lexer positioned at the start of {@code input}.
     *
     * @param input text to lex; may be null
     * @param regex pattern whose matches are the tokens
     */
    public CharSequenceLexer (CharSequence input, Pattern regex)
    {
        this (regex);
        setCharSequence (input);
    }

    /**
     * Creates a lexer positioned at the start of {@code input}.
     *
     * @param input text to lex; may be null
     * @param regex regular expression whose matches are the tokens
     */
    public CharSequenceLexer (CharSequence input, String regex)
    {
        this (input, Pattern.compile (regex));
    }

    /**
     * Replaces the text being lexed and discards any current token.
     * When {@code input} is null the previous matcher object is left in place,
     * but no token is reported until the next look-ahead.
     *
     * @param input new text to lex, or null
     */
    public void setCharSequence (CharSequence input)
    {
        this.input = input;
        this.matchText = null;
        this.matchTextFresh = false;
        if (input != null)
            this.matcher = regex.matcher (input);
    }

    /** @return the text currently being lexed, or null if none has been set */
    public CharSequence getCharSequence ()
    {
        return input;
    }

    /** @return the regular expression string of the current pattern */
    public String getPattern ()
    {
        return regex.pattern ();
    }

    /**
     * Switches to a different regular expression.  Nothing happens when
     * {@code reg} is the same string as the current pattern.  Otherwise the
     * expression is compiled (without flags) and, if an input is set, lexing
     * restarts from the beginning of that input so that the tokens produced
     * agree with {@link #getPattern()}.
     *
     * @param reg the new regular expression
     */
    public void setPattern (String reg)// added by Fuchun
    {
        // Compare pattern strings; comparing the Pattern object itself to a
        // String can never be equal.
        if (!regex.pattern ().equals (reg)) {
            this.regex = Pattern.compile (reg);
            // The old matcher is bound to the old pattern, so rebuild it.
            if (input != null)
                setCharSequence (input);
        }
    }

    /** @return start offset of the current token in the input, or -1 if there is no current token */
    public int getStartOffset ()
    {
        if (matchText == null)
            return -1;
        return matcher.start ();
    }

    /** @return end offset (exclusive) of the current token in the input, or -1 if there is no current token */
    public int getEndOffset ()
    {
        if (matchText == null)
            return -1;
        return matcher.end ();
    }

    /** @return text of the current token, or null if there is no current token */
    public String getTokenString ()
    {
        return matchText;
    }

    // Iterator interface methods

    /**
     * Advances to the next non-empty match and stores it in {@link #matchText},
     * or stores null when there is no matcher or no further match.
     */
    private void updateMatchText ()
    {
        String found = null;
        if (matcher != null) {
            // Patterns that can match the empty string (".*" for instance)
            // produce zero-length matches; skip them.  A loop is used instead
            // of recursion so a long run of empty matches cannot exhaust the
            // call stack.
            while (matcher.find ()) {
                String candidate = matcher.group ();
                if (candidate.length () != 0) {
                    found = candidate;
                    break;
                }
            }
        }
        matchText = found;
        matchTextFresh = true;
    }

    /** @return true if another token is available */
    public boolean hasNext ()
    {
        if (! matchTextFresh)
            updateMatchText ();
        return (matchText != null);
    }

    /**
     * Returns the next token.
     *
     * @return the token text as a String, or null when the input is exhausted
     */
    public Object next ()
    {
        if (! matchTextFresh)
            updateMatchText ();
        matchTextFresh = false;
        return matchText;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove ()
    {
        throw new UnsupportedOperationException ();
    }

    // Serialization

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 1;

    /**
     * Writes the serial version, the pattern string, the pattern flags and the
     * look-ahead flag.  The Pattern is stored as string plus flags and
     * recompiled on read, rather than being serialized as an object.
     */
    private void writeObject (ObjectOutputStream out) throws IOException {
        out.writeInt (CURRENT_SERIAL_VERSION);
        // Version 1 layout: pattern string followed by its flags.
        out.writeObject (regex.pattern ());
        out.writeInt (regex.flags ());
        out.writeBoolean (matchTextFresh);
    }

    /**
     * Reads an instance written by {@link #writeObject}.  Version 0 streams
     * hold a serialized Pattern; version 1 streams hold the pattern string and
     * flags.
     *
     * @throws InvalidObjectException if the stream carries an unknown version
     */
    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
        int version = in.readInt ();
        if (version == 0) {
            regex = (Pattern) in.readObject ();
        } else if (version == 1) {
            String p = (String) in.readObject ();
            int flags = in.readInt ();
            regex = Pattern.compile (p, flags);
        } else {
            // Fail here rather than leave regex null and break later.
            throw new InvalidObjectException (
                "Unsupported CharSequenceLexer serial version: " + version);
        }
        matchTextFresh = in.readBoolean ();
    }

    /**
     * Prints, one per line, the {@link #LEX_NONWHITESPACE_CLASSES} tokens of
     * every line of the file named by {@code args[0]}.  Any exception is
     * printed to standard output.
     */
    public static void main (String[] args)
    {
        try {
            BufferedReader in = new BufferedReader (new FileReader (args[0]));
            try {
                for (String line = in.readLine (); line != null; line = in.readLine ()) {
                    CharSequenceLexer csl =
                        new CharSequenceLexer (line, LEX_NONWHITESPACE_CLASSES);
                    while (csl.hasNext ())
                        System.out.println (csl.next ());
                }
            } finally {
                // Release the file handle even if reading fails.
                in.close ();
            }
        } catch (Exception e) {
            System.out.println (e.toString ());
        }
    }
}