/*
* Copyright 1996-2002 by Andruid Kerne. All rights reserved. CONFIDENTIAL. Use is subject to
* license terms.
*/
package ecologylab.bigsemantics.model;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import ecologylab.bigsemantics.model.text.Term;
import ecologylab.generic.StringBuilderPool;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ElementState;
import ecologylab.serialization.annotations.Hint;
import ecologylab.serialization.annotations.simpl_hints;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_other_tags;
import ecologylab.serialization.annotations.simpl_scalar;
/**
* Smallest unit of top-level text in an HTML page; that is, a token of text that lives outside of
* html tags -- not a tag name, not an attribute, not part of a style, nor of javascript -- that
* results from an html parse.
*
* @author alexgrau
*/
public @simpl_inherit
class TextToken extends ElementState
{
@simpl_scalar
@simpl_hints(Hint.XML_LEAF)
protected String string = "";
/** Link for the TextToken, if one exists */
@simpl_scalar
protected ParsedURL href;
/** Delimiters that come before the string found in WordForms */
@simpl_scalar
protected String delimsBefore = " ";
/**
* Style of the token, plus one. Style are found in Font. 0 is the same as the ChunkStyle, 1 is
* Plain, 2 is Bold, etc. Returned and set using values in Font, just plus or minus one. Done so
* that if the token is the same as the chunk, the value is not stored in the xml.
*/
@simpl_scalar
protected int stylePlusOne = 0;
/**
* Face index (font) for the token. Done in same manner as StylePlusOne. Uses same index as
* fontIndex just plus one so that 0 can be the same as the chunk, rather than -1. Although in the
* methods it is coded as -1.
*/
@simpl_scalar
protected int facePlusOne = 0;
/**
* Font size of the token. 0 means that it is the same as the Chunk.
*/
@simpl_scalar
protected int fontSize = 0;
// TODO Boolean
@simpl_scalar
@simpl_other_tags({"eol"})
protected int endOfLine = 0;
/**
* Underline is an integer that has constants that mean different things. 0-Same as Chunk
* Underline Style. 1-Don't Underline Token 2-Do Underline String but not Delims 3-Do Underline
* Entire Token 4-Do Double Underline String but not Delims 5-Do Double Underline Entire Token
*/
@simpl_scalar
protected int underline = 0;
/** Variable used to denote the end of a link */
public boolean endOfLink;
private boolean fullStringIsOld = true;
private String fullString;
private long ormId;
public static final int NOT_END_OF_LINE = 0;
public static final int END_OF_LINE = 1;
/** Constant that indicates that the style or face index is the same as the chunk */
public static final int SAME_AS_CHUNK_STYLE_OR_FACE = -1;
/** Constant that indicates tha the font size is the same as the chunk */
public static final int SAME_AS_CHUNK_SIZE = 0;
/** Constant that indicates that you underline this token in the same fashion as the chunk */
public static final int SAME_AS_CHUNK_UNDERLINE = 0;
/** Constant that indicates that you do not underline this token */
public static final int UNDERLINE_NONE = 1;
/**
* Constant that indicates that you do underline the token string part of it but not the delims
* before.
*/
public static final int UNDERLINE_STRING = 2;
/** Constant that indicates that you underline the entire token, even the delims before */
public static final int UNDERLINE_ENTIRE_TOKEN = 3;
/**
* Constant that indicates that you do double underline the token string part of it but not the
* delims before.
*/
public static final int DOUBLE_UNDERLINE_STRING = 4;
/** Constant that indicates that you double underline the entire token, even the delims before */
public static final int DOUBLE_UNDERLINE_ENTIRE_TOKEN = 5;
public static final StringBuilderPool stringBufPool = new StringBuilderPool(25);
/**
* Empty constructor for opening in xml translation.
*/
public TextToken ()
{
super();
}
/**
* Creates a token based on a string, delimeters and a link, uses the same other featurs as the
* chunk
*
* @param s
* @param delims
* @param h
*/
public TextToken ( String s, String delims, ParsedURL h )
{
this(s, h, SAME_AS_CHUNK_STYLE_OR_FACE, SAME_AS_CHUNK_SIZE, delims,
SAME_AS_CHUNK_STYLE_OR_FACE, SAME_AS_CHUNK_UNDERLINE);
}
/**
* Creates a textToken.
*
* @param s
* @param delims
* @param h
* @param style
* @param fontSize
* @param faceIndex
*/
public TextToken ( String s, String delims, ParsedURL h, int style, int fontSize, int faceIndex )
{
this(s, h, style, fontSize, delims, faceIndex, SAME_AS_CHUNK_UNDERLINE);
}
/**
* This is for the coping previousToken
*/
public TextToken ( TextToken previousToken )
{
this(previousToken.string, previousToken.href, previousToken.fontStyle(), previousToken
.getFontSize(), previousToken.delimsBefore, previousToken.faceIndex(), previousToken
.getUnderline());
}
/**
* Creates a textToken with predesignated features.
*
* @param s
* @param h
* @param fontStyle
* @param tokenFontSize
* @param delims
* @param faceIndex
* @param underlineOpp
*/
public TextToken ( String s, ParsedURL h, int fontStyle, int tokenFontSize, String delims,
int faceIndex, int under )
{
if (s != null)
string = s;
href = h;
stylePlusOne = fontStyle + 1;
fontSize = tokenFontSize;
delimsBefore = delims;
facePlusOne = faceIndex + 1;
underline = under;
}
public void setString ( String string )
{
if (string == null)
this.string = "";
else
this.string = string;
resetFullString();
}
public String lc()
{
return getString().toLowerCase();
}
public String toString ( )
{
StringBuilder sb = stringBufPool.acquire();
sb.append(getString());
sb.append("->");
sb.append(href);
return stringBufPool.releaseAndGetString(sb);
}
public String getString ( )
{
return string;
}
public String fullString ( )
{
if (fullStringIsOld)
rebuildFullString();
return fullString;
}
public int faceIndex ( )
{
return facePlusOne - 1;
}
public void setFaceIndex ( int faceIndex )
{
facePlusOne = faceIndex + 1;
}
public int getUnderline ( )
{
return underline;
}
public void setUnderline ( int value )
{
underline = value;
}
public boolean empty ( )
{
return string.equals("");
}
public ParsedURL getHref ( )
{
return href;
}
public int fontStyle ( )
{
return stylePlusOne - 1;
}
public int getStylePlusOne ( )
{
return stylePlusOne;
}
public int getFontSize ( )
{
return fontSize;
}
public String getDelimsBefore ( )
{
return delimsBefore;
}
public void setHref ( ParsedURL newHref )
{
href = newHref;
}
public void setHref ( String value )
{
if (!value.equals("null"))
{
try
{
// System.out.println("***********" + value);
href = new ParsedURL(new URL(value));
}
catch (Exception e)
{
e.printStackTrace();
}
}
else
href = null;
}
public void setFontStyle ( int style )
{
stylePlusOne = style + 1;
}
public void setStylePlusOne( int stylePlusOne )
{
this.stylePlusOne = stylePlusOne;
}
public void setFontSize ( int tokenSize )
{
fontSize = tokenSize;
}
public void setDelimsBefore ( String s )
{
delimsBefore = s;
resetFullString();
}
public void addDelimBefore ( String s )
{
StringBuilder sb = stringBufPool.acquire();
sb.append(delimsBefore);
sb.append(s);
delimsBefore = stringBufPool.releaseAndGetString(sb);
resetFullString();
}
public void addDelimBefore ( String s, int i )
{
StringBuilder sb = stringBufPool.acquire();
sb.append(delimsBefore);
sb.insert(i+1, s);
delimsBefore = stringBufPool.releaseAndGetString(sb);
resetFullString();
}
public void removeDelimBefore ( )
{
if (!delimsBefore.equals(""))
{
delimsBefore = delimsBefore.substring(0, delimsBefore.length() - 1);
resetFullString();
}
}
public void removeDelimBefore ( int i )
{
if (!delimsBefore.equals(""))
{
StringBuilder sb = stringBufPool.acquire();
sb.append(delimsBefore);
sb.deleteCharAt(i);
delimsBefore = stringBufPool.releaseAndGetString(sb);
resetFullString();
}
}
/**
* Free resources associated with this.
*/
public void recycle ( )
{
href = null;
super.recycle();
}
public int getEndOfLine ( )
{
return endOfLine;
}
public void setEndOfLine ( int endOfLine )
{
this.endOfLine = endOfLine;
}
private void resetFullString()
{
fullStringIsOld = true;
}
private void rebuildFullString()
{
StringBuilder sb = stringBufPool.acquire();
sb.append(delimsBefore);
sb.append(getString());
fullString = stringBufPool.releaseAndGetString(sb);
fullStringIsOld = false;
}
/**
*
* @param c
* @return true if this token has a String, and the character is contained in it.
*/
public boolean contains(char c)
{
return string == null ? false : string.indexOf(c) >= 0;
}
public Term xterm()
{
return null;
}
public static final Set<String> TERMINAL_PUNCTUATION = new HashSet<String>();
static
{
TERMINAL_PUNCTUATION.add(".");
TERMINAL_PUNCTUATION.add(".\"");
TERMINAL_PUNCTUATION.add("?");
TERMINAL_PUNCTUATION.add("?\"");
TERMINAL_PUNCTUATION.add("!");
}
public static final Set<String> TERMINAL_EXCEPTIONS = new HashSet<String>();
static
{
TERMINAL_EXCEPTIONS.add("i.e.");
TERMINAL_EXCEPTIONS.add("e.g.");
TERMINAL_EXCEPTIONS.add("mr.");
TERMINAL_EXCEPTIONS.add("mrs.");
TERMINAL_EXCEPTIONS.add("ms.");
TERMINAL_EXCEPTIONS.add("jr.");
TERMINAL_EXCEPTIONS.add("sr.");
TERMINAL_EXCEPTIONS.add("sgt.");
TERMINAL_EXCEPTIONS.add("u.s.");
TERMINAL_EXCEPTIONS.add("corp.");
TERMINAL_EXCEPTIONS.add("inc.");
TERMINAL_EXCEPTIONS.add("co.");
TERMINAL_EXCEPTIONS.add("jan.");
TERMINAL_EXCEPTIONS.add("feb.");
TERMINAL_EXCEPTIONS.add("mar.");
TERMINAL_EXCEPTIONS.add("apr.");
// No need for May
TERMINAL_EXCEPTIONS.add("jun.");
TERMINAL_EXCEPTIONS.add("jul.");
TERMINAL_EXCEPTIONS.add("aug.");
TERMINAL_EXCEPTIONS.add("sep.");
TERMINAL_EXCEPTIONS.add("oct.");
TERMINAL_EXCEPTIONS.add("nov.");
TERMINAL_EXCEPTIONS.add("dec.");
}
protected static Pattern HONORIFIC_MATCHER = Pattern.compile("(\\p{Upper}\\.)+");
public boolean endsWithTerminal()
{
String string = this.getString();
if (!TERMINAL_EXCEPTIONS.contains(string.toLowerCase())
&& !HONORIFIC_MATCHER.matcher(string).matches())
{
return endsWith(TERMINAL_PUNCTUATION);
}
return false;
}
private boolean endsWith(java.util.Collection<String> endings)
{
String string = this.getString();
for (String ending : endings)
{
if (string.endsWith(ending))
{
return true;
}
}
return false;
}
public long getOrmId()
{
return ormId;
}
public void setOrmId(long ormId)
{
this.ormId = ormId;
}
public int getFacePlusOne()
{
return facePlusOne;
}
public void setFacePlusOne(int facePlusOne)
{
this.facePlusOne = facePlusOne;
}
}