/*
 * Copyright 1996-2002 by Andruid Kerne. All rights reserved. CONFIDENTIAL. Use is subject to
 * license terms.
 */
package ecologylab.bigsemantics.model;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import ecologylab.appframework.types.prefs.Pref;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.model.text.Term;
import ecologylab.bigsemantics.model.text.TermDictionary;
import ecologylab.generic.StringBuilderPool;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ElementState;
import ecologylab.serialization.annotations.simpl_collection;
import ecologylab.serialization.annotations.simpl_composite;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_nowrap;
import ecologylab.serialization.annotations.simpl_other_tags;
import ecologylab.serialization.annotations.simpl_scalar;
import ecologylab.serialization.annotations.simpl_scope;
import ecologylab.serialization.types.FundamentalTypes;
import ecologylab.serialization.types.ScalarType;
import ecologylab.textformat.NamedStyle;

/**
 * A text chunk from an HTML page, delimited by markup or a size threshold, and composed of an
 * ordered collection of {@link TextToken TextToken}s.
 */
@simpl_inherit
public abstract class TextChunkBase<T extends TextToken> extends ElementState
    implements Iterable<T> {

  @simpl_collection
  @simpl_scope(TextTokenTranslations.TEXT_TOKEN_SCOPE_NAME)
  @simpl_nowrap
  protected List<T> tokens;

  /**
   * Named style for this text chunk. Defaults to an anonymous style.
   */
  @simpl_composite
  @simpl_other_tags({ "anon_style" })
  protected NamedStyle namedStyle = new NamedStyle(DEFAULT_POINT_SIZE);

  /**
   * Current style name. Either this or the anonymous style will be null, so that only one of the
   * two is serialized to XML.
   */
  @simpl_scalar
  protected String styleName = null;

  @simpl_scalar
  protected ParsedURL commonHref = null;

  @simpl_scalar
  protected float nonStopIndex;

  private boolean recycled;

  private ScalarType scalarType = null;

  /**
   * Used by the ORM layer as the database-generated surrogate id.
   */
  private long ormId;

  public static final int DEFAULT_POINT_SIZE = 21;

  /** Estimate used for StringBuffer allocation. */
  public static int CHARS_PER_TOKEN = 12;

  public static final int LEFT = 0;
  public static final int CENTER = 1;
  public static final int RIGHT = 2;

  /**
   * The maximum number of words for a text surrogate.
   */
  public static final int MAX_WORDS = 9;

  /**
   * The minimum number of words for a text surrogate.
   */
  public static final int MIN_WORDS = 5;

  private static StringBuilderPool sbp = new StringBuilderPool(25);

  static HashMap<String, String[]> garbageFilterMap = new HashMap<String, String[]>();

  static final String[][] garbageFilterStrings = {
    { "all", "rights", "reserved" }, { "is", "a", "trademark" }, { "copyright" },
    { "last", "updated" }, { "you", "searched" }, { "email", "inquiries" },
    { "best", "viewed", "with" }, { "search", "for", "more" }, { "password" },
    { "last", "modified" }, { "posted", "at" }, { "subscriber", "id" }, { "terms", "under" },
    { "text", "only" }, { "text", "version" }, { "search", "results" }, { "hotbot", "results" },
    { "disclaimer" }, { "see", "results", "from" }, { "altavista", "found" },
    { "results", "for" }, { "hits", "since" }, { "visitor", "number" }, { "error", "occurred" },
    { "support", "frames" }, { "error", "404" }, { "found", "error" }, { "contact", "us" },
    { "slide", "shows" }, { "see", "sample" }, { "special", "offer" }, { "privacy", "policy" },
    { "license", "agreement" }, { "terms", "of", "use" }, { "sign", "up" }, { "sign", "in" },
    { "sign", "off" }, { "ameritrade" },
  };

  static {
    for (String[] thatFilter : garbageFilterStrings)
      garbageFilterMap.put(thatFilter[0], thatFilter);
  }
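  // Example of how this table drives isGarbage() (defined below): a chunk beginning
  // "Terms of use apply" is flagged, because "terms" keys the { "terms", "of", "use" } entry in
  // garbageFilterMap, and the two tokens that follow contain "of" and "use".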
  /**
   * Empty constructor for opening in XML translation.
   */
  protected TextChunkBase() {
    this(false);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ParsedURL commonHref) {
    this(doUnderline);
    this.commonHref = commonHref;
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ParsedURL commonHref, int size, int faceIndex,
                          int fontStyle, int alignment) {
    this(doUnderline);
    this.commonHref = commonHref;
    getNamedStyle().setFontSize(size);
    getNamedStyle().setFaceIndex(faceIndex);
    getNamedStyle().setFontStyle(fontStyle);
    getNamedStyle().setAlignment(alignment);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline) {
    this(doUnderline, FundamentalTypes.STRING_TYPE);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ScalarType scalarType) {
    getNamedStyle().setUnderline(doUnderline);
    this.scalarType = scalarType;
    this.tokens = new ArrayList<T>();
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderlineArg, CharSequence untokenized) {
    this(doUnderlineArg);
    tokenize(untokenized, scalarType);
  }

  /**
   * @param doUnderlineArg
   * @param untokenized
   * @param scalarType
   *          supplies the tokenizer pattern; its allowDelimitersInTokens() determines whether the
   *          delimiters are kept in the delimsBefore of each token.
   */
  protected TextChunkBase(boolean doUnderlineArg, CharSequence untokenized, ScalarType scalarType) {
    this(doUnderlineArg, scalarType);
    tokenize(untokenized, scalarType);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(TextChunkBase<T> copyChunk) {
    this(copyChunk.namedStyle.getUnderline());
    int size = copyChunk.size();
    for (int i = 0; i < size; i++) {
      T newToken = newToken(copyChunk.token(i));
      add(newToken);
    }
  }

  /**
   * Factory method to call the correct constructor.
   *
   * @param doUnderlineArg
   * @param untokenized
   * @param scalarType
   * @return
   */
  public abstract TextChunkBase<T> newTextChunk(boolean doUnderlineArg, CharSequence untokenized,
                                                ScalarType scalarType);

  public abstract TextChunkBase<T> newTextChunk();

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param string
   * @param delims
   * @param href
   * @return
   */
  abstract public T newToken(String string, String delims, ParsedURL href);

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param string
   * @param delims
   * @param href
   * @param style
   * @return
   */
  abstract public T newToken(String string, String delims, ParsedURL href, int style, int fontSize,
                             int faceIndex);

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param prevToken
   * @return
   */
  abstract public T newToken(TextToken prevToken);

  /**
   * Tokenizes a string, using the tokenizer pattern supplied by the given ScalarType.
   *
   * @param untokenized
   * @param scalarType
   */
  protected void tokenize(CharSequence untokenized, ScalarType scalarType) {
    if (scalarType == null) {
      error("tokenize() scalarType==null; untokenized = " + untokenized);
      return; // without a ScalarType there is no tokenizer pattern to use
    }
    if (untokenized == null)
      return;
    Pattern pattern = scalarType.delimitersTokenizer();
    Matcher matcher = pattern.matcher(untokenized);
    String delimsBefore = "";
    final boolean allowDelimitersInTokens = scalarType.allowDelimitersInTokens();
    boolean first = true;
    while (matcher.find()) {
      String group1 = matcher.group(1);
      if (group1 != null && group1.contains("\n"))
        group1 = "\n"; // collapse any delimiter run that spans a line break to a single newline
      if (allowDelimitersInTokens && !first)
        delimsBefore = group1;
      T textToken = newToken(matcher.group(2), delimsBefore, null);
      addTextToken(textToken);
      if (!allowDelimitersInTokens)
        delimsBefore = scalarType.primaryDelimiter();
      first = false;
    }
  }
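  // Sketch of the tokenize() contract, assuming a ScalarType whose delimitersTokenizer() pattern
  // captures (delimiters)(token) as groups 1 and 2:
  //   tokenize("one  two\nthree", scalarType);
  // would yield the tokens "one", "two", "three". Each token's delimsBefore holds either the
  // actual delimiter run before it (collapsed to "\n" when it spans a line break) or the
  // ScalarType's primaryDelimiter(), depending on allowDelimitersInTokens(); the first token
  // always gets an empty delimsBefore.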
  public void initDoubleUnderline() {
    int hrefInRow = 0;
    for (int i = 0; i < this.size(); i++) {
      T token = this.get(i);
      if (token.getHref() != null) {
        if (hrefInRow == 0)
          token.setUnderline(TextToken.DOUBLE_UNDERLINE_STRING);
        else
          token.setUnderline(TextToken.DOUBLE_UNDERLINE_ENTIRE_TOKEN);
        hrefInRow++;
      } else {
        hrefInRow = 0;
      }
    }
  }

  /**
   * Tokenizes a string and appends the resulting tokens to the end of this TextChunkBase.
   *
   * @param string
   */
  public void appendString(CharSequence string) {
    tokenize(string, scalarType);
  }

  /**
   * Appends the tokens of another TextChunkBase to the end of this one.
   *
   * @param textChunk
   */
  public void appendTextChunk(TextChunkBase<T> textChunk) {
    for (T textToken : textChunk)
      addTextToken(textToken);
  }

  /** Should be called addTextToken(). */
  public void add(String string, ParsedURL href) {
    T token = newToken(string, "", href);
    add(token);
  }

  public void add(String string, String delims, ParsedURL href) {
    T token = newToken(string, delims, href);
    add(token);
  }

  public void addTextToken(T textToken) {
    if (textToken == null)
      return;
    add(textToken);
  }

  public void endLink() {
    ((TextToken) lastElement()).endOfLink = true;
  }

  /**
   * @return the length of this TextChunk in characters, whitespace included.
   */
  public int length() {
    int sum = 0;
    int n = size();
    for (int i = 0; i != n; i++)
      sum += token(i).fullString().length();
    return sum;
  }

  /**
   * Returns the ith TextToken, or null if i is out of range.
   *
   * @param i
   * @return
   */
  public T token(int i) {
    return (i >= size()) ? null : get(i);
  }

  /**
   * Returns the ith TextToken.
   *
   * @param i
   * @return
   */
  public TextToken textToken(int i) {
    return token(i);
  }

  /**
   * Returns the string for token i in the chunk.
   *
   * @param i
   * @return
   */
  public String string(int i) {
    TextToken token = token(i);
    return (token == null) ? null : token.fullString();
  }

  public String lc(int i) {
    TextToken token = token(i);
    return (token == null) ? null : token.lc();
  }

  public String string() {
    return stringBuilder().toString();
  }

  public StringBuilder stringBuilder() {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i != size(); i++) {
      TextToken t = token(i);
      sb.append(t.getDelimsBefore());
      sb.append(t.getString());
    }
    return sb;
  }

  public boolean empty() {
    // return (size() == 0) || token(0).empty();
    return (size() == 0 || string().trim().equals(""));
  }

  static final String xmlCloseAnchor = "</a>";
  static final String regularCloseAnchor = "</a>";

  /**
   * The equivalent of emitHtml(false).
   *
   * @return An HTML representation of this TextChunkBase.
   */
  public String emitHtml() {
    return emitHtml(false);
  }

  /**
   * @param xmlBrackets
   *          If true, generate HTML for inclusion within XML.
   * @return An HTML representation of this TextChunkBase.
   */
  public String emitHtml(boolean xmlBrackets) {
    String result = "";
    boolean inHref = false;
    String open, close, closeAnchor;
    // NOTE: both branches currently produce identical markup.
    if (xmlBrackets) {
      open = "<";
      close = ">";
      closeAnchor = xmlCloseAnchor;
    } else {
      open = "<";
      close = ">";
      closeAnchor = regularCloseAnchor;
    }
    for (int i = 0; i != size(); i++) {
      TextToken thatToken = token(i);
      ParsedURL href = thatToken.getHref();
      if (!inHref && (href != null)) {
        result += open + "a "
            + ecologylab.serialization.XMLTools.nameVal("href", href.url()) + close;
        inHref = true;
      }
      if (inHref && (href == null)) {
        // close the anchor before appending the first unlinked token
        result += closeAnchor;
        inHref = false;
      }
      result += thatToken.getDelimsBefore() + thatToken.getString() + " ";
    }
    if (inHref)
      result += closeAnchor;
    return result;
  }
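  // Example of emitHtml() output for the hypothetical tokens
  // [Visit][home(href=http://example.com)][now]:
  //   Visit <a href="http://example.com">home </a>now
  // An anchor opens at the first linked token and closes before the first unlinked one; a space
  // trails every token.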
  public T lastElement() {
    return (size() == 0) ? null : this.get(size() - 1);
  }

  public void removeElementAt(int i) {
    if (tokens != null)
      tokens.remove(i);
  }

  /**
   * @return true if this TextChunkBase looks like garbage.
   */
  public boolean isGarbage() {
    String s0;
    // prime the pump
    String s1 = lc(0);
    String s2 = lc(1);
    int n = size();
    for (int i = 0; i < n; i++) {
      // compare lower-case forms, so that filter matching ignores case
      s0 = s1;
      s1 = s2;
      s2 = lc(i + 2);
      String[] filterJ = garbageFilterMap.get(s0);
      if (filterJ != null) {
        int filterLength = filterJ.length;
        if ((filterLength < 2)
            || (StringTools.contains(s1, filterJ[1])
                && ((filterLength < 3) || StringTools.contains(s2, filterJ[2])))) {
          return true;
        }
      }
    }
    return false;
  }

  String string; // TODO -- invalidate this cached string when editing happens!

  public String toString() {
    StringBuilder buffy = toStringBuilder();
    String result = StringTools.toString(buffy);
    StringBuilderUtils.release(buffy);
    return result;
  }

  /**
   * @param buffy
   */
  public void toStringBuilder(StringBuilder buffy) {
    int size = size();
    for (int i = 0; i < size; i++)
      buffy.append(token(i).fullString());
  }

  public StringBuilder toStringBuilder() {
    StringBuilder result = StringBuilderUtils.acquire();
    toStringBuilder(result);
    return result;
  }

  public ParsedURL getCommonHref() {
    return commonHref;
  }

  public void setCommonHref(ParsedURL commonHref) {
    this.commonHref = commonHref;
  }

  /**
   * An HTML representation of this MediaElement, suitable for drag and drop, copy and paste, ....
   *
   * @return HTML String.
   */
  public String toHTML() {
    int size = size();
    StringBuilder buffy = new StringBuilder(size * CHARS_PER_TOKEN);
    ParsedURL currentPURL = null;
    for (int i = 0; i < size; i++) {
      TextToken token = token(i);
      ParsedURL tokenPURL = token.getHref();
      if (currentPURL != tokenPURL) { // the hyperlink changes here
        if (currentPURL != null)
          buffy.append("</a>"); // close the previous hyperlink
        if (tokenPURL != null)
          buffy.append("<a href=\"").append(tokenPURL.toString()).append("\">"); // open a new one
        currentPURL = tokenPURL; // remember the href now in effect
      }
      buffy.append(token.fullString());
    }
    // close hyperlink if still open
    if (currentPURL != null)
      buffy.append("</a>");
    return buffy.toString();
  }
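  // Unlike emitHtml(), toHTML() appends each token's fullString() (delimiters included) and keeps
  // a single anchor open across consecutive tokens that share the same ParsedURL instance,
  // closing it only when the href changes or the chunk ends.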
  public ScalarType scalarType() {
    return scalarType;
  }

  /**
   * Clear data structures and references, to enable garbage collection of resources associated
   * with this.
   */
  public void recycle() {
    if (!recycled) {
      recycled = true;
      if (namedStyle != null)
        namedStyle.recycle();
      namedStyle = null;
      string = null;
      super.recycle();
      int last = size() - 1;
      for (int i = last; i >= 0; i--) {
        T tt = remove(i);
        tt.recycle();
      }
    }
  }

  public NamedStyle getNamedStyle() {
    return namedStyle;
  }

  public void setNamedStyle(NamedStyle style) {
    this.namedStyle = style;
  }

  public String getStyleName() {
    return styleName;
  }

  public void setStyleName(String styleName) {
    this.styleName = styleName;
  }

  // test fixture: apparently text extracted from a PDF, kept verbatim with its encoding artifacts
  static final String TEST_STRING = "Querying Web Metadata: Native Score\nManagement and Text Support\nin Databases\nG\n�\nULTEKIN\n�\nOZSOYO\n?\nGLU\nCase Western Reserve University\nISMAIL SENG\n�\nOR ALTING\n�\nOVDE\nBilkent Universit";

  public T get(int i) {
    return tokens == null ? null : tokens.get(i);
  }

  public int size() {
    return tokens == null ? 0 : tokens.size();
  }

  public boolean add(T token) {
    return (tokens == null) ? false : tokens.add(token);
  }

  public void add(int index, T token) {
    if (tokens != null)
      tokens.add(index, token);
  }

  abstract public Iterator<T> iterator();

  public T remove(int i) {
    return tokens == null ? null : tokens.remove(i);
  }

  public boolean isEmpty() {
    return size() == 0;
  }

  public void clear() {
    if (tokens != null)
      tokens.clear();
  }

  /**
   * Calculates the weight of this text chunk by summing the weights of all the terms it contains.
   *
   * @return sum of the weights of all terms in this chunk.
   */
  protected final float getWeight() {
    float weight = 0;
    for (int i = 0; i < this.size(); i++) {
      T cfTextToken = this.get(i);
      String tokenString = cfTextToken.getString();
      if (tokenString.length() > 1) { // ignore 1-character strings; they are not valuable
        Term xterm = cfTextToken.xterm();
        double w = xterm.idf();
        weight += w;
      }
    }
    return weight;
  }

  public final float getAvgWeight() {
    int size = size();
    if (size == 0)
      return 0;
    return getWeight() / size;
  }
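  // Worked example with hypothetical idf values: for the tokens "the" (idf 0.1), "x" (length 1,
  // so skipped), and "ecosystem" (idf 2.4), getWeight() = 0.1 + 2.4 = 2.5 and getAvgWeight() =
  // 2.5 / 3, since the average divides by size() -- all tokens, not just the weighted ones.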
  /**
   * Returns the subchunk of this chunk that runs from the <code>begin</code>th word to the
   * <code>end</code>th word, dropping the most obvious stop words at either boundary.
   *
   * @param begin
   *          the index in this chunk of the first word to be in the subchunk
   * @param end
   *          the index in this chunk of the last word to be in the subchunk
   * @return a subchunk of this chunk
   */
  public final TextChunkBase<T> getSubchunk(int begin, int end) {
    TextChunkBase<T> subchunk = this.newTextChunk();
    for (int i = begin; i <= end; i++) {
      // skip leading stop words: only add a stop word once the subchunk is non-empty
      if (!TermDictionary.mostObviousStopWordTerms.containsKey(removeNonAlpha(get(i).getString()))
          || !subchunk.isEmpty()) {
        subchunk.add(get(i));
      }
    }
    // drop trailing stop words
    int lastI = subchunk.size() - 1;
    while (lastI >= 0
        && TermDictionary.mostObviousStopWordTerms
            .containsKey(removeNonAlpha(subchunk.get(lastI).getString()))) {
      subchunk.remove(lastI);
      lastI--;
    }
    return subchunk;
  }

  private static final String removeNonAlpha(String s) {
    StringBuilder buff = sbp.acquire();
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (Character.isLetter(c))
        buff.append(Character.toLowerCase(c));
    }
    return sbp.releaseAndGetString(buff);
  }

  /**
   * Retrieves the sentence from this chunk with the highest average term weight.
   *
   * @return the sentence with the highest average term weight, or this whole chunk if no sentence
   *         survives filtering.
   */
  public final TextChunkBase<T> getSentence() {
    List<TextChunkBase<T>> sentences = new LinkedList<TextChunkBase<T>>();
    for (int i = 0; i < this.size();) {
      TextChunkBase<T> sentence = newTextChunk();
      int j = 0;
      boolean sentenceHasEmailAddr = false;
      // cleaned-up by andruid, jon, sashi 5/20/2010
      while ((i + j) < size()) {
        T token = get(i + j);
        // look out for email addresses; discard any sentence that seems to contain one
        if (token.contains('@'))
          sentenceHasEmailAddr = true;
        if (!sentenceHasEmailAddr)
          sentence.add(token);
        if (token.endsWithTerminal())
          break;
        j++;
      }
      i += j + 1;
      if (!sentenceHasEmailAddr && sentence.size() > 0)
        sentences.add(sentence);
    }
    if (sentences.size() == 0)
      return this;
    int maxI = 0;
    float maxVal = sentences.get(0).getAvgWeight();
    for (int i = 1; i < sentences.size(); i++) {
      float val = sentences.get(i).getAvgWeight();
      if (val > maxVal) {
        maxVal = val;
        maxI = i;
      }
    }
    return sentences.get(maxI);
  }
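  // Sketch of getSentence(): the chunk is split into sentences at tokens whose endsWithTerminal()
  // is true (presumably sentence-ending punctuation); sentences containing '@' (likely email
  // addresses) are discarded; the survivor with the highest getAvgWeight() wins, and the whole
  // chunk is returned if no sentence survives.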
  /**
   * Trims this phat chunk down to a skinny surrogate for visualization.
   *
   * @param semanticText
   *          If true, increase the maximum size of the text surrogate; set true for metaMetaData
   *          semantic action text, false for all else.
   * @return The skinny chunk, a short phrase from a larger context, which we may show to the user.
   */
  public TextChunkBase<T> trimPhatChunk(boolean semanticText) {
    // Set the text length maximum and minimum bounds from prefs
    float modifier = Pref.lookupFloat("text_length_modifier", 1);
    int maxLength = Math.round(MAX_WORDS * modifier);
    int minLength = Math.round(MIN_WORDS * modifier);

    // Find the sentence in the context with the highest average weight
    TextChunkBase<T> sentence = this.getSentence();
    int sentenceSize = sentence.size();
    int sizeIncrease = (semanticText) ? 3 : 0;

    if (sentenceSize > (maxLength + sizeIncrease)) {
      // Shorten the sentence by examining all contiguous sub-sentences between minLength and
      // (maxLength + sizeIncrease) words. The sub-sentence with the highest average weight wins.
      TextChunkBase<T> maxChunk = null;
      float maxVal = Float.NEGATIVE_INFINITY;
      for (int i = 0; i <= (sentenceSize - minLength); i++) {
        for (int j = (minLength - 1); j < (maxLength + sizeIncrease); j++) {
          if ((i + j) >= sentenceSize)
            break;
          TextChunkBase<T> chunk = sentence.getSubchunk(i, i + j);
          float val = chunk.getAvgWeight();
          if (val > maxVal) {
            maxVal = val;
            maxChunk = chunk;
          }
        }
      }
      if (maxChunk != null && maxChunk.size() > 0)
        maxChunk.get(0).setDelimsBefore("");
      return maxChunk;
    }

    // Remove the most obvious stop words from the front and back of the sentence
    sentence = sentence.getSubchunk(0, sentenceSize - 1);
    if (sentence.size() > 0)
      sentence.get(0).setDelimsBefore("");
    return sentence;
  }

  public long getOrmId() {
    return ormId;
  }

  public void setOrmId(long ormId) {
    this.ormId = ormId;
  }

  public List<T> getTokens() {
    return tokens;
  }

  public void setTokens(List<T> tokens) {
    this.tokens = tokens;
  }

  public float getNonStopIndex() {
    return nonStopIndex;
  }

  public void setNonStopIndex(float nonStopIndex) {
    this.nonStopIndex = nonStopIndex;
  }
}