/*
 * Copyright 1996-2002 by Andruid Kerne. All rights reserved. CONFIDENTIAL. Use is subject to
 * license terms.
 */
package ecologylab.bigsemantics.model;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import ecologylab.appframework.types.prefs.Pref;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.model.text.Term;
import ecologylab.bigsemantics.model.text.TermDictionary;
import ecologylab.generic.StringBuilderPool;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ElementState;
import ecologylab.serialization.annotations.simpl_collection;
import ecologylab.serialization.annotations.simpl_composite;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_nowrap;
import ecologylab.serialization.annotations.simpl_other_tags;
import ecologylab.serialization.annotations.simpl_scalar;
import ecologylab.serialization.annotations.simpl_scope;
import ecologylab.serialization.types.FundamentalTypes;
import ecologylab.serialization.types.ScalarType;
import ecologylab.textformat.NamedStyle;

/**
 * A text chunk from an HTML page, delimited by markup or a size threshold, and composed of an
 * ordered collection of {@link TextToken TextToken}s.
 */
@simpl_inherit
public abstract class TextChunkBase<T extends TextToken> extends ElementState
    implements Iterable<T> {

  @simpl_collection
  @simpl_scope(TextTokenTranslations.TEXT_TOKEN_SCOPE_NAME)
  @simpl_nowrap
  protected List<T> tokens;

  /**
   * Named style for this text chunk. Defaults to an anonymous style.
   */
  @simpl_composite
  @simpl_other_tags({ "anon_style" })
  protected NamedStyle namedStyle = new NamedStyle(DEFAULT_POINT_SIZE);

  /**
   * Current style name. Either this or the anonymous style will be null, so that only one of the
   * two is serialized to XML.
   */
  @simpl_scalar
  protected String styleName = null;

  @simpl_scalar
  protected ParsedURL commonHref = null;

  @simpl_scalar
  protected float nonStopIndex;

  private boolean recycled;

  private ScalarType scalarType = null;

  /**
   * Used by the ORM layer as the database-generated surrogate id.
   */
  private long ormId;

  public static final int DEFAULT_POINT_SIZE = 21;

  /** Estimate used for StringBuffer allocation. */
  public static int CHARS_PER_TOKEN = 12;

  public static final int LEFT = 0;
  public static final int CENTER = 1;
  public static final int RIGHT = 2;

  /**
   * The maximum number of words for a text surrogate.
   */
  public static final int MAX_WORDS = 9;

  /**
   * The minimum number of words for a text surrogate.
   */
  public static final int MIN_WORDS = 5;

  private static StringBuilderPool sbp = new StringBuilderPool(25);

  static HashMap<String, String[]> garbageFilterMap = new HashMap<String, String[]>();

  static final String[][] garbageFilterStrings = {
    { "all", "rights", "reserved" }, { "is", "a", "trademark" }, { "copyright" },
    { "last", "updated" }, { "you", "searched" }, { "email", "inquiries" },
    { "best", "viewed", "with" }, { "search", "for", "more" }, { "password" },
    { "last", "modified" }, { "posted", "at" }, { "subscriber", "id" }, { "terms", "under" },
    { "text", "only" }, { "text", "version" }, { "search", "results" }, { "hotbot", "results" },
    { "disclaimer" }, { "see", "results", "from" }, { "altavista", "found" },
    { "results", "for" }, { "hits", "since" }, { "visitor", "number" }, { "error", "occurred" },
    { "support", "frames" }, { "error", "404" }, { "found", "error" }, { "contact", "us" },
    { "slide", "shows" }, { "see", "sample" }, { "special", "offer" }, { "privacy", "policy" },
    { "license", "agreement" }, { "terms", "of", "use" }, { "sign", "up" }, { "sign", "in" },
    { "sign", "off" }, { "ameritrade" },
  };

  static {
    for (String[] thatFilter : garbageFilterStrings)
      garbageFilterMap.put(thatFilter[0], thatFilter);
  }
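  // Example of how this table drives isGarbage() (defined below): a chunk beginning
  // "Terms of use apply" is flagged, because "terms" keys the { "terms", "of", "use" } entry in
  // garbageFilterMap, and the two tokens that follow contain "of" and "use".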
  /**
   * Empty constructor for opening in XML translation.
   */
  protected TextChunkBase() {
    this(false);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ParsedURL commonHref) {
    this(doUnderline);
    this.commonHref = commonHref;
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ParsedURL commonHref, int size, int faceIndex,
                          int fontStyle, int alignment) {
    this(doUnderline);
    this.commonHref = commonHref;
    getNamedStyle().setFontSize(size);
    getNamedStyle().setFaceIndex(faceIndex);
    getNamedStyle().setFontStyle(fontStyle);
    getNamedStyle().setAlignment(alignment);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline) {
    this(doUnderline, FundamentalTypes.STRING_TYPE);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderline, ScalarType scalarType) {
    getNamedStyle().setUnderline(doUnderline);
    this.scalarType = scalarType;
    this.tokens = new ArrayList<T>();
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(boolean doUnderlineArg, CharSequence untokenized) {
    this(doUnderlineArg);
    tokenize(untokenized, scalarType);
  }

  /**
   * @param doUnderlineArg
   * @param untokenized
   * @param scalarType
   *          supplies the tokenizer pattern; its allowDelimitersInTokens() determines whether the
   *          delimiters are kept in the delimsBefore of each token.
   */
  protected TextChunkBase(boolean doUnderlineArg, CharSequence untokenized, ScalarType scalarType) {
    this(doUnderlineArg, scalarType);
    tokenize(untokenized, scalarType);
  }

  /**
   * A text chunk from an HTML page; delimited by markup, or a size threshold; composed of an
   * ordered collection of {@link TextToken TextToken}s.
   */
  protected TextChunkBase(TextChunkBase<T> copyChunk) {
    this(copyChunk.namedStyle.getUnderline());
    int size = copyChunk.size();
    for (int i = 0; i < size; i++) {
      T newToken = newToken(copyChunk.token(i));
      add(newToken);
    }
  }

  /**
   * Factory method to call the correct constructor.
   *
   * @param doUnderlineArg
   * @param untokenized
   * @param scalarType
   * @return
   */
  public abstract TextChunkBase<T> newTextChunk(boolean doUnderlineArg, CharSequence untokenized,
                                                ScalarType scalarType);

  public abstract TextChunkBase<T> newTextChunk();

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param string
   * @param delims
   * @param href
   * @return
   */
  abstract public T newToken(String string, String delims, ParsedURL href);

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param string
   * @param delims
   * @param href
   * @param style
   * @return
   */
  abstract public T newToken(String string, String delims, ParsedURL href, int style, int fontSize,
                             int faceIndex);

  /**
   * Create a new constituent TextToken of the correct subtype.
   *
   * @param prevToken
   * @return
   */
  abstract public T newToken(TextToken prevToken);

  /**
   * Tokenizes a string, using the tokenizer pattern supplied by the given ScalarType.
   *
   * @param untokenized
   * @param scalarType
   */
  protected void tokenize(CharSequence untokenized, ScalarType scalarType) {
    if (scalarType == null) {
      error("tokenize() scalarType==null; untokenized = " + untokenized);
      return; // without a ScalarType there is no tokenizer pattern to use
    }
    if (untokenized == null)
      return;
    Pattern pattern = scalarType.delimitersTokenizer();
    Matcher matcher = pattern.matcher(untokenized);
    String delimsBefore = "";
    final boolean allowDelimitersInTokens = scalarType.allowDelimitersInTokens();
    boolean first = true;
    while (matcher.find()) {
      String group1 = matcher.group(1);
      if (group1 != null && group1.contains("\n"))
        group1 = "\n"; // collapse any delimiter run that spans a line break to a single newline
      if (allowDelimitersInTokens && !first)
        delimsBefore = group1;
      T textToken = newToken(matcher.group(2), delimsBefore, null);
      addTextToken(textToken);
      if (!allowDelimitersInTokens)
        delimsBefore = scalarType.primaryDelimiter();
      first = false;
    }
  }
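  // Sketch of the tokenize() contract, assuming a ScalarType whose delimitersTokenizer() pattern
  // captures (delimiters)(token) as groups 1 and 2:
  //   tokenize("one  two\nthree", scalarType);
  // would yield the tokens "one", "two", "three". Each token's delimsBefore holds either the
  // actual delimiter run before it (collapsed to "\n" when it spans a line break) or the
  // ScalarType's primaryDelimiter(), depending on allowDelimitersInTokens(); the first token
  // always gets an empty delimsBefore.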
  public void initDoubleUnderline() {
    int hrefInRow = 0;
    for (int i = 0; i < this.size(); i++) {
      T token = this.get(i);
      if (token.getHref() != null) {
        if (hrefInRow == 0)
          token.setUnderline(TextToken.DOUBLE_UNDERLINE_STRING);
        else
          token.setUnderline(TextToken.DOUBLE_UNDERLINE_ENTIRE_TOKEN);
        hrefInRow++;
      } else {
        hrefInRow = 0;
      }
    }
  }

  /**
   * Tokenizes a string and appends the resulting tokens to the end of this TextChunkBase.
   *
   * @param string
   */
  public void appendString(CharSequence string) {
    tokenize(string, scalarType);
  }

  /**
   * Appends the tokens of another TextChunkBase to the end of this one.
   *
   * @param textChunk
   */
  public void appendTextChunk(TextChunkBase<T> textChunk) {
    for (T textToken : textChunk)
      addTextToken(textToken);
  }

  /** Should be called addTextToken(). */
  public void add(String string, ParsedURL href) {
    T token = newToken(string, "", href);
    add(token);
  }

  public void add(String string, String delims, ParsedURL href) {
    T token = newToken(string, delims, href);
    add(token);
  }

  public void addTextToken(T textToken) {
    if (textToken == null)
      return;
    add(textToken);
  }

  public void endLink() {
    ((TextToken) lastElement()).endOfLink = true;
  }

  /**
   * @return the length of this TextChunk in characters, whitespace included.
   */
  public int length() {
    int sum = 0;
    int n = size();
    for (int i = 0; i != n; i++)
      sum += token(i).fullString().length();
    return sum;
  }

  /**
   * Returns the ith TextToken, or null if i is out of range.
   *
   * @param i
   * @return
   */
  public T token(int i) {
    return (i >= size()) ? null : get(i);
  }

  /**
   * Returns the ith TextToken.
   *
   * @param i
   * @return
   */
  public TextToken textToken(int i) {
    return token(i);
  }

  /**
   * Returns the string for token i in the chunk.
   *
   * @param i
   * @return
   */
  public String string(int i) {
    TextToken token = token(i);
    return (token == null) ? null : token.fullString();
  }

  public String lc(int i) {
    TextToken token = token(i);
    return (token == null) ? null : token.lc();
  }

  public String string() {
    return stringBuilder().toString();
  }

  public StringBuilder stringBuilder() {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i != size(); i++) {
      TextToken t = token(i);
      sb.append(t.getDelimsBefore());
      sb.append(t.getString());
    }
    return sb;
  }

  public boolean empty() {
    // return (size() == 0) || token(0).empty();
    return (size() == 0 || string().trim().equals(""));
  }

  static final String xmlCloseAnchor = "</a>";
  static final String regularCloseAnchor = "</a>";

  /**
   * The equivalent of emitHtml(false).
   *
   * @return An HTML representation of this TextChunkBase.
   */
  public String emitHtml() {
    return emitHtml(false);
  }

  /**
   * @param xmlBrackets
   *          If true, generate HTML for inclusion within XML.
   * @return An HTML representation of this TextChunkBase.
   */
  public String emitHtml(boolean xmlBrackets) {
    String result = "";
    boolean inHref = false;
    String open, close, closeAnchor;
    // NOTE: both branches currently produce identical markup.
    if (xmlBrackets) {
      open = "<";
      close = ">";
      closeAnchor = xmlCloseAnchor;
    } else {
      open = "<";
      close = ">";
      closeAnchor = regularCloseAnchor;
    }
    for (int i = 0; i != size(); i++) {
      TextToken thatToken = token(i);
      ParsedURL href = thatToken.getHref();
      if (!inHref && (href != null)) {
        result += open + "a "
            + ecologylab.serialization.XMLTools.nameVal("href", href.url()) + close;
        inHref = true;
      }
      if (inHref && (href == null)) {
        // close the anchor before appending the first unlinked token
        result += closeAnchor;
        inHref = false;
      }
      result += thatToken.getDelimsBefore() + thatToken.getString() + " ";
    }
    if (inHref)
      result += closeAnchor;
    return result;
  }
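  // Example of emitHtml() output for the hypothetical tokens
  // [Visit][home(href=http://example.com)][now]:
  //   Visit <a href="http://example.com">home </a>now
  // An anchor opens at the first linked token and closes before the first unlinked one; a space
  // trails every token.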
  public T lastElement() {
    return (size() == 0) ? null : this.get(size() - 1);
  }

  public void removeElementAt(int i) {
    if (tokens != null)
      tokens.remove(i);
  }

  /**
   * @return true if this TextChunkBase looks like garbage.
   */
  public boolean isGarbage() {
    String s0;
    // prime the pump
    String s1 = lc(0);
    String s2 = lc(1);
    int n = size();
    for (int i = 0; i < n; i++) {
      // compare lower-case forms, so that filter matching ignores case
      s0 = s1;
      s1 = s2;
      s2 = lc(i + 2);
      String[] filterJ = garbageFilterMap.get(s0);
      if (filterJ != null) {
        int filterLength = filterJ.length;
        if ((filterLength < 2)
            || (StringTools.contains(s1, filterJ[1])
                && ((filterLength < 3) || StringTools.contains(s2, filterJ[2])))) {
          return true;
        }
      }
    }
    return false;
  }

  String string; // TODO -- invalidate this cached string when editing happens!

  public String toString() {
    StringBuilder buffy = toStringBuilder();
    String result = StringTools.toString(buffy);
    StringBuilderUtils.release(buffy);
    return result;
  }

  /**
   * @param buffy
   */
  public void toStringBuilder(StringBuilder buffy) {
    int size = size();
    for (int i = 0; i < size; i++)
      buffy.append(token(i).fullString());
  }

  public StringBuilder toStringBuilder() {
    StringBuilder result = StringBuilderUtils.acquire();
    toStringBuilder(result);
    return result;
  }

  public ParsedURL getCommonHref() {
    return commonHref;
  }

  public void setCommonHref(ParsedURL commonHref) {
    this.commonHref = commonHref;
  }

  /**
   * An HTML representation of this MediaElement, suitable for drag and drop, copy and paste, ....
   *
   * @return HTML String.
   */
  public String toHTML() {
    int size = size();
    StringBuilder buffy = new StringBuilder(size * CHARS_PER_TOKEN);
    ParsedURL currentPURL = null;
    for (int i = 0; i < size; i++) {
      TextToken token = token(i);
      ParsedURL tokenPURL = token.getHref();
      if (currentPURL != tokenPURL) { // the hyperlink changes here
        if (currentPURL != null)
          buffy.append("</a>"); // close the previous hyperlink
        if (tokenPURL != null)
          buffy.append("<a href=\"").append(tokenPURL.toString()).append("\">"); // open a new one
        currentPURL = tokenPURL; // remember the href now in effect
      }
      buffy.append(token.fullString());
    }
    // close hyperlink if still open
    if (currentPURL != null)
      buffy.append("</a>");
    return buffy.toString();
  }
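  // Unlike emitHtml(), toHTML() appends each token's fullString() (delimiters included) and keeps
  // a single anchor open across consecutive tokens that share the same ParsedURL instance,
  // closing it only when the href changes or the chunk ends.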
  public ScalarType scalarType() {
    return scalarType;
  }

  /**
   * Clear data structures and references, to enable garbage collection of resources associated
   * with this.
   */
  public void recycle() {
    if (!recycled) {
      recycled = true;
      if (namedStyle != null)
        namedStyle.recycle();
      namedStyle = null;
      string = null;
      super.recycle();
      int last = size() - 1;
      for (int i = last; i >= 0; i--) {
        T tt = remove(i);
        tt.recycle();
      }
    }
  }

  public NamedStyle getNamedStyle() {
    return namedStyle;
  }

  public void setNamedStyle(NamedStyle style) {
    this.namedStyle = style;
  }

  public String getStyleName() {
    return styleName;
  }

  public void setStyleName(String styleName) {
    this.styleName = styleName;
  }

  // test fixture: apparently text extracted from a PDF, kept verbatim with its encoding artifacts
  static final String TEST_STRING = "Querying Web Metadata: Native Score\nManagement and Text Support\nin Databases\nG\n�\nULTEKIN\n�\nOZSOYO\n?\nGLU\nCase Western Reserve University\nISMAIL SENG\n�\nOR ALTING\n�\nOVDE\nBilkent Universit";

  public T get(int i) {
    return tokens == null ? null : tokens.get(i);
  }

  public int size() {
    return tokens == null ? 0 : tokens.size();
  }

  public boolean add(T token) {
    return (tokens == null) ? false : tokens.add(token);
  }

  public void add(int index, T token) {
    if (tokens != null)
      tokens.add(index, token);
  }

  abstract public Iterator<T> iterator();

  public T remove(int i) {
    return tokens == null ? null : tokens.remove(i);
  }

  public boolean isEmpty() {
    return size() == 0;
  }

  public void clear() {
    if (tokens != null)
      tokens.clear();
  }

  /**
   * Calculates the weight of this text chunk by summing the weights of all the terms it contains.
   *
   * @return sum of the weights of all terms in this chunk.
   */
  protected final float getWeight() {
    float weight = 0;
    for (int i = 0; i < this.size(); i++) {
      T cfTextToken = this.get(i);
      String tokenString = cfTextToken.getString();
      if (tokenString.length() > 1) { // ignore 1-character strings; they are not valuable
        Term xterm = cfTextToken.xterm();
        double w = xterm.idf();
        weight += w;
      }
    }
    return weight;
  }

  public final float getAvgWeight() {
    int size = size();
    if (size == 0)
      return 0;
    return getWeight() / size;
  }
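  // Worked example with hypothetical idf values: for the tokens "the" (idf 0.1), "x" (length 1,
  // so skipped), and "ecosystem" (idf 2.4), getWeight() = 0.1 + 2.4 = 2.5 and getAvgWeight() =
  // 2.5 / 3, since the average divides by size() -- all tokens, not just the weighted ones.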
  /**
   * Returns the subchunk of this chunk that runs from the <code>begin</code>th word to the
   * <code>end</code>th word, dropping the most obvious stop words at either boundary.
   *
   * @param begin
   *          the index in this chunk of the first word to be in the subchunk
   * @param end
   *          the index in this chunk of the last word to be in the subchunk
   * @return a subchunk of this chunk
   */
  public final TextChunkBase<T> getSubchunk(int begin, int end) {
    TextChunkBase<T> subchunk = this.newTextChunk();
    for (int i = begin; i <= end; i++) {
      // skip leading stop words: only add a stop word once the subchunk is non-empty
      if (!TermDictionary.mostObviousStopWordTerms.containsKey(removeNonAlpha(get(i).getString()))
          || !subchunk.isEmpty()) {
        subchunk.add(get(i));
      }
    }
    // drop trailing stop words
    int lastI = subchunk.size() - 1;
    while (lastI >= 0
        && TermDictionary.mostObviousStopWordTerms
            .containsKey(removeNonAlpha(subchunk.get(lastI).getString()))) {
      subchunk.remove(lastI);
      lastI--;
    }
    return subchunk;
  }

  private static final String removeNonAlpha(String s) {
    StringBuilder buff = sbp.acquire();
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (Character.isLetter(c))
        buff.append(Character.toLowerCase(c));
    }
    return sbp.releaseAndGetString(buff);
  }

  /**
   * Retrieves the sentence from this chunk with the highest average term weight.
   *
   * @return the sentence with the highest average term weight, or this whole chunk if no sentence
   *         survives filtering.
   */
  public final TextChunkBase<T> getSentence() {
    List<TextChunkBase<T>> sentences = new LinkedList<TextChunkBase<T>>();
    for (int i = 0; i < this.size();) {
      TextChunkBase<T> sentence = newTextChunk();
      int j = 0;
      boolean sentenceHasEmailAddr = false;
      // cleaned-up by andruid, jon, sashi 5/20/2010
      while ((i + j) < size()) {
        T token = get(i + j);
        // look out for email addresses; discard any sentence that seems to contain one
        if (token.contains('@'))
          sentenceHasEmailAddr = true;
        if (!sentenceHasEmailAddr)
          sentence.add(token);
        if (token.endsWithTerminal())
          break;
        j++;
      }
      i += j + 1;
      if (!sentenceHasEmailAddr && sentence.size() > 0)
        sentences.add(sentence);
    }
    if (sentences.size() == 0)
      return this;
    int maxI = 0;
    float maxVal = sentences.get(0).getAvgWeight();
    for (int i = 1; i < sentences.size(); i++) {
      float val = sentences.get(i).getAvgWeight();
      if (val > maxVal) {
        maxVal = val;
        maxI = i;
      }
    }
    return sentences.get(maxI);
  }
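  // Sketch of getSentence(): the chunk is split into sentences at tokens whose endsWithTerminal()
  // is true (presumably sentence-ending punctuation); sentences containing '@' (likely email
  // addresses) are discarded; the survivor with the highest getAvgWeight() wins, and the whole
  // chunk is returned if no sentence survives.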
  /**
   * Trims this phat chunk down to a skinny surrogate for visualization.
   *
   * @param semanticText
   *          If true, increase the maximum size of the text surrogate; set true for metaMetaData
   *          semantic action text, false for all else.
   * @return The skinny chunk, a short phrase from a larger context, which we may show to the user.
   */
  public TextChunkBase<T> trimPhatChunk(boolean semanticText) {
    // Set the text length maximum and minimum bounds from prefs
    float modifier = Pref.lookupFloat("text_length_modifier", 1);
    int maxLength = Math.round(MAX_WORDS * modifier);
    int minLength = Math.round(MIN_WORDS * modifier);

    // Find the sentence in the context with the highest average weight
    TextChunkBase<T> sentence = this.getSentence();
    int sentenceSize = sentence.size();
    int sizeIncrease = (semanticText) ? 3 : 0;

    if (sentenceSize > (maxLength + sizeIncrease)) {
      // Shorten the sentence by examining all contiguous sub-sentences between minLength and
      // (maxLength + sizeIncrease) words. The sub-sentence with the highest average weight wins.
      TextChunkBase<T> maxChunk = null;
      float maxVal = Float.NEGATIVE_INFINITY;
      for (int i = 0; i <= (sentenceSize - minLength); i++) {
        for (int j = (minLength - 1); j < (maxLength + sizeIncrease); j++) {
          if ((i + j) >= sentenceSize)
            break;
          TextChunkBase<T> chunk = sentence.getSubchunk(i, i + j);
          float val = chunk.getAvgWeight();
          if (val > maxVal) {
            maxVal = val;
            maxChunk = chunk;
          }
        }
      }
      if (maxChunk != null && maxChunk.size() > 0)
        maxChunk.get(0).setDelimsBefore("");
      return maxChunk;
    }

    // Remove the most obvious stop words from the front and back of the sentence
    sentence = sentence.getSubchunk(0, sentenceSize - 1);
    if (sentence.size() > 0)
      sentence.get(0).setDelimsBefore("");
    return sentence;
  }

  public long getOrmId() {
    return ormId;
  }

  public void setOrmId(long ormId) {
    this.ormId = ormId;
  }

  public List<T> getTokens() {
    return tokens;
  }

  public void setTokens(List<T> tokens) {
    this.tokens = tokens;
  }

  public float getNonStopIndex() {
    return nonStopIndex;
  }

  public void setNonStopIndex(float nonStopIndex) {
    this.nonStopIndex = nonStopIndex;
  }
}