/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package com.xpn.xwiki.plugin.autotag;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import com.xpn.xwiki.XWikiContext;
import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
import com.xpn.xwiki.plugin.XWikiPluginInterface;
/**
* Plugin which extracts a set of tags from a text.
*
* @version $Id: e30fac037d84bc9c172fff38493f900354bdcc6b $
* @deprecated the plugin technology is deprecated, consider rewriting as components
*/
@Deprecated
public class AutoTagPlugin extends XWikiDefaultPlugin implements XWikiPluginInterface
{
    /** Identifier for the French language. */
    public static final int LANG_FRENCH = 0;

    /** Identifier for the English language. */
    public static final int LANG_ENGLISH = 1;

    /**
     * The name of the plugin, which is used for retrieving the plugin from the plugin manager (and from the public
     * {@code $xwiki.get()} API).
     */
    private static final String PLUGIN_NAME = "autotag";

    /**
     * Special characters that will cause a token word to be ignored when that token contains one of these characters.
     */
    private static final Pattern SPECIAL_CHARS = Pattern.compile("<|>|=|/|\"|\u0093");

    /** Needed to make checkstyle pass. */
    private static final String A = "a";

    /** Needed to make checkstyle pass. */
    private static final String ON = "on";

    /** French words that should be ignored since they don't add any value, they're very common words. */
    private static final String[] FRENCH_STOP_WORDS = {
        A, "afin", "ai", "ainsi", "apr\u00e8s", "attendu", "au", "aujourd", "auquel", "aussi",
        "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
        "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
        "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
        "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
        "dedans", "dehors", "del\u00e0", "depuis", "derri\u00e8re", "des", "d\u00e9sormais",
        "desquelles", "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers",
        "diverse", "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "d\u00e8s",
        "elle", "elles", "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux",
        "except\u00e9", "hormis", "hors", "h\u00e9las", "hui", "il", "ils", "j", "je", "jusqu",
        "jusque", "l", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur",
        "leurs", "lorsque", "lui", "l\u00e0", "ma", "mais", "malgr\u00e9", "me", "merci", "mes",
        "mien", "mienne", "miennes", "miens", "moi", "moins", "mon", "moyennant", "m\u00eame",
        "m\u00eames", "n", "ne", "ni", "non", "nos", "notre", "nous", "n\u00e9anmoins",
        "n\u00f4tre", "n\u00f4tres", ON, "ont", "ou", "outre", "o\u00f9", "par", "parmi",
        "partant", "pas", "pass\u00e9", "pendant", "plein", "plus", "plusieurs", "pour",
        "pourquoi", "proche", "pr\u00e8s", "puisque", "qu", "quand", "que", "quel", "quelle",
        "quelles", "quels", "qui", "quoi", "quoique", "revoici", "revoil\u00e0", "s", "sa",
        "sans", "sauf", "se", "selon", "seront", "ses", "si", "sien", "sienne", "siennes",
        "siens", "sinon", "soi", "soit", "son", "sont", "sous", "suivant", "sur", "ta", "te",
        "tes", "tien", "tienne", "tiennes", "tiens", "toi", "ton", "tous", "tout", "toute",
        "toutes", "tu", "un", "une", "va", "vers", "voici", "voil\u00e0", "vos", "votre", "vous",
        "vu", "v\u00f4tre", "v\u00f4tres", "y", "\u00e0", "\u00e7a", "\u00e8s", "\u00e9t\u00e9",
        "\u00eatre", "\u00f4", "avez", "parce", "suis"};

    /** English words that should be ignored since they don't add any value, they're very common words. */
    private static final String[] ENGLISH_STOP_WORDS = {
        "the", "of", "and", A, "to", "in", "is", "you", "that", "it", "he", "was", "for", ON,
        "are", "as", "with", "his", "they", "I", "at", "be", "this", "have", "from", "or", "one",
        "had", "by", "but", "not", "what", "all", "were", "we", "when", "your", "can", "said",
        "there", "use", "an", "each", "which", "she", "do", "how", "their", "if", "will", "up",
        "other", "about", "out", "many", "then", "them", "these", "so", "some", "her", "would",
        "make", "like", "him", "into", "time", "has", "look", "two", "more", "go", "see", "no",
        "way", "could", "my", "than", "first", "been", "call", "who", "its", "now", "find", "long",
        "down", "day", "did", "get", "come", "may"};

    /** The list of words (or, more generally, tokens) that should be ignored. */
    private List<String> ignoreList = Collections.synchronizedList(new ArrayList<String>());

    /** The list of words that should never be ignored, even if they're placed in {@link #ignoreList}. */
    private List<String> dontignoreList = Collections.synchronizedList(new ArrayList<String>());

    /** The maximum number of tags to generate in the tag cloud. */
    private int maximumNumberOfTags = 100;

    /** The maximum size of a tag, corresponding to the most common tag in the document list. */
    private int maxTagSize = 64;

    /** The minimum size of a tag, corresponding to the least common tag that gets included in the tag cloud. */
    private int minTagSize = 12;

    /**
     * The mandatory plugin constructor, this is the method called (through reflection) by the plugin manager.
     *
     * @param name the plugin name, usually ignored, since plugins have a fixed name
     * @param className the name of this class, ignored
     * @param context the current request context
     */
    public AutoTagPlugin(String name, String className, XWikiContext context)
    {
        super(name, className, context);
        init(context);
    }

    @Override
    public void init(XWikiContext context)
    {
        super.init(context);
    }

    @Override
    public String getName()
    {
        return PLUGIN_NAME;
    }

    @Override
    public AutoTagPluginAPI getPluginApi(XWikiPluginInterface plugin, XWikiContext context)
    {
        return new AutoTagPluginAPI((AutoTagPlugin) plugin, context);
    }

    /**
     * Analyze a piece of text, and extract the most common words into a "tag cloud". In detail, this splits the text
     * into tokens, counts how many times each token appears in the text, removes the "stop-words", joins together words
     * from the same root (stemming), and prepares an HTML tag cloud which can be printed in the response.
     *
     * @param text the text to analyze
     * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
     * @return the resulting TagCloud with all the analyzed data, including the HTML tag cloud
     */
    public TagCloud generateTagCloud(String text, int lang)
    {
        TagCloud tagcloud = countWords(text, lang);
        calculateTags(tagcloud);
        return tagcloud;
    }

    /**
     * Analyze a piece of text, splitting it into individual words, along with their frequencies. In detail, this splits
     * the text into tokens, counts how many times each token appears in the text, removes the "stop-words", and joins
     * together words from the same root (stemming). {@link #generateTagCloud(String, int)} also prepares an HTML tag
     * cloud which can be printed in the response.
     *
     * @param text the text to analyze
     * @param lang the language, {@code 0} for French or {@code 1} for English
     * @return the resulting TagCloud with all the analyzed data, except the HTML tag cloud
     */
    public TagCloud countWords(String text, int lang)
    {
        TagCloud tagcloud = new TagCloud();
        tagcloud.setText(text);
        // Each step below stores its result in the TagCloud instance, so the next step can pick it up
        splitWords(tagcloud);
        countWords(tagcloud);
        clearStopWords(tagcloud, lang);
        stemmer(tagcloud, lang);
        return tagcloud;
    }

    // Utility methods

    /**
     * Return a sorted copy of a set.
     *
     * @param <T> the type of the items in the set
     * @param oSet the set containing the values to sort; it is not affected in any way by this method
     * @return a new sorted set containing all the values in the input set
     */
    public static <T extends Comparable<T>> SortedSet<T> sortSet(Set<T> oSet)
    {
        return new TreeSet<T>(oSet);
    }

    /**
     * Return a copy of a map, sorted in ascending order of their values. Entries with equal values keep the relative
     * order in which the source map iterates them. The values must be mutually {@link Comparable}, otherwise a
     * {@code ClassCastException} is thrown at runtime.
     *
     * @param <K> the type of the map keys
     * @param <V> the type of the map values
     * @param hmap the map containing the entries to sort; it is not affected in any way by this method
     * @return a new sorted map containing all the entries in the input map
     */
    public static <K, V> Map<K, V> sortMap(Map<K, V> hmap)
    {
        // The distinct values, in ascending natural order
        Set<V> sortedValues = new TreeSet<V>(hmap.values());
        Map<K, V> sorted = new LinkedHashMap<K, V>();
        // For each value, in ascending order, copy over all the entries holding that value
        for (V value : sortedValues) {
            for (Entry<K, V> entry : hmap.entrySet()) {
                if (value.equals(entry.getValue())) {
                    sorted.put(entry.getKey(), value);
                }
            }
        }
        return sorted;
    }

    /**
     * Get the identifier corresponding to the given two-letter country code. Currently the only supported values are
     * "en" and "fr".
     *
     * @param lang the two-letter ISO 3166-1 alpha-2 code of a country
     * @return {@code 0} for French ("fr") or {@code 1} for English ("en") and all other passed values
     */
    public int getLanguageConstant(String lang)
    {
        if (lang.trim().equalsIgnoreCase("fr")) {
            return AutoTagPlugin.LANG_FRENCH;
        }
        // default English
        return AutoTagPlugin.LANG_ENGLISH;
    }

    /**
     * Split the text into tokens. Newlines, spaces, tabs, comma, dot, semi-colon, colon, exclamation, question mark,
     * and apostrophe are considered separators.
     *
     * @param tagCloud the instance to process
     * @return the resulting list of tokens, which is also stored in the instance {@link TagCloud#getWordList()
     *         TagCloud}
     */
    private String[] splitWords(TagCloud tagCloud)
    {
        // Turn line breaks and apostrophes into plain spaces before tokenizing;
        // \u0092 is the Windows-1252 "right single quotation mark" often present in copy-pasted text
        String text = tagCloud.getText()
            .replace("\n", " ")
            .replace("\r", " ")
            .replace("'", " ")
            .replace("\u0092", " ")
            .toLowerCase();
        String[] words = text.split("[\\s,.;:!\\?]+");
        tagCloud.setWordList(words);
        return words;
    }

    /**
     * Count all the appearances of each token extracted from the text. This method must be called after
     * {@link #splitWords(TagCloud)}.
     *
     * @param tagCloud the instance to process
     * @return the resulting map of {@code token->number of appearances} count for each token present in the text, which
     *         is also stored in the instance {@link TagCloud#getCountedWordMap() TagCloud}
     */
    private Map<String, Integer> countWords(TagCloud tagCloud)
    {
        Map<String, Integer> wordsCount = new HashMap<String, Integer>();
        for (String word : tagCloud.getWordList()) {
            // Single lookup per token: a missing entry counts as zero occurrences so far
            Integer count = wordsCount.get(word);
            wordsCount.put(word, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
        }
        tagCloud.setCountedWordMap(wordsCount);
        return wordsCount;
    }

    /**
     * Remove "stop words", words that should be ignored since they don't add any value, they're very common. This
     * method must be called after {@link #countWords(TagCloud)}.
     *
     * @param tagCloud the instance to process
     * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
     * @return the resulting map of {@code token->number of appearances} count for each token present in the text, which
     *         is also stored in the instance {@link TagCloud#getCountedWordMap() TagCloud}
     */
    private Map<String, Integer> clearStopWords(TagCloud tagCloud, int lang)
    {
        Map<String, Integer> words = tagCloud.getCountedWordMap();
        String[] stopWordsArray = new String[0];
        switch (lang) {
            case LANG_ENGLISH:
                stopWordsArray = ENGLISH_STOP_WORDS;
                break;
            case LANG_FRENCH:
                stopWordsArray = FRENCH_STOP_WORDS;
                break;
            default:
                // nothing
                break;
        }
        // Built-in stop words are removed unconditionally, even if listed in dontignoreList
        for (String word : stopWordsArray) {
            words.remove(word);
        }
        // Collect tokens containing special characters first, since removing while iterating
        // over the keySet would throw a ConcurrentModificationException
        Set<String> ignored = new HashSet<String>();
        for (String word : words.keySet()) {
            if (SPECIAL_CHARS.matcher(word).find()) {
                ignored.add(word);
            }
        }
        // User-configured ignored words and special-character tokens are only removed
        // when not explicitly whitelisted in dontignoreList
        for (String word : this.ignoreList) {
            if (!this.dontignoreList.contains(word)) {
                words.remove(word);
            }
        }
        for (String word : ignored) {
            if (!this.dontignoreList.contains(word)) {
                words.remove(word);
            }
        }
        tagCloud.setCountedWordMap(words);
        return words;
    }

    /**
     * Group tokens based on their common stem. For example, "hand" and "hands" both refer to the same term, "hand",
     * thus they will be grouped together under the "hand" stem. This method must be called after
     * {@link #countWords(TagCloud)} or {@link #clearStopWords(TagCloud, int)}.
     *
     * @param tagCloud the instance to process
     * @param lang the language in which the text is written, {@code 0} for French or {@code 1} for English
     * @return the resulting list of token groups, which is also stored in the instance
     *         {@link TagCloud#getStemmedWordMap() TagCloud}
     */
    private Map<String, Map<String, Integer>> stemmer(TagCloud tagCloud, int lang)
    {
        Map<String, Map<String, Integer>> stemmedWordMap = new HashMap<String, Map<String, Integer>>();
        // FIXME: the lang parameter is currently ignored; only a French stemmer is available here,
        // so English text is also stemmed with the French rules. An English stemmer should be used
        // when lang is LANG_ENGLISH.
        FrenchStemmer stemmer = new FrenchStemmer();
        for (Entry<String, Integer> word : tagCloud.getCountedWordMap().entrySet()) {
            // Tokens of one or two letters are too short to make meaningful tags
            if (word.getKey().length() <= 2) {
                continue;
            }
            String stemmedWord = stemmer.stem(word.getKey());
            Map<String, Integer> group = stemmedWordMap.get(stemmedWord);
            if (group == null) {
                group = new HashMap<String, Integer>();
                stemmedWordMap.put(stemmedWord, group);
            }
            group.put(word.getKey(), word.getValue());
        }
        tagCloud.setStemmedWordMap(stemmedWordMap);
        return stemmedWordMap;
    }

    /**
     * Determine which are the most frequent {@link #maximumNumberOfTags} tokens and build a tag cloud using their
     * relative frequencies. This method must be called after {@link #stemmer(TagCloud, int)}.
     *
     * @param tagCloud the instance to process
     * @return the resulting set of tags, which is also stored in the instance {@link TagCloud#getTags() TagCloud}
     */
    private Set<Tag> calculateTags(TagCloud tagCloud)
    {
        Map<String, Map<String, Integer>> stemmedWords = tagCloud.getStemmedWordMap();
        Map<String, Integer> stemmedWordFreqMap = new HashMap<String, Integer>();
        // Determine the "lead" word for each stem as the most common token by comparing the frequency of each sub-token
        // Calculate the total frequency of each lead word as the sum of the frequencies of all tokens having that stem
        for (Entry<String, Map<String, Integer>> stemmedWord : stemmedWords.entrySet()) {
            Integer totalFrequency = Integer.valueOf(0);
            String leadWord = "";
            Integer leadFrequency = Integer.valueOf(0);
            Map<String, Integer> wordMap = stemmedWord.getValue();
            for (Entry<String, Integer> word : wordMap.entrySet()) {
                Integer frequency = word.getValue();
                totalFrequency = Integer.valueOf(frequency.intValue() + totalFrequency.intValue());
                if (frequency.intValue() > leadFrequency.intValue()) {
                    leadFrequency = word.getValue();
                    leadWord = word.getKey();
                }
            }
            stemmedWordFreqMap.put(leadWord, totalFrequency);
        }
        tagCloud.setStemmedWordFreqMap(stemmedWordFreqMap);
        return calculateTagSizes(tagCloud);
    }

    /**
     * Build a tag cloud using the relative frequencies of the selected tags. This method must be called by
     * {@link #calculateTags(TagCloud)}.
     *
     * @param tagCloud the instance to process
     * @return the resulting set of tags, which is also stored in the instance {@link TagCloud#getTags() TagCloud}
     */
    private Set<Tag> calculateTagSizes(TagCloud tagCloud)
    {
        Map<String, Integer> stemmedWordFreqMap = tagCloud.getStemmedWordFreqMap();
        // If there's no text, just use an empty set of tags and return
        if (stemmedWordFreqMap == null || stemmedWordFreqMap.isEmpty()) {
            tagCloud.setTags(new TreeSet<Tag>());
            return tagCloud.getTags();
        }
        // Order the entries by ascending frequency so that the most frequent tags are at the end
        Map<String, Integer> orderedMap = sortMap(stemmedWordFreqMap);
        List<Entry<String, Integer>> orderedEntries = new ArrayList<Entry<String, Integer>>(orderedMap.entrySet());
        // Keep only the (at most) maximumNumberOfTags most frequent tags, i.e. the tail of the ordered list.
        // The previous implementation kept the head, i.e. the *least* frequent tags, and one entry too many.
        int firstKept = Math.max(0, orderedEntries.size() - this.maximumNumberOfTags);
        Map<String, Integer> tagMap = new LinkedHashMap<String, Integer>();
        for (Entry<String, Integer> word : orderedEntries.subList(firstKept, orderedEntries.size())) {
            tagMap.put(word.getKey(), word.getValue());
        }
        // The retained frequencies, still in ascending order: first is the minimum, last is the maximum
        Integer[] freqs = tagMap.values().toArray(new Integer[0]);
        Integer minFreq = freqs[0];
        Integer maxFreq = freqs[freqs.length - 1];
        int ftot = 0;
        for (Integer f : freqs) {
            ftot += f.intValue();
        }
        SortedSet<Tag> tagSet = new TreeSet<Tag>();
        for (String tagName : sortSet(tagMap.keySet())) {
            long size = getTagSize(tagMap.get(tagName), maxFreq, minFreq, ftot);
            tagSet.add(new Tag(tagName, size));
        }
        tagCloud.setTags(tagSet);
        return tagSet;
    }

    /**
     * Get the size that corresponds to a given tag popularity, relative to all the other tag frequencies.
     *
     * @param tagOccurrences the number of occurrences of the tag
     * @param maxOccurrences the maximum number of occurrences among all tags
     * @param minOccurrences the minimum number of occurrences among all tags
     * @param totalOccurrences the total number of occurrences of all the tags
     * @return a number between {@link #minTagSize} and {@link #maxTagSize} corresponding to the relative popularity of
     *         this tag compared to all the other tags
     */
    private long getTagSize(double tagOccurrences, double maxOccurrences, double minOccurrences,
        double totalOccurrences)
    {
        // The number of available tag sizes
        int fontRange = this.maxTagSize - this.minTagSize;
        // tweak this if all the words seem too similar in size or extremely different
        // rely on the cumulative by x% (0 = 0%, 1 = 100%)
        double cumulativeImportance = 0.7;
        // sizes based on word's frequency vs total/cumulative frequency
        double sumpx = ((fontRange * cumulativeImportance) + 1) * (fontRange * cumulativeImportance) / 2;
        double px = tagOccurrences / totalOccurrences * sumpx;
        // sizes based on word's frequency deviation from max/min frequencies
        // (the max(1, range) guard avoids a division by zero when all tags have the same frequency)
        px += Math.pow((tagOccurrences - minOccurrences)
            / (1 > maxOccurrences - minOccurrences ? 1 : maxOccurrences - minOccurrences), 0.8)
            * (fontRange * (1 - cumulativeImportance));
        // Clamp to the maximum allowed size
        double result = this.maxTagSize < px + this.minTagSize ? this.maxTagSize : px + this.minTagSize;
        return Math.round(result);
    }
}