/* * Ivory: A Hadoop toolkit for web-scale information retrieval * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package ivory.core.tokenize; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.galagosearch.core.parse.Document; import org.galagosearch.core.parse.TagTokenizer; import org.tartarus.snowball.ext.englishStemmer; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; public class GalagoTokenizer extends Tokenizer { private static final String[] TERRIER_STOP_WORDS = { "a", "abaft", "abafter", "abaftest", "about", "abouter", "aboutest", "above", "abover", "abovest", "accordingly", "aer", "aest", "afore", "after", "afterer", "afterest", "afterward", "afterwards", "again", "against", "aid", "ain", "albeit", "all", "aller", "allest", "alls", "allyou", "almost", "along", "alongside", "already", "also", "although", "always", "amid", "amidst", "among", "amongst", "an", "and", "andor", "anear", "anent", "another", "any", "anybody", "anyhow", "anyone", "anything", "anywhere", "apart", "aparter", "apartest", "appear", "appeared", "appearing", "appears", "appropriate", "appropriated", "appropriater", "appropriates", "appropriatest", "appropriating", "are", "ares", "around", "as", "ases", "aside", "asides", "aslant", "astraddle", "astraddler", "astraddlest", "astride", "astrider", "astridest", "at", "athwart", "atop", "atween", "aught", "aughts", "available", "availabler", "availablest", "awfully", "b", "be", "became", "because", "become", "becomes", "becoming", "becominger", "becomingest", "becomings", "been", "before", "beforehand", "beforehander", "beforehandest", "behind", "behinds", "below", "beneath", "beside", "besides", "better", "bettered", "bettering", "betters", "between", "betwixt", "beyond", "bist", "both", "but", "buts", "by", "by-and-by", "byandby", "c", "cannot", "canst", "cant", "canted", "cantest", "canting", "cants", "cer", "certain", "certainer", "certainest", "cest", "chez", "circa", "co", "come-on", "come-ons", "comeon", "comeons", "concerning", "concerninger", "concerningest", "consequently", "considering", "could", "couldst", "cum", "d", "dday", "ddays", "describe", "described", "describes", "describing", "despite", "despited", "despites", "despiting", "did", "different", "differenter", "differentest", "do", "doe", "does", "doing", "doings", "done", "doner", "dones", "donest", "dos", "dost", "doth", "downs", "downward", "downwarder", "downwardest", "downwards", "during", "e", "each", "eg", "eight", "either", "else", "elsewhere", "enough", "ere", "et", "etc", "even", "evened", "evenest", "evens", "evenser", "evensest", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "excepted", "excepting", "excepts", "exes", "f", "fact", "facts", "failing", "failings", "few", "fewer", "fewest", "figupon", "figuponed", "figuponing", "figupons", "five", "followthrough", "for", "forby", "forbye", "fore", "forer", "fores", "forever", "former", "formerer", "formerest", "formerly", "formers", "fornenst", "forwhy", "four", "fourscore", "frae", "from", "fs", "further", "furthered", "furtherer", "furtherest", "furthering", "furthermore", "furthers", "g", "get", "gets", "getting", "go", "gone", "good", "got", "gotta", "gotten", "h", "had", "hadst", "hae", "hardly", "has", "hast", "hath", "have", "haves", "having", "he", "hence", "her", "hereafter", "hereafters", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "hither", "hitherer", "hitherest", "hoo", "hoos", "how", "how-do-you-do", "howbeit", "howdoyoudo", "however", "huh", "humph", "i", "idem", "idemer", "idemest", "ie", "if", "ifs", "immediate", "immediately", "immediater", "immediatest", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "indicating", "info", "information", "insofar", "instead", "into", "inward", "inwarder", "inwardest", "inwards", "is", "it", "its", "itself", "j", "k", "l", "latter", "latterer", "latterest", "latterly", "latters", "layabout", "layabouts", "less", "lest", "lot", "lots", "lotted", "lotting", "m", "main", "make", "many", "mauger", "maugre", "mayest", "me", "meanwhile", "meanwhiles", "midst", "midsts", "might", "mights", "more", "moreover", "most", "mostly", "much", "mucher", "muchest", "must", "musth", "musths", "musts", "my", "myself", "n", "natheless", "nathless", "neath", "neaths", "necessarier", "necessariest", "necessary", "neither", "nethe", "nethermost", "never", "nevertheless", "nigh", "nigher", "nighest", "nine", "no", "no-one", "nobodies", "nobody", "noes", "none", "noone", "nor", "nos", "not", "nothing", "nothings", "notwithstanding", "nowhere", "nowheres", "o", "of", "off", "offest", "offs", "often", "oftener", "oftenest", "oh", "on", "one", "oneself", "onest", "ons", "onto", "or", "orer", "orest", "other", "others", "otherwise", "otherwiser", "otherwisest", "ought", "oughts", "our", "ours", "ourself", "ourselves", "out", "outed", "outest", "outs", "outside", "outwith", "over", "overall", "overaller", "overallest", "overalls", "overs", "own", "owned", "owning", "owns", "owt", "p", "particular", "particularer", "particularest", "particularly", "particulars", "per", "perhaps", "plaintiff", "please", "pleased", "pleases", "plenties", "plenty", "pro", "probably", "provide", "provided", "provides", "providing", "q", "qua", "que", "quite", "r", "rath", "rathe", "rather", "rathest", "re", "really", "regarding", "relate", "related", "relatively", "res", "respecting", "respectively", "s", "said", "saider", "saidest", "same", "samer", "sames", "samest", "sans", "sanserif", "sanserifs", "sanses", "saved", "sayid", "sayyid", "seem", "seemed", "seeminger", "seemingest", "seemings", "seems", "send", "sent", "senza", "serious", "seriouser", "seriousest", "seven", "several", "severaler", "severalest", "shall", "shalled", "shalling", "shalls", "she", "should", "shoulded", "shoulding", "shoulds", "since", "sine", "sines", "sith", "six", "so", "sobeit", "soer", "soest", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimer", "sometimes", "sometimest", "somewhat", "somewhere", "stop", "stopped", "such", "summat", "sup", "supped", "supping", "sups", "syn", "syne", "t", "ten", "than", "that", "the", "thee", "their", "theirs", "them", "themselves", "then", "thence", "thener", "thenest", "there", "thereafter", "thereby", "therefore", "therein", "therer", "therest", "thereupon", "these", "they", "thine", "thing", "things", "this", "thises", "thorough", "thorougher", "thoroughest", "thoroughly", "those", "thou", "though", "thous", "thouses", "three", "thro", "through", "througher", "throughest", "throughout", "thru", "thruer", "thruest", "thus", "thy", "thyself", "till", "tilled", "tilling", "tills", "to", "together", "too", "toward", "towarder", "towardest", "towards", "two", "u", "umpteen", "under", "underneath", "unless", "unlike", "unliker", "unlikest", "until", "unto", "up", "upon", "uponed", "uponing", "upons", "upped", "upping", "ups", "us", "use", "used", "usedest", "username", "usually", "v", "various", "variouser", "variousest", "verier", "veriest", "versus", "very", "via", "vis-a-vis", "vis-a-viser", "vis-a-visest", "viz", "vs", "w", "was", "wast", "we", "were", "wert", "what", "whatever", "whateverer", "whateverest", "whatsoever", "whatsoeverer", "whatsoeverest", "wheen", "when", "whenas", "whence", "whencesoever", "whenever", "whensoever", "where", "whereafter", "whereas", "whereby", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereupon", "wherever", "wherewith", "wherewithal", "whether", "which", "whichever", "whichsoever", "while", "whiles", "whilst", "whither", "whithersoever", "whoever", "whomever", "whose", "whoso", "whosoever", "why", "with", "withal", "within", "without", "would", "woulded", "woulding", "woulds", "x", "y", "ye", "yet", "yon", "yond", "yonder", "you", "your", "yours", "yourself", "yourselves", "z", "zillion", }; private final englishStemmer stemmer = new englishStemmer(); private final Map<String, String> cache = Maps.newHashMap(); private final Set<String> stopwords = Sets.newHashSet(TERRIER_STOP_WORDS); @Override public boolean isStopWord(String word) { return stopwords.contains(word); } @Override public boolean isStemming() { return true; } @Override public boolean isStopwordRemoval() { return true; } public String[] processContent(String text) { TagTokenizer tokenizer = new TagTokenizer(); Document doc = null; try { doc = tokenizer.tokenize(text); } catch (IOException e) { e.printStackTrace(); return null; } List<String> toks = doc.terms; List<String> words = Lists.newArrayList(); for (String tok : toks) { if (!stopwords.contains(tok)) { words.add(tok); } } for (int i = 0; i < words.size(); i++) { String word = words.get(i); if (word != null) { if (cache.containsKey(word)) { words.set(i, cache.get(word)); } else { stemmer.setCurrent(word); if (stemmer.stem()) { String stem = stemmer.getCurrent(); words.set(i, stem); cache.put(word, stem); } else { cache.put(word, word); } } if (cache.size() > 50000) { cache.clear(); } } } String[] arr = new String[words.size()]; return (String[]) words.toArray(arr); } @Override public void configure(Configuration conf) { } @Override public void configure(Configuration mJobConf, FileSystem fs) { } public static void main(String[] args) { String text = " this is a the <test> for the teokenizer 101 546 345-543543545436-4656765865865 rgger <xml> ergtre 456435klj345lj34590"; Tokenizer tokenizer; String[] tokens; System.out.println("tokenization according to Galago: "); tokenizer = new GalagoTokenizer(); tokens = tokenizer.processContent(text); for (String t : tokens) { System.out.println(t); } } }