/*
* MobyParser.java
*
* Copyright (c) 2005 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package rfc2229;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//import java.util.regex.PatternSyntaxException;
import wikipedia.util.StringUtil;
import wikipedia.util.StringUtilRegular;
/**
 * Parses the text of a Moby Thesaurus word list.
 */
public class MobyParser {

    /**
     * Matches everything from the start of the text up to and including
     * the first colon (the header that precedes the word list).
     */
    private static final Pattern HEADER = Pattern.compile("(?s)\\A[^\\:]*:");

    /**
     * Matches the LAST dot and everything that follows it.
     * The character class [^.] cannot step over another dot, so a match
     * that starts at an earlier dot can never reach the end of the input;
     * only the final dot succeeds. (The former pattern "\\..*?\\Z" anchored
     * on the FIRST dot, because a reluctant quantifier only shortens a
     * match, it does not move its start.)
     */
    private static final Pattern TRAILER = Pattern.compile("(?s)\\.[^.]*?\\Z");

    public MobyParser() {
    }

    /** Extracts the unique words from a Moby word-list text.
     * Steps:
     * 1. Drops the header: everything up to and including the first ":"
     *    (the text is left untouched if it has no colon).
     * 2. Drops the trailer: the last "." and everything after it
     *    (the text is left untouched if it has no dot).
     * 3. Splits the remainder by comma ",".
     * 4. Strips non-word letters from every item,
     *    e.g. "\r\n backset " -> "backset".
     * 5. Removes duplicates, keeping the first occurrence of each word.
     *
     * Example of source string:
     * 24 (3 in test) Moby Thesaurus words for "mulch":
     * \r\n backset, fallow,\r\n fertilize, culture,\r\n thin, work\r\n\r\n\r\n\r\n.\r\n
     *
     * @param text raw text of the word list; must not be null
     * @return the distinct words in order of first appearance
     */
    public static String[] getWords(String text) {
        String body = HEADER.matcher(text).replaceFirst("");
        body = TRAILER.matcher(body).replaceFirst("");

        String[] words = StringUtil.split(",", body);
        // The return value is not used here, so the helper is expected to
        // clean the array elements in place.
        StringUtilRegular.stripNonWordLetters(words);

        // LinkedHashSet drops repeated words while preserving the order in
        // which they first appear, without a linear scan per word.
        Set<String> unique = new LinkedHashSet<String>();
        for (String word : words) {
            unique.add(word);
        }
        return unique.toArray(new String[0]);
    }
}