/*
* WordNetParser.java
*
* Copyright (c) 2005 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package rfc2229;
import java.util.List;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import wikipedia.util.StringUtil;
import wikipedia.util.StringUtilRegular;
/**
 * Parses the text of WordNet articles and extracts the words referenced by
 * bracketed link sections such as {@code [syn: {sugar}, {sweetening}]}.
 *
 * <p>All functionality is exposed through static methods; the class keeps no state.
 */
public class WordNetParser {

    /** Creates a parser instance. Not required for use: every method is static. */
    public WordNetParser() {
    }

    /**
     * Searches the text for sections like
     * "[syn: {sugar}, {one more sugar\r\n}]" and extracts the listed words,
     * e.g. "sugar", "one more sugar" if the sought type is "syn".
     *
     * <p>Every section of the requested type found in the text is processed.
     * Each section body is split on ", " and the resulting items are passed to
     * {@code StringUtilRegular.stripNonWordLetters} (presumably removing the
     * braces and line breaks in place -- its return value is not used here).
     * Duplicates are dropped; the order of first occurrence is preserved.
     *
     * @param link_type type of the link section, taken literally (it is not
     *                  interpreted as a regular expression). The following
     *                  types are known: syn, ant, also.
     * @param text      article text to search in
     * @return list of distinct linked words in order of first occurrence;
     *         empty (never {@code null}) if nothing is found or if either
     *         argument is {@code null}
     */
    public static List<String> getLinks(String link_type, String text) {
        List<String> result = new ArrayList<String>();

        // Nothing to search for or nothing to search in: no links.
        if (link_type == null || text == null) {
            return result;
        }

        // Quote the type so that regex metacharacters in it are matched
        // literally instead of altering (or breaking) the pattern.
        String str_pattern = "\\[" + Pattern.quote(link_type) + "\\:\\s([^\\]]+)\\]";

        Matcher m = Pattern.compile(str_pattern).matcher(text);
        while (m.find()) {
            // group(1) is everything between "[type: " and the closing "]".
            String[] words = StringUtil.split(", ", m.group(1));
            StringUtilRegular.stripNonWordLetters(words);

            for (String w : words) {
                if (!result.contains(w)) {
                    result.add(w);
                }
            }
        }
        return result;
    }

    /**
     * Collects the words linked from the "syn" and "also" sections of the text.
     *
     * <p>The two lists are combined by {@code StringUtil.addOR}
     * (presumably a union of both lists -- confirm against StringUtil).
     *
     * @param text article text to search in
     * @return the result of {@code StringUtil.addOR} applied to the "syn" links
     *         and the "also" links
     */
    public static List<String> getSynonyms(String text) {
        List<String> synLinks  = getLinks("syn", text);
        List<String> alsoLinks = getLinks("also", text);
        return StringUtil.addOR(synLinks, alsoLinks);
    }
}