/* WikiWord.java - base class for a word in Wiktionary, e.g. a word from a list
* of Synonyms or Translations.
*
* Copyright (c) 2008 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU General Public License.
*/
package wikokit.base.wikt.util;
import wikokit.base.wikt.multi.en.name.LabelEn;
import wikokit.base.wikipedia.util.StringUtil;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
/** Word in a Wiktionary with wikilink and additional tag or comment,
 * e.g. a word from a list of Synonyms or Translations, or definition (meaning).
 *
 * An instance pairs the text shown to the reader (visible word) with the
 * target of the wiki link (link word) and an optional array of labels.
 * The static helpers work on wiki text containing [[double bracket]] links.
 */
public class WikiWord {

    /** Definition word, or synonym, or translation, etc. which is visible to user,
     * e.g. "Scratching" for the wiki text "[[scratch|Scratching]]"
     */
    private final String word_visible;

    /** Wikilink, i.e. definition word, or synonym, or translation etc.,
     * which is the link for the current word,
     * e.g. "scratch" for the wiki text "[[scratch|Scratching]]"
     */
    private final String word_link;

    /** Comment for the synonym or translation, e.g. synonyms for "entry":
     * * (''doorway that provides a means of entering a building''): [[entrance]], [[way in]] {{UK}}
     * tags[1]=UK
     *
     * e.g. enwikt "slang: money" in synonyms of bread:
     * # (slang: money) dough, folding stuff...),
     *
     * e.g. ruwikt "разг., поэт." in synonyms of Saint-Petersburg:
     * # [[Питер]] (разг.), [[град Петров]] (поэт.)
     *
     * This field will not be used for words of definition (meaning).
     */
    private final LabelEn[] labels;

    /** Shared zero-length result; safe to share because an empty array cannot be modified. */
    private final static WikiWord[] NULL_WIKIWORD_ARRAY = new WikiWord[0];

    /** Matches one wiki link [[...]]; group 1 is the (reluctantly matched) text between the brackets. */
    private final static Pattern PTRN_DOUBLE_BRACKETS = Pattern.compile(
            "\\[\\[(.+?)\\]\\]");

    /** Initialize and fill WikiWord structure.
     * @param _word_link    internal wiki link, e.g. "run" in [[run]]ning
     * @param _word_visible visible wiki word, e.g. "running" in [[run]]ning
     * @param _labels       labels attached to the word, may be null
     *                      (the array is stored as is, without copying)
     */
    public WikiWord(String _word_link, String _word_visible, LabelEn[] _labels) {
        word_visible = _word_visible;
        word_link    = _word_link;
        labels       = _labels;
    }

    /** Gets visible word. */
    public String getWordVisible() {
        return word_visible;
    }

    /** Gets wiki link word (lemma). */
    public String getWordLink() {
        return word_link;
    }

    /** Removes and expands wiki links in wiki texts.<PRE>
     * Expands wiki links by removing brackets. There are two cases:
     * (1) remove brackets, e.g. [[run]] -> run and
     * (2) [[run|running]] -> running, or [[Russian language|Russian]] -> Russian,
     * i.e. the visible (to reader) words will remain.
     *
     * The text inside each pair of brackets is passed through
     * StringUtil.escapeCharDollarAndBackslash before it is used as
     * replacement text, because Matcher.appendReplacement gives a special
     * meaning to the characters '$' and '\'.
     * This is simplified versions of parseDoubleBrackets.</PRE>
     *
     * @param page_title word which are described in this article 'text'
     *                   (not used by this implementation)
     * @param text source wikified definition text
     * @return new buffer with the expanded text, never null; a new empty
     *         buffer if text is null or empty. A fresh buffer is returned on
     *         every call, so the caller may modify it freely.
     *
     * See also WikiParser.parseDoubleBrackets.
     */
    public static StringBuffer parseDoubleBrackets (
                    String page_title,
                    StringBuffer text)
    {
        if(null == text || 0 == text.length()) {
            // StringBuffer is mutable: a shared static "empty" instance could
            // be polluted by one caller and then be handed out non-empty
            return new StringBuffer();
        }

        Matcher m = PTRN_DOUBLE_BRACKETS.matcher(text.toString());   // [[(.+?)]]
        StringBuffer sb = new StringBuffer();

        while(m.find()) {
            // g: text within [[brackets]], escaped for appendReplacement
            String g = StringUtil.escapeCharDollarAndBackslash(m.group(1)).toString();

            if(-1 != g.indexOf('|')) {
                // [[run|running]] -> running
                m.appendReplacement(sb, StringUtil.getTextAfterFirstVerticalPipe(g));
            } else {
                // [[run]] -> run
                m.appendReplacement(sb, g);
            }
        }
        m.appendTail(sb);
        return sb;
    }

    /** Extract wiki links (word_link and word_visible) from wiki texts.
     * There are the visible to reader words (word_visible) and internal links (word_link).
     *
     * <PRE>
     * There are cases:
     * (1) [[run]]         => two words [run, run], remove brackets
     * (2) [[run]]ing      => two words [run, runing] extract, remove brackets
     * (3) [[run|running]] => two words [run, running],
     *     or [[Russian language|Russian]] => [Russian language, Russian].
     * </PRE>
     *
     * In case (3) the text which follows the closing brackets is not
     * appended to the visible word. The labels of every created WikiWord
     * are null.
     *
     * @param page_title word which are described in this article 'text'
     *                   (not used by this implementation)
     * @param text source wikified definition text
     * @return one WikiWord per [[...]] link, in order of appearance;
     *         an empty array if text is null, empty or contains no links
     */
    public static WikiWord[] getWikiWords (
                    String page_title,
                    StringBuffer text)
    {
        if(null == text || 0 == text.length()) {
            return NULL_WIKIWORD_ARRAY;
        }

        // convert once: the same string serves the matcher and the suffix lookup
        String source = text.toString();
        List<WikiWord> ww_list = new ArrayList<WikiWord>();
        Matcher m = PTRN_DOUBLE_BRACKETS.matcher(source);            // [[(.+?)]]

        while(m.find()) {
            // g: text within [[brackets]]
            // NOTE(review): the escaped form is stored in the WikiWord although
            // no appendReplacement is done here - presumably '$' and '\' end up
            // escaped in the stored words; confirm against
            // StringUtil.escapeCharDollarAndBackslash before changing.
            String g = StringUtil.escapeCharDollarAndBackslash(m.group(1)).toString();

            if(-1 != g.indexOf('|')) {
                // [[run|running]] => link "run", visible "running"
                String before = StringUtil.getTextBeforeFirstVerticalPipe(g);
                String after  = StringUtil.getTextAfterFirstVerticalPipe(g);
                ww_list.add(new WikiWord(before, after, null));
            } else {
                // [[run]]ing: the visible word is the link plus the text that
                // follows the closing brackets, as far as
                // getTextTillSpaceOrPuctuationMark returns it
                String suffix = StringUtil.getTextTillSpaceOrPuctuationMark(m.end(), source);
                ww_list.add(new WikiWord(g, g.concat(suffix), null));
            }
        }
        return ww_list.toArray(NULL_WIKIWORD_ARRAY);
    }
}