/* WEtymologyEn.java - corresponds to an etymology level of English Wiktionary word.
*
* Copyright (c) 2010 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
* Distributed under GNU General Public License.
*/
package wikokit.base.wikt.multi.en;
import wikokit.base.wikt.util.LangText;
import wikokit.base.wikipedia.language.LanguageType;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
/** Etymology part of English Wiktionary article.
*
* Etymology is a level 3 header in English Wiktionary:
* 1)<PRE>
* ===Noun===
* ===Etymology=== (level 3 in English Wiktionary)
* ===Noun===
* ===Verb===
*
* ==Finnish==
* ===Etymology===
* ===Noun===</PRE>
*
* 2)<PRE>
* Also level 3 in the case of multiple etymologies:
* ===Etymology 1=== (level 3)
* ====Pronunciation====
* ====Noun====
* ===Etymology 2=== (level 3)
* ====Pronunciation====
* ====Noun====
* ====Verb====</PRE>
*
* See http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained
*/
public class WEtymologyEn {

    /** Matches a level 3 etymology header located at the beginning of a line,
     * e.g. "===Etymology===" or "===Etymology 1===" (case-insensitive),
     * together with any whitespace (including line breaks) that follows it.
     *
     * Regular expression: (?mi)^===\s*Etymology\s*\d{0,4}\s*===\s*
     */
    private final static Pattern ptrn_3d_level_etymology = Pattern.compile(
            "(?mi)^===\\s*Etymology\\s*\\d{0,4}\\s*===\\s*");

    /** Shared empty result; also used as the type token for List.toArray(). */
    private final static LangText[] NULL_LANG_TEXT_ARRAY = new LangText[0];

    /** Splits the text into fragments related to different etymologies.
     *
     * 1) If the text is absent or empty, then an empty array is returned.
     * 2) If the text contains zero or one etymology header, then an array
     *    with the single element lt_source (the same object) is returned.
     * 3) If there are two or more etymology headers, then one new LangText
     *    is created per header. Each fragment holds the text between the end
     *    of its header and the start of the next header (or the end of the
     *    text), so the header line itself is never part of a fragment.
     *    The text preceding the first header is prepended to the first
     *    fragment.
     *
     * @param page_title word described in this article text (not used now)
     * @param lt_source  .text will be parsed and split; the language of
     *                   lt_source is passed to every created fragment
     * @return etymology fragments in the order of appearance, never null
     */
    public static LangText[] splitToEtymologySections (
                    String page_title,
                    LangText lt_source)
    {
        if(null == lt_source || null == lt_source.text || 0 == lt_source.text.length()) {
            return NULL_LANG_TEXT_ARRAY;
        }

        String text = lt_source.text.toString();

        // Positions of all etymology headers: {header start, header end}.
        List<int[]> headers = new ArrayList<int[]>();
        Matcher m = ptrn_3d_level_etymology.matcher(text);
        while(m.find()) {
            headers.add(new int[] { m.start(), m.end() });
        }

        // Zero or one etymology: nothing to split, return the source as is.
        if(headers.size() < 2) {
            return new LangText[] { lt_source };
        }

        // There is more than one etymology in this language in this word.
        LanguageType lang = lt_source.getLanguage();
        List<LangText> etymology_sections = new ArrayList<LangText>(headers.size());

        for(int i = 0; i < headers.size(); i++) {
            int body_start = headers.get(i)[1];     // just after the current header
            int body_end   = (i + 1 < headers.size())
                           ? headers.get(i + 1)[0]  // start of the next header
                           : text.length();         // last section runs to the end

            LangText lt = new LangText(lang);
            if(0 == i) {
                // Text located before the first header is kept
                // with the first etymology.
                lt.text.append(text.substring(0, headers.get(0)[0]));
            }
            lt.text.append(text.substring(body_start, body_end));
            etymology_sections.add(lt);
        }

        return etymology_sections.toArray(NULL_LANG_TEXT_ARRAY);
    }
}