/* YearsRange.java - corresponds to the years period in phrase/sentence
 * that illustrates a meaning of a word in Russian Wiktionary.
 *
 * Copyright (c) 2012 Andrew Krizhanovsky <andrew.krizhanovsky at gmail.com>
 * Distributed under EPL/LGPL/GPL/AL/BSD multi-license.
 */

package wikokit.base.wikt.multi.ru.quote;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Years period in the quotation phrase / sentence.
 *
 * Both fields hold -1 until a year has been parsed successfully.
 * An instance may be reused: every call to parseYearsRange() starts
 * from the "unknown" state.
 */
public class YearsRange {

    /** Value of a year field when the year is absent or was not parsed. */
    private static final int UNKNOWN_YEAR = -1;

    // Non-ASCII literals are written as Unicode escapes, so that the file
    // compiles identically whatever source encoding the compiler assumes.

    /** Em dash (U+2014), the first choice of range separator. */
    private static final String EM_DASH = "\u2014";

    /** En dash (U+2013), the last choice of range separator. */
    private static final String EN_DASH = "\u2013";

    /** Decade suffix: hyphen followed by Cyrillic small letter IE (U+0435), as in "1830-e". */
    private static final String DECADE_SUFFIX = "-\u0435";

    /** Dash template of the wiki markup, e.g. 1998{{-}}2001. */
    private static final String DASH_TEMPLATE = "{{-}}";

    /** Finds the first run of four digits, e.g. in "12, 2000" or "22 month 2003". */
    private static final Pattern FOUR_DIGITS = Pattern.compile("\\d{4}");

    /** Start date of a writing book with the quote,
     * if there is no information about date then -1. */
    public int year_from;

    /** Finish date of a writing book with the quote,
     * if there is no information about date then -1. */
    public int year_to;

    public YearsRange() {
        year_from = UNKNOWN_YEAR;
        year_to = UNKNOWN_YEAR;
    }

    /** Converts string to integer, if there is any problem then return -1.
     *
     * @param page_title title of the entry, used only in the error message
     * @param text       digits to be converted, may be null or empty
     * @return the parsed number, or -1 if text is null, empty or not a number
     */
    private static int stringToInt(String page_title, String text) {
        if (null == text || text.length() == 0) {
            return UNKNOWN_YEAR;
        }
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException nfe) {
            System.out.println("Error in WQuoteRu:YearsRange:stringToInt: entry '"+ page_title + "' unknown year format, " + nfe.getMessage());
            return UNKNOWN_YEAR;
        }
    }

    /** Parses source text (e.g. "1882" or "08-07-2011" or "06.05.2006")
     * and extracts the first four adjacent digits.
     *
     * @return -1 If text was not parsed successfully
     */
    private static int extractFourDigits(String page_title, String text) {
        Matcher m = FOUR_DIGITS.matcher(text);
        if (m.find()) {
            return stringToInt(page_title, m.group());
        }
        return UNKNOWN_YEAR;
    }

    /** Checks whether the text looks like a full date, e.g. "08-07-2011"
     * or "06.05.2006": the first two occurrences of the symbol exist and
     * are separated by at least one other character, i.e. ".." or "--"
     * generates "false".
     *
     * @return true if the first two occurrences of symbol are not adjacent
     */
    private static boolean containsTwoNonAdjacentSymbols(String text, char symbol) {
        int first = text.indexOf(symbol);
        // "XXX-" and "XXX-Y" are too short to hold a second, non-adjacent symbol
        if (-1 == first || text.length() <= first + 2) {
            return false;
        }
        int second = text.indexOf(symbol, first + 1);
        return -1 != second && second - first > 1;
    }

    /** Finds the separator of a range of years. The em dash is looked up
     * first, then the hyphen, then the en dash.
     *
     * @return position of the separator, or -1 if the text is not a range
     */
    private static int indexOfRangeSeparator(String text) {
        int pos = text.indexOf(EM_DASH);
        if (-1 == pos) {
            pos = text.indexOf("-");
        }
        if (-1 == pos) {
            pos = text.indexOf(EN_DASH);
        }
        return pos;
    }

    /** Parses source text (e.g. "1882-1883") and stores the result to
     * year_from and year_to.
     * If there is only one year, e.g. "1972", then year_from=year_to.
     * If the text is a full date, e.g. "08-07-2011" or "2002.08.26",
     * then both fields get the four-digit year.
     * If text is null or was not parsed successfully, then the
     * corresponding fields are -1; results of a previous call are never
     * kept.
     * Leading and trailing whitespace of the text and of each part of
     * a range is ignored, e.g. "1880 - 1881".
     *
     * @param page_title title of the entry, used only in error messages
     * @param text       source text with a year, a date or a range of years
     */
    public void parseYearsRange(String page_title, String text) {
        // always start from the "unknown" state, the object may be reused
        year_from = UNKNOWN_YEAR;
        year_to = UNKNOWN_YEAR;
        if (null == text) {
            return;
        }

        text = text.trim();
        text = text.replace(DASH_TEMPLATE, "-");    // 1998{{-}}2001 -> 1998-2001
        text = text.replace(DECADE_SUFFIX, "");     // decade, tens of years: 1830-e -> 1830

        // question in years, e.g. "1875?"
        if (text.endsWith("?")) {
            text = text.substring(0, text.length() - 1).trim();
        }

        // full date: "08-07-2011" or "06.05.2006"
        if (containsTwoNonAdjacentSymbols(text, '-')
                || containsTwoNonAdjacentSymbols(text, '.')) {
            year_from = extractFourDigits(page_title, text);
            year_to = year_from;
            return;
        }

        int pos = indexOfRangeSeparator(text);
        if (-1 == pos) {
            // it's not a range; a long text may hide the year, e.g. "12, 2000"
            if (text.length() > 4) {
                Matcher m = FOUR_DIGITS.matcher(text);
                if (m.find()) {
                    text = m.group();
                }
            }
            year_from = stringToInt(page_title, text);
            year_to = year_from;
            return;
        }

        // it's a range, every separator is one char long
        String str_from = text.substring(0, pos).trim();
        String str_to = text.substring(pos + 1).trim();

        if (str_from.length() > 0) {
            year_from = stringToInt(page_title, str_from);
            year_to = year_from;    // open-ended range "1880-" means one year
        }
        if (str_to.length() > 0) {
            year_to = stringToInt(page_title, str_to);
        }
    }
}