package de.l3s.content.timex.extracting.utils;
/*
* TIMETool - Large-scale Temporal Search in MapReduce
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
/*
* THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
* PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
* NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* @author
*/
import java.text.DateFormat;
import java.text.ParseException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.Pair;
import org.joda.time.IllegalFieldValueException;
import org.joda.time.LocalDate;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
 * Heuristics for deriving a calendar date for a blog document from three
 * sources, tried in this order: an English date written in the first lines of
 * the content, a date embedded in the permalink URL, and the date encoded in
 * the document id.
 *
 * <p>Thread-safety: the shared {@link DateFormat} instances are not
 * synchronized (see the {@code java.text.DateFormat} documentation), so the
 * content-based methods must not be called concurrently from several threads
 * without external synchronization.
 */
public class DateUtil {

    /** Day of month substituted when a URL only carries a year and a month. */
    private static final int MID_MONTH_DAY = 15;

    // Pinned to Locale.US: the regular expressions below only capture English
    // names in US ordering ("Monday, January 5, 2009" / "Jan 5, 2009"), which the
    // default-locale formats cannot parse on a non-US JVM.
    private static final DateFormat full_df =
            DateFormat.getDateInstance(DateFormat.FULL, Locale.US);
    private static final DateFormat medium_df =
            DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.US);

    /** Compact {@code yyyyMMdd} format used for the date portion of a document id. */
    private final static DateTimeFormatter dateFormat = DateTimeFormat
            .forPattern("yyyyMMdd");

    // The blog_date2, blog_date3, blog_date4, html_pattern, html_tag and url_regex
    // members are not referenced inside this class; they are kept unchanged because
    // they are visible to other classes of the package.
    static final String blog_date4 = "(Mon|Tue|Wed|Thu|Fri|Sat|Sun), \\d{4}-\\d{2}-\\d{2}";
    /** Month name (full or abbreviated), day, optional comma, four-digit year. */
    static final String blog_date1 = "(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\s+[0-3]?[0-9],?\\s+[0-2][0-9][0-9][0-9]";
    static final String blog_date2 = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\d\\d)";
    static final String blog_date3 = "^((19|20)\\d\\d)-(0?[1-9]|1[012])-(0?[1-9]|[12][0-9]|3[01])$";
    /** Full weekday name followed by the same month/day/year shape as {@link #blog_date1}. */
    static final String blog_date0 = "(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),?\\s+(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\s+[0-3]?[0-9],?\\s+[0-2][0-9][0-9][0-9]";
    static final String html_pattern = "<html>.*?</html>";
    String url_regex = "\\(?\\b(http://|www[.])[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]";
    static Pattern html_tag = Pattern.compile(html_pattern);

    // Compiled once instead of on every call.
    private static final Pattern WEEKDAY_DATE = Pattern.compile(blog_date0);
    private static final Pattern MONTH_DATE = Pattern.compile(blog_date1);
    /** Separator between the tokens of a matched content date. */
    private static final Pattern DATE_TOKEN_SEPARATOR = Pattern.compile(",?\\s+");

    /** Three-letter month prefixes, index + 1 == month of year. */
    private static final String[] MONTH_PREFIXES = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    };

    // Building blocks of the URL patterns; each contributes exactly one capturing group.
    private static final String YEAR = "((?:19|20)\\d{2})";
    private static final String MONTH_FULL_NAME =
            "(January|February|March|April|May|June|July|August|September|October|November|December)";
    private static final String MONTH_SHORT_NAME =
            "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
    private static final String MONTH_TWO_DIGITS = "(0[1-9]|1[0-2])";
    private static final String MONTH_UNPADDED = "([1-9]|1[0-2])";
    private static final String DAY_TWO_DIGITS = "([0-9][0-9])";

    /**
     * URL date patterns in priority order: the first rule that matches anywhere
     * in the URL decides the result. The order is significant and mirrors the
     * historical cascade (month-name/year, numeric month/year, year/month/day,
     * year/month).
     */
    private static final UrlDateRule[] URL_DATE_RULES = {
        new UrlDateRule("/" + MONTH_FULL_NAME + "/" + YEAR + "/", 2, 1, 0, "strong"),
        new UrlDateRule("/" + MONTH_SHORT_NAME + "/" + YEAR + "/", 2, 1, 0, "mildly strong"),
        new UrlDateRule("/" + MONTH_TWO_DIGITS + "/" + YEAR + "/", 2, 1, 0, "mildly strong"),
        new UrlDateRule("/" + MONTH_UNPADDED + "/" + YEAR + "/", 2, 1, 0, "mildly strong"),
        new UrlDateRule("/" + YEAR + "/" + MONTH_TWO_DIGITS + "/" + DAY_TWO_DIGITS + "/", 1, 2, 3, "very strong"),
        new UrlDateRule("/" + YEAR + "/" + MONTH_UNPADDED + "/", 1, 2, 0, "mildly strong"),
        new UrlDateRule("/" + YEAR + "/" + MONTH_TWO_DIGITS + "/", 1, 2, 0, "mildly strong")
    };

    public DateUtil() {}

    /**
     * Looks for an English date in the given text and parses it.
     *
     * <p>A date that starts with a full weekday name is tried first; if none is
     * present, a plain "month day year" date is tried. Commas and runs of
     * whitespace inside the matched text are normalised before parsing, so
     * "Monday January 5 2009" and "Monday, January 5, 2009" are equivalent.
     *
     * @param dateString text to search; may be {@code null}
     * @return the first date found, or {@code null} when the text is
     *         {@code null}, contains no recognised date, or the recognised
     *         date cannot be parsed (a message is printed to standard output
     *         in that last case)
     */
    public LocalDate extractDateFromContent(String dateString) {
        if (dateString == null) {
            return null;
        }
        Matcher matcher = WEEKDAY_DATE.matcher(dateString);
        if (matcher.find()) {
            // A weekday match is final: the plain month pattern is not tried afterwards.
            return parseContentDate(full_df, canonicalize(matcher.group(), true));
        }
        matcher = MONTH_DATE.matcher(dateString);
        if (matcher.find()) {
            return parseContentDate(medium_df, canonicalize(matcher.group(), false));
        }
        return null;
    }

    /**
     * Determines the date of a document.
     *
     * <p>The first two content lines are searched for a written date. If none
     * is found, the URL date and the document-id date are combined: a URL that
     * only carries month precision is refined by the document-id date when
     * both fall in the same month of the same year; otherwise the URL date
     * wins, and the document-id date is used only when the URL has no date.
     *
     * @param content_lines lines of the document content; only the first two
     *        are inspected; may be {@code null} or empty
     * @param url permalink of the document; may be {@code null}
     * @param docId document id; characters 7 to 14 are parsed as
     *        {@code yyyyMMdd} (presumably ids of the form
     *        {@code BLOG06-20060115-...} - TODO confirm against the corpus)
     * @return the extracted date
     * @throws ParseException never thrown by the current implementation; kept
     *         in the signature for compatibility with existing callers
     * @throws StringIndexOutOfBoundsException if no content date is found and
     *         {@code docId} is shorter than 15 characters
     * @throws IllegalArgumentException if no content date is found and the
     *         date portion of {@code docId} is not a valid {@code yyyyMMdd}
     *         date
     */
    public LocalDate extractDate(String[] content_lines, String url, String docId) throws ParseException {
        LocalDate contentDate = null;
        if (content_lines != null) {
            for (int i = 0; i < content_lines.length && i < 2 && contentDate == null; i++) {
                contentDate = extractDateFromContent(content_lines[i]);
            }
        }
        if (contentDate != null) {
            return contentDate;
        }

        UrlDate urlDate = findUrlDate(url);
        // The doc id is parsed even when the URL yields a date, so a malformed
        // id is always reported to the caller.
        LocalDate docIdDate = LocalDate.parse(docId.substring(7, 15), dateFormat);
        if (urlDate == null) {
            return docIdDate;
        }
        boolean sameMonth = urlDate.date.getYear() == docIdDate.getYear()
                && urlDate.date.getMonthOfYear() == docIdDate.getMonthOfYear();
        // Use the explicit precision flag rather than "day == 15": a URL that
        // really names the 15th must not be mistaken for a month-only URL.
        if (!urlDate.exactDay && sameMonth) {
            return docIdDate;
        }
        return urlDate.date;
    }

    /**
     * Extracts a date from the path of a URL.
     *
     * <p>Underscores are treated like slashes. Recognised shapes are
     * {@code /MonthName/yyyy/}, {@code /M/yyyy/}, {@code /yyyy/MM/dd/} and
     * {@code /yyyy/M/}; when the URL carries no day, the 15th of the month is
     * used.
     *
     * @param url URL to inspect; may be {@code null}
     * @return the extracted date, or {@code null} when the URL is
     *         {@code null}, contains no recognised date, or the first
     *         recognised date is not a valid calendar date
     */
    public static LocalDate extractDateFromURL(String url) {
        UrlDate found = findUrlDate(url);
        return found == null ? null : found.date;
    }

    /**
     * Same extraction as {@link #extractDateFromURL(String)}, but returns the
     * date as an ISO {@code yyyy-MM-dd} string together with a label describing
     * how reliable the matched URL shape is: {@code "very strong"} for
     * year/month/day, {@code "strong"} for a full month name with a year, and
     * {@code "mildly strong"} for every other month-precision shape.
     *
     * @param url URL to inspect; may be {@code null}
     * @return a (date, label) pair, or {@code null} under the same conditions
     *         as {@link #extractDateFromURL(String)}
     */
    public static Pair<String, String> extractDateFromURL_(String url) {
        UrlDate found = findUrlDate(url);
        if (found == null) {
            return null;
        }
        return Pair.makePair(found.date.toString(), found.strength);
    }

    /**
     * Replaces the last match of a regular expression in a string.
     *
     * @param input text to process
     * @param regex regular expression to search for
     * @param replacement replacement text; may use group references such as
     *        {@code $1}, as in {@link Matcher#appendReplacement}
     * @return {@code input} with its last match replaced, or {@code input}
     *         itself when there is no match
     */
    public static String replaceLast(String input, String regex, String replacement) {
        Matcher matcher = Pattern.compile(regex).matcher(input);
        int lastMatchStart = -1;
        while (matcher.find()) {
            lastMatchStart = matcher.start();
        }
        if (lastMatchStart < 0) {
            return input;
        }
        // find(int) resets the matcher, so appendReplacement copies everything
        // from the start of the input up to the last match.
        matcher.find(lastMatchStart);
        StringBuffer sb = new StringBuffer(input.length());
        matcher.appendReplacement(sb, replacement);
        matcher.appendTail(sb);
        return sb.toString();
    }

    public static void main (String[] args) {
        System.out.println(DateUtil.extractDateFromURL("http://0009.org/blog/index.php/2006/10/"));
    }

    /** Rewrites a matched content date as "[Weekday, ]Month day, year". */
    private static String canonicalize(String matched, boolean hasWeekday) {
        // The content patterns guarantee 4 tokens with a weekday and 3 without.
        String[] tokens = DATE_TOKEN_SEPARATOR.split(matched);
        StringBuilder canonical = new StringBuilder(matched.length() + 2);
        int next = 0;
        if (hasWeekday) {
            canonical.append(tokens[next++]).append(", ");
        }
        canonical.append(tokens[next]).append(' ')
                .append(tokens[next + 1]).append(", ")
                .append(tokens[next + 2]);
        return canonical.toString();
    }

    /** Parses a canonical content date; reports failures on standard output and returns null. */
    private static LocalDate parseContentDate(DateFormat format, String text) {
        try {
            return new LocalDate(format.parseObject(text));
        } catch (ParseException pe) {
            System.out.println(text + "\n " + pe.getMessage());
            return null;
        }
    }

    /** Applies the URL rules in priority order and builds the date of the first rule that matches. */
    private static UrlDate findUrlDate(String url) {
        if (url == null) {
            return null;
        }
        String path = url.replace('_', '/');
        for (UrlDateRule rule : URL_DATE_RULES) {
            Matcher matcher = rule.pattern.matcher(path);
            if (!matcher.find()) {
                continue;
            }
            int year = Integer.parseInt(matcher.group(rule.yearGroup));
            int month = monthOfYear(matcher.group(rule.monthGroup));
            boolean exactDay = rule.dayGroup > 0;
            int day = exactDay ? Integer.parseInt(matcher.group(rule.dayGroup)) : MID_MONTH_DAY;
            try {
                return new UrlDate(new LocalDate(year, month, day), exactDay, rule.strength);
            } catch (IllegalFieldValueException e) {
                // The first matching rule is final: an impossible date such as
                // /2006/02/31/ yields no date instead of falling through to a
                // less precise rule.
                return null;
            }
        }
        return null;
    }

    /** Converts a numeric month or an English month name (full or abbreviated) to 1..12. */
    private static int monthOfYear(String token) {
        if (Character.isDigit(token.charAt(0))) {
            return Integer.parseInt(token);
        }
        String prefix = token.substring(0, 3);
        for (int i = 0; i < MONTH_PREFIXES.length; i++) {
            if (MONTH_PREFIXES[i].equals(prefix)) {
                return i + 1;
            }
        }
        throw new IllegalArgumentException("Unknown month name: " + token);
    }

    /** One URL date pattern with the positions of its capturing groups. */
    private static final class UrlDateRule {
        final Pattern pattern;
        final int yearGroup;
        final int monthGroup;
        /** Capturing group of the day, or 0 when the pattern carries no day. */
        final int dayGroup;
        final String strength;

        UrlDateRule(String regex, int yearGroup, int monthGroup, int dayGroup, String strength) {
            this.pattern = Pattern.compile(regex);
            this.yearGroup = yearGroup;
            this.monthGroup = monthGroup;
            this.dayGroup = dayGroup;
            this.strength = strength;
        }
    }

    /** Result of a URL match: the date, whether its day came from the URL, and the reliability label. */
    private static final class UrlDate {
        final LocalDate date;
        final boolean exactDay;
        final String strength;

        UrlDate(LocalDate date, boolean exactDay, String strength) {
            this.date = date;
            this.exactDay = exactDay;
            this.strength = strength;
        }
    }
}
/**
 * Plain holder for the parts of one blog document record, together with the
 * literal tag strings that delimit those parts in the raw text.
 *
 * <p>All fields are public and mutable; {@code content} is never set by a
 * constructor and stays {@code null} until assigned by the caller.
 */
class BlogDocument {

    // Tag literals: each opening tag is followed by its closing counterpart
    // (suffix "_") where one is defined.
    public static final String DOC = "<DOC>";
    public static final String DOC_ = "</DOC>";

    public static final String DOCNO = "<DOCNO>";
    public static final String DOCNO_ = "</DOCNO>";

    public static final String DATE_XML = "<DATE_XML>";
    public static final String DATE_XML_ = "</DATE_XML>";

    public static final String PERMALINK = "<PERMALINK>";
    public static final String PERMALINK_ = "</PERMALINK>";

    public static final String DOCHDR = "<DOCHDR>";
    public static final String DOCHDR_ = "</DOCHDR>";

    // Opening tags only; no closing constant is declared for these two.
    public static final String DOCTEXT = "<html>";
    public static final String TITLE = "<title>";

    /** Value of the DOCNO element (presumably the document id - verify against the reader). */
    public String docno;
    /** Value of the PERMALINK element. */
    public String permalink;
    /** Value of the DATE_XML element. */
    public String date_xml;
    /** Value of the DOCHDR element. */
    public String dochdr;
    /** Document body; not initialised by any constructor. */
    public String content;

    /** Creates an empty document; every field is {@code null}. */
    public BlogDocument() {
    }

    /**
     * Creates a document from its header parts, leaving {@code content} unset.
     *
     * @param docno value stored in {@link #docno}
     * @param permalink value stored in {@link #permalink}
     * @param date_xml value stored in {@link #date_xml}
     * @param dochdr value stored in {@link #dochdr}
     */
    public BlogDocument(String docno, String permalink, String date_xml, String dochdr) {
        this.dochdr = dochdr;
        this.date_xml = date_xml;
        this.permalink = permalink;
        this.docno = docno;
    }
}