package edu.fudan.nlp.cn.ner;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
* 新版时间表达式识别的主要工作类,改进了timebase的工作方式以提高识别准确率,
* 并支持获得推测后时间和推测前时间两种时间信息
*
* @author 邬桐,曹零
* @version 1.1 2010-4-28
* @since FudanNLP 1.0
*/
public class TimeNormalizer implements Serializable {
private static final long serialVersionUID = 463541045644656392L;
private String timeBase;
private String oldTimeBase;
private static Pattern patterns = null;
private String target;
private TimeUnit[] timeToken = new TimeUnit[0];
public TimeNormalizer() {
}
public TimeNormalizer(String path){
if(patterns == null){
try {
patterns = readModel(path);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.err.print("Read model error!");
}
}
}
/**
* TimeNormalizer的构造方法,根据提供的待分析字符串和timeBase进行时间表达式提取
* 在构造方法中已完成对待分析字符串的表达式提取工作
*
* @param target 待分析字符串
* @param timeBase 给定的timeBase
* @return 返回值
*/
public TimeUnit[] parse(String target,String timeBase){
this.target = target;
this.timeBase = timeBase;
this.oldTimeBase = timeBase;
//字符串预处理
preHandling();
timeToken = TimeEx(this.target,timeBase);
return timeToken;
}
/**
* 同上的TimeNormalizer的构造方法,timeBase取默认的系统当前时间
*
* @param target 待分析字符串
* @return 时间单元数组
*/
public TimeUnit[] parse(String target){
this.target = target;
this.timeBase = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(Calendar.getInstance().getTime());
this.oldTimeBase = timeBase;
preHandling();//字符串预处理
timeToken = TimeEx(this.target,timeBase);
return timeToken;
}
//
/**
* timeBase的get方法
*
* @return 返回值
*/
public String getTimeBase(){
return timeBase;
}
/**
* oldTimeBase的get方法
*
* @return 返回值
*/
public String getOldTimeBase(){
return oldTimeBase;
}
/**
* timeBase的set方法
*
* @param s timeBase
*/
public void setTimeBase(String s){
timeBase = s;
}
/**
* 重置timeBase为oldTimeBase
*
*/
public void resetTimeBase(){
timeBase = oldTimeBase;
}
/**
* 时间分析结果以TimeUnit组的形式出现,此方法为分析结果的get方法
*
* @return 返回值
*/
public TimeUnit[] getTimeUnit(){
return timeToken;
}
/**
* 待匹配字符串的清理空白符和语气助词以及大写数字转化的预处理
*/
private void preHandling(){
target = stringPreHandlingModule.delKeyword(target, "\\s+"); //清理空白符
target = stringPreHandlingModule.delKeyword(target, "[的]+"); //清理语气助词
target = stringPreHandlingModule.numberTranslator(target);//大写数字转化
}
/**
*有基准时间输入的时间表达式识别
*
*这是时间表达式识别的主方法,
*通过已经构建的正则表达式对字符串进行识别,并按照预先定义的基准时间进行规范化
*将所有别识别并进行规范化的时间表达式进行返回,
*时间表达式通过TimeUnit类进行定义
*
*
* @param String 输入文本字符串
* @param String 输入基准时间
* @return TimeUnit[] 时间表达式类型数组
*
*/
private TimeUnit[] TimeEx(String tar,String timebase)
{
Matcher match;
int startline=-1,endline=-1;
String [] temp = new String[99];
int rpointer=0;
TimeUnit[] Time_Result = null;
match=patterns.matcher(tar);
boolean startmark=true;
while(match.find())
{
startline=match.start();
if (endline==startline)
{
rpointer--;
temp[rpointer]=temp[rpointer]+match.group();
}
else
{
if(!startmark)
{
rpointer--;
//System.out.println(temp[rpointer]);
rpointer++;
}
startmark=false;
temp[rpointer]=match.group();
}
endline=match.end();
rpointer++;
}
if(rpointer>0)
{
rpointer--;
//System.out.println(temp[rpointer]);
rpointer++;
}
Time_Result=new TimeUnit[rpointer];
// System.out.println("Basic Data is " + timebase);
for(int j=0;j<rpointer;j++)
{
Time_Result[j]=new TimeUnit(temp[j],this);
//System.out.println(result[j]);
}
return Time_Result;
}
private Pattern readModel(InputStream is) throws Exception{
ObjectInputStream in = new ObjectInputStream(new BufferedInputStream
(new GZIPInputStream (is)));
return readModel(in);
}
private Pattern readModel(String file) throws Exception {
ObjectInputStream in = new ObjectInputStream(new BufferedInputStream
(new GZIPInputStream (new FileInputStream(file))));
return readModel(in);
}
private Pattern readModel(ObjectInputStream in) throws Exception {
Pattern p = (Pattern) in.readObject();
//System.out.print(p.pattern());
return p=Pattern.compile(p.pattern());
}
}