/*
* 文件名:RETemplateGroup.java
* 版权:Copyright 2008-2012 复旦大学 All Rights Reserved.
* 描述:程序总入口
* 修改人:xpqiu
* 修改时间:Nov 30, 2008
* 修改内容:新增
*
* 修改人:〈修改人〉
* 修改时间:YYYY-MM-DD
* 跟踪单号:〈跟踪单号〉
* 修改单号:〈修改单号〉
* 修改内容:〈修改内容〉
*/
package edu.fudan.nlp.pipe.templet;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.fudan.nlp.cn.ChineseTrans;
import edu.fudan.util.MyFiles;
/**
 * A group of regular-expression templates read from template files.
 *
 * <p>Templates are either loaded from every file returned by
 * {@code MyFiles.getAllFiles(fileName, ".txt")} (see {@link #loadAll()}),
 * from a single file (see {@link #read(String)} and {@link #load(String)}),
 * or added programmatically with {@link #add(RETemplate)}.
 *
 * <p>Only {@link #loadAll()} is synchronized; the other methods access the
 * template list without locking.
 *
 * @author xpqiu
 * @version 1.0
 */
public class RETemplateGroup implements Serializable {
    private static final long serialVersionUID = 3927868678845644573L;
    private static ChineseTrans tc = new ChineseTrans();

    /** Matches a parenthesised integer weight such as {@code (3)}; compiled once instead of per line. */
    private static final Pattern WEIGHT_PATTERN = Pattern.compile("\\((\\d+)\\)");

    /** Line that ends the preamble of a template file read by {@link #read(String)}. */
    private static final String SECTION_MARKER = "%%类别和样例";

    String label;
    /** All templates currently held by this group. */
    ArrayList<RETemplate> group;
    /** Number of template lines successfully added since the last {@link #loadAll()}. */
    int count=0;
    /** Last-modified timestamp of each loaded file, keyed by {@code File.toString()}. */
    private HashMap<String,Long> lastModTime;
    /** Path handed to {@code MyFiles.getAllFiles}; {@code null} when created by the no-arg constructor. */
    private String fileName;

    /**
     * Creates an empty group that is not bound to any template path.
     * Templates can be supplied with {@link #add(RETemplate)},
     * {@link #read(String)} or {@link #load(String)}.
     */
    public RETemplateGroup(){
        group = new ArrayList<RETemplate>();
        // Initialised here as well so that a later loadAll() cannot hit a null map.
        lastModTime = new HashMap<String, Long>();
    }

    /**
     * Creates a group bound to the given path and immediately loads all
     * templates found there.
     *
     * @param str path passed to {@code MyFiles.getAllFiles} when loading
     */
    public RETemplateGroup(String str) {
        fileName = str;
        lastModTime = new HashMap<String, Long>();
        group = new ArrayList<RETemplate>();
        loadAll();
        // Periodic change monitoring (a Timer invoking monitor()) is currently disabled.
    }

    /**
     * Checks whether any monitored template file has changed since it was
     * last loaded and, if so, reloads everything.
     */
    private void monitor() {
        // NOTE(review): this filters on "templete.txt" while loadAll() uses ".txt";
        // confirm whether the different suffix is intentional.
        List<File> files = MyFiles.getAllFiles(fileName,"templete.txt");
        if (files == null || lastModTime == null) {
            return;
        }
        try {
            for (File file : files) {
                Long lastTime = lastModTime.get(file.toString());
                if (lastTime == null || lastTime.longValue() != file.lastModified()) {
                    System.out.println("文件改变,重新读入模板");
                    loadAll();
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the templates that match the given sentence.
     *
     * @param str the sentence to match
     * @return every template whose {@code matches(str)} result is greater
     *         than zero, in group order; never {@code null}
     */
    public List<RETemplate> getTemplate(String str){
        List<RETemplate> matched = new ArrayList<RETemplate>();
        for (RETemplate template : group) {
            float weight = template.matches(str);
            if (weight > 0) {
                matched.add(template);
            }
        }
        return matched;
    }

    /**
     * Adds a template to this group.
     *
     * @param template the template to add
     */
    public void add(RETemplate template) {
        group.add(template);
    }

    /**
     * Discards the current templates and reloads every ".txt" file found
     * under the path this group was created with, recording each file's
     * last-modified time. Loading stops at the first file that fails.
     */
    public synchronized void loadAll(){
        group.clear();
        count=0;
        if (lastModTime == null) {
            // Can be null for instances restored from an older serialized form.
            lastModTime = new HashMap<String, Long>();
        }
        List<File> files = (fileName == null) ? null : MyFiles.getAllFiles(fileName, ".txt");
        if (files == null || files.isEmpty()) {
            System.err.println("模板为空");
        } else {
            try {
                for (File file : files) {
                    String path = file.toString();
                    read(path);
                    // Remember the modification time so monitor() can detect changes.
                    lastModTime.put(path, file.lastModified());
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("总模板数: " +count);
    }

    /**
     * Reads templates from one UTF-8 file and appends them to this group.
     *
     * <p>Format, as parsed here: every line up to and including the marker
     * line {@code %%类别和样例} is skipped. After it, lines starting with
     * {@code ##} or {@code %} are ignored. A parenthesised integer such as
     * {@code (3)} sets the weight of its line (default 1) and is removed,
     * as are all space characters. A line that is empty after this clean-up
     * starts a new template type; the first non-empty line of a type becomes
     * its comment and the following lines are its templates.
     *
     * @param fileName path of the file to read
     * @throws Exception if the file cannot be read or a line cannot be parsed
     */
    public void read(String fileName) throws Exception{
        Scanner scanner = new Scanner(new InputStreamReader (new FileInputStream(fileName),"utf-8"));
        try {
            // Skip the preamble up to the section marker.
            while (scanner.hasNext()) {
                if (scanner.nextLine().equals(SECTION_MARKER)) {
                    break;
                }
            }
            boolean isNewType = true;
            RETemplate current = null;
            while (scanner.hasNext()) {
                String s = scanner.nextLine();
                if (s.startsWith("##") || s.startsWith("%")) {
                    continue;
                }
                Matcher m = WEIGHT_PATTERN.matcher(s);
                int weight = 1;
                if (m.find()) {
                    weight = Integer.parseInt(m.group(1));
                }
                // Matcher.replaceAll resets the matcher, so every weight marker is stripped.
                s = m.replaceAll("").trim();
                s = s.replaceAll(" ", "");
                if (s.length() == 0) {
                    isNewType = true;
                    continue;
                }
                if (isNewType) {
                    if (current != null) {
                        group.add(current);
                    }
                    current = new RETemplate();
                    current.comment = s;
                    isNewType = false;
                    continue;
                }
                current.addTemplate(s, weight);
                count++;
            }
            // The final type has no following header to flush it, so add it here;
            // previously it was silently dropped.
            if (current != null) {
                group.add(current);
            }
        } finally {
            // Release the file handle even when parsing fails.
            scanner.close();
        }
    }

    /**
     * Loads one UTF-8 template file as a single template whose comment is the
     * file name.
     *
     * <p>The file starts with a header block terminated by an empty line; in
     * it, lines beginning with {@code @PREFIX} or {@code @SUFFIX} define text
     * (everything after the 8th character) that is put before/after every
     * template line. The lines after the header, up to the next empty line or
     * end of file, are the templates. A template line that cannot be added is
     * reported and skipped; any other failure is printed and aborts the load.
     *
     * @param fileName path of the file to load
     */
    public void load(String fileName){
        try {
            BufferedReader bin = new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), "utf-8"));
            try {
                RETemplate qt = new RETemplate();
                qt.comment = fileName;
                String prefix = "";
                String suffix = "";
                String line;
                // Header block: optional prefix / suffix directives.
                while ((line = bin.readLine()) != null) {
                    if (line.length() == 0) {
                        break;
                    }
                    if (line.startsWith("@PREFIX")) {
                        prefix = directiveValue(line);
                    } else if (line.startsWith("@SUFFIX")) {
                        suffix = directiveValue(line);
                    }
                }
                // Template block.
                while ((line = bin.readLine()) != null) {
                    if (line.length() == 0) {
                        break;
                    }
                    String template = prefix + line + suffix;
                    try {
                        qt.addTemplate(template, 1);
                        count++;
                    } catch (Exception e) {
                        // Best effort: report the offending line and keep going.
                        System.err.println("Skipping invalid template in " + fileName
                                + ": " + template + " (" + e + ")");
                    }
                }
                group.add(qt);
            } finally {
                bin.close();
            }
        } catch (Exception e1) {
            e1.printStackTrace();
        }
    }

    /**
     * Returns the value of an {@code @PREFIX}/{@code @SUFFIX} directive line:
     * the text after the 8th character (the 7-character keyword plus one
     * separator character), or the empty string when the line is too short.
     */
    private static String directiveValue(String line) {
        return line.length() >= 8 ? line.substring(8) : "";
    }

    /**
     * Small manual check: loads a template file and prints the templates
     * matching two sample sentences.
     */
    public static void main(String[] args){
        // The constructor already loads all templates, so no extra loadAll() call is needed.
        RETemplateGroup g = new RETemplateGroup("./train/intention.train.txt");
        String str = "我要去公司";
        List<RETemplate> l = g.getTemplate(str);
        System.out.println(l);
        str = "我要去微软公司";
        l = g.getTemplate(str);
        System.out.println(l);
    }

    /**
     * Normalises a sentence: collapses runs of whitespace into one space,
     * then applies {@code ChineseTrans.toSimp} and
     * {@code ChineseTrans.toHalfWidth}.
     *
     * @param str the sentence to normalise
     * @return the normalised sentence
     */
    private String normalise(String str) {
        str = str.replaceAll("\\s+", " ");
        str = tc.toSimp(str);
        str = ChineseTrans.toHalfWidth(str);
        return str;
    }

    /**
     * Not implemented yet; currently does nothing.
     *
     * @param templateFileName target file name (unused)
     */
    public void save(String templateFileName) {
        // TODO Auto-generated method stub
    }
}