package edu.fudan.nlp.corpus;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Set;
import edu.fudan.nlp.cn.Chars;
import edu.fudan.nlp.cn.Chars.StringType;
import edu.fudan.util.UnicodeReader;
/**
 * Helpers that turn word-segmented (and optionally tagged) text into
 * character-per-line sequence-labelling data.
 *
 * Every generated sequence has the shape {@code <char>\t<label>\n} per
 * character, followed by one empty line that terminates the sentence.
 */
public class Tags {

    /**
     * Converts a file of segmented sentences into a sequence-labelling file.
     *
     * Each input line is trimmed, split into words with {@code delimer} and
     * written out as one labelled sequence (see {@link #genSegSequence}).
     * Both files are handled with "utf8" as the charset name.
     *
     * @param infile  path of the segmented input file
     * @param outfile path of the file to write; an existing file is overwritten
     * @param delimer word delimiter; passed to {@link String#split(String)}, so it
     *                is interpreted as a regular expression
     * @param tagnum  labelling scheme, either 4 or 6
     * @throws IOException if reading or writing fails
     * @throws IllegalArgumentException if {@code tagnum} is neither 4 nor 6
     */
    public static void processFile(String infile, String outfile,String delimer,int tagnum) throws IOException {
        // Reject an unsupported scheme before touching the output file: otherwise
        // genSegSequence returns null and Writer.write(null) fails with a bare
        // NullPointerException after the output has already been truncated.
        if (tagnum != 4 && tagnum != 6) {
            throw new IllegalArgumentException("Unsupported tagnum: " + tagnum + " (expected 4 or 6)");
        }
        BufferedReader in = new BufferedReader(new UnicodeReader(new FileInputStream(infile), "utf8"));
        try {
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
                    outfile), "utf8"));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    // No extra newline is written here: every generated sequence
                    // already ends with the blank line separating sentences.
                    out.write(genSegSequence(line.trim(), delimer, tagnum));
                }
            } finally {
                out.close();
            }
        } finally {
            // Closed even when reading/writing failed, so no handle is leaked.
            in.close();
        }
    }

    /**
     * Labels every character of every word with the 4-tag scheme:
     * {@code S} for a one-character word, otherwise {@code B} for the first
     * character, {@code E} for the last and {@code M} for those in between.
     *
     * A warning is printed to standard error for each character that
     * {@code Chars.getType} classifies as {@code CharType.B} (presumably a
     * blank, judging by the message text); the character is still emitted.
     *
     * @param wordArray the words of one sentence; empty words contribute nothing
     * @return one {@code char\ttag\n} line per character plus a trailing empty line
     */
    public static String genSequence4Tags(String[] wordArray){
        StringBuilder sb = new StringBuilder();
        for (String word : wordArray) {
            int len = word.length();
            for (int j = 0; j < len; j++) {
                char c = word.charAt(j);
                if (Chars.getType(c) == Chars.CharType.B) {
                    System.err.println("包含空格");
                }
                sb.append(c).append('\t').append(positionTag(j, len)).append('\n');
            }
        }
        sb.append('\n');
        return sb.toString();
    }

    /**
     * Labels every character of every word with the 6-tag scheme:
     * {@code S} for a one-character word, otherwise {@code B} for the first
     * character, {@code 2} and {@code 3} for the second and third characters
     * when they are not the last one, {@code M} for any further inner
     * characters and {@code E} for the last character.
     *
     * @param wordArray the words of one sentence; empty words (for example from
     *                  consecutive delimiters) contribute nothing
     * @return one {@code char\ttag\n} line per character plus a trailing empty line
     */
    public static String genSequence6Tags(String[] wordArray){
        StringBuilder sb = new StringBuilder();
        for (String word : wordArray) {
            int len = word.length();
            String tag = sixTagLabels(len);
            assert tag.length() == len;
            for (int j = 0; j < len; j++) {
                sb.append(word.charAt(j)).append('\t').append(tag.charAt(j)).append('\n');
            }
        }
        sb.append('\n');
        return sb.toString();
    }

    /**
     * Converts a segmented sentence into a labelled character sequence.
     *
     * @param sent    the segmented sentence
     * @param delimer word delimiter; passed to {@link String#split(String)}, so it
     *                is interpreted as a regular expression
     * @param tagnum  4 for {@link #genSequence4Tags}, 6 for {@link #genSequence6Tags}
     * @return the labelled sequence, or {@code null} when {@code tagnum} is
     *         neither 4 nor 6
     */
    public static String genSegSequence(String sent,String delimer,int tagnum){
        String[] wordArray = sent.split(delimer);
        if (tagnum == 4) {
            return genSequence4Tags(wordArray);
        }
        if (tagnum == 6) {
            return genSequence6Tags(wordArray);
        }
        return null;
    }

    /**
     * Generates a cross-label sequence from a segmented and tagged sentence.
     *
     * Each token is expected to look like {@code word<delimTag>tag}; the last
     * occurrence of {@code delimTag} separates the word from its tag. Every
     * character of the word is labelled {@code <position>-<tag>}, where the
     * position is S, B, M or E as in {@link #genSequence4Tags}. When
     * {@code filter} is given and does not contain the tag, the characters
     * are labelled {@code O} instead.
     *
     * Malformed tokens are reported on standard error and handled as follows:
     * a token without {@code delimTag} is kept as a word whose characters are
     * all labelled {@code O}; a token whose tag is empty is labelled with the
     * empty tag (for example {@code B-}). Empty tokens are skipped.
     *
     * @param sent     the sentence; leading and trailing whitespace is ignored
     * @param delim    token delimiter; passed to {@link String#split(String)}, so
     *                 it is interpreted as a regular expression
     * @param delimTag literal separator between a word and its tag; may be
     *                 longer than one character
     * @param filter   tags to keep, or {@code null} to keep every tag
     * @return the labelled sequence followed by an empty line, or the empty
     *         string when {@code sent} is blank
     */
    public static String genCrossLabel(String sent,String delim,String delimTag,Set<String> filter){
        sent = sent.trim();
        if (sent.length() == 0) {
            return sent;
        }
        StringBuilder sb = new StringBuilder();
        for (String token : sent.split(delim)) {
            if (token.length() == 0) {
                // Produced by consecutive delimiters; there is nothing to label.
                continue;
            }
            String word;
            String tag;
            int idx = token.lastIndexOf(delimTag);
            if (idx == -1) {
                // No tag separator: report it and keep the characters as untagged.
                System.err.println(token);
                word = token;
                tag = null;
            } else {
                // Skip the whole separator, not just one character.
                int tagStart = idx + delimTag.length();
                if (tagStart == token.length()) {
                    System.err.println(token); // separator present but tag is empty
                }
                word = token.substring(0, idx);
                tag = token.substring(tagStart);
            }
            // Kept when a tag exists and it is either unfiltered or a retained tag.
            boolean keep = tag != null && (filter == null || filter.contains(tag));
            int len = word.length();
            for (int j = 0; j < len; j++) {
                sb.append(word.charAt(j)).append('\t');
                if (keep) {
                    sb.append(positionTag(j, len)).append('-').append(tag);
                } else {
                    sb.append("O");
                }
                sb.append('\n');
            }
        }
        sb.append('\n');
        return sb.toString();
    }

    /** Returns the S/B/M/E position tag of the character at index j in a word of length len. */
    private static char positionTag(int j, int len) {
        if (j == 0) {
            return len == 1 ? 'S' : 'B';
        }
        return j == len - 1 ? 'E' : 'M';
    }

    /** Returns the 6-tag label string (one label character per word character) for a word of length len. */
    private static String sixTagLabels(int len) {
        switch (len) {
        case 0:
            return "";
        case 1:
            return "S";
        case 2:
            return "BE";
        case 3:
            return "B2E";
        default:
            StringBuilder tag = new StringBuilder(len);
            tag.append("B23");
            for (int k = 4; k < len; k++) {
                tag.append('M');
            }
            tag.append('E');
            return tag.toString();
        }
    }
}