package edu.fudan.nlp.corpus.fnlp;
import java.util.ArrayList;
import java.util.List;
import edu.fudan.nlp.parser.dep.DependencyTree;
import edu.fudan.util.MyStrings;
/**
 * A sentence in the FudanNLP standard data format.
 *
 * <p>The sentence is stored as four parallel arrays indexed by word position:
 * the words, their part-of-speech tags, the index of each word's head and the
 * dependency relation type. Any array other than {@link #words} may be
 * {@code null} or only partially filled when that layer of annotation is
 * missing.
 *
 * @since FudanNLP 1.5
 */
public class FNLPSent {
    /** Words of the sentence. */
    public String[] words;
    /** Part-of-speech tag of each word. */
    public String[] tags;
    /** Index of each word's head; {@link #toTree()} treats {@code -1} as the root marker. */
    public int[] heads;
    /** Dependency relation type of each word. */
    public String[] relations;
    /**
     * Index base handed to {@link #parse(List, int, boolean)} by the list
     * constructor; defaults to 0.
     */
    private int start = 0;

    /**
     * Creates a sentence from already prepared arrays. The arrays are stored
     * by reference, not copied.
     *
     * @param words     word array
     * @param tags      part-of-speech tag array
     * @param heads     head (parent node) index array
     * @param relations relation type array
     */
    public FNLPSent(String[] words, String[] tags, int[] heads, String[] relations) {
        this.words = words;
        this.tags = tags;
        this.heads = heads;
        this.relations = relations;
    }

    /**
     * Creates a sentence by parsing one line per word. Every line is expected
     * to start with an index column, and indices are taken to be 0-based.
     *
     * @param list the lines of one sentence
     * @see #parse(List, int, boolean)
     */
    public FNLPSent(List<String> list) {
        parse(list, start, true);
    }

    /**
     * Creates an empty sentence; all arrays stay {@code null} until
     * {@link #parse(List, int, boolean)} or {@link #put(String)} is called.
     */
    public FNLPSent() {
    }

    /**
     * Creates a sentence with room for {@code len} words; all slots are unset.
     *
     * @param len number of words
     */
    public FNLPSent(int len) {
        this.words = new String[len];
        this.tags = new String[len];
        this.heads = new int[len];
        this.relations = new String[len];
    }

    /**
     * Parses a sentence given as one whitespace-separated line per word and
     * replaces all four arrays with the result.
     *
     * <p>Column layout of a line: {@code [index] word [tag [head [relation]]]},
     * for example {@code "上海 NR 2 NMOD"}. Trailing columns may be omitted; the
     * corresponding array slots then keep their default value. Surrounding
     * whitespace on a line is ignored.
     *
     * @param list the lines of one sentence
     * @param pos  index base used by the input; it is subtracted from the head
     *             column so that stored heads line up with array positions
     * @param b    {@code true} if every line starts with an index column
     * @throws NumberFormatException          if a head column is not an integer
     * @throws ArrayIndexOutOfBoundsException if a line has no word column
     */
    public void parse(List<String> list, int pos, boolean b) {
        int len = list.size();
        words = new String[len];
        tags = new String[len];
        heads = new int[len];
        relations = new String[len];
        // Column holding the word: shifted right by one when an index column leads.
        int wordCol = b ? 1 : 0;
        for (int j = 0; j < len; j++) {
            // Trim first: a leading separator would otherwise yield an empty
            // first token and shift every column by one.
            String[] toks = list.get(j).trim().split("\\s+");
            if (b) {
                // Sanity check only (active with -ea): the index column must
                // match the line's position in the list.
                assert (j + pos) == Integer.parseInt(toks[0]);
            }
            words[j] = toks[wordCol];
            if (toks.length > wordCol + 1) {
                tags[j] = toks[wordCol + 1];
            }
            if (toks.length > wordCol + 2) {
                heads[j] = Integer.parseInt(toks[wordCol + 2]) - pos;
            }
            if (toks.length > wordCol + 3) {
                relations[j] = toks[wordCol + 3];
            }
            if (toks.length > wordCol + 4) {
                // More columns than the format defines: warn but keep going.
                System.err.println("格式列表太多!");
            }
        }
    }

    /**
     * Renders the sentence with one line per word: the array position followed
     * by the word and, for every non-{@code null} array, its tag, head and
     * relation, separated by tabs. Positions whose word is {@code null} are
     * skipped.
     *
     * @return the tab-separated representation, or an empty string when no
     *         words have been set
     */
    @Override
    public String toString() {
        if (words == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            if (words[j] == null) {
                continue;
            }
            sb.append(j);
            sb.append("\t");
            sb.append(words[j]);
            if (tags != null) {
                sb.append("\t");
                sb.append(tags[j]);
            }
            if (heads != null) {
                sb.append("\t");
                sb.append(heads[j]);
            }
            if (relations != null) {
                sb.append("\t");
                sb.append(relations[j]);
            }
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Returns the number of word slots in the sentence.
     *
     * @return the length of {@link #words}, or 0 when no words have been set
     */
    public int size() {
        return words == null ? 0 : words.length;
    }

    /**
     * Loads a segmented-only sentence: the line is split into words on runs of
     * whitespace, including the ideographic space (U+3000) and the no-break
     * space (U+00A0). Only {@link #words} is replaced; the other arrays are
     * left untouched.
     *
     * @param line the segmented sentence
     */
    public void put(String line) {
        // The non-ASCII separators are written as escapes so they stay visible
        // and survive re-encoding of this source file.
        words = line.split("[\\s\\u3000\\u00A0]+");
    }

    /**
     * Tells whether the sentence carries part-of-speech tags, judged by the
     * first word only.
     *
     * @return {@code true} if the first tag is set; {@code false} if it is not
     *         or if there are no tags at all
     */
    public boolean hasTag() {
        return tags != null && tags.length > 0 && tags[0] != null;
    }

    /**
     * Builds a dependency tree from the arrays. One node is created per word
     * from its position, word, tag and relation; each node is then attached to
     * the node at its head index, as a left child when the head comes later in
     * the sentence and as a right child otherwise.
     *
     * <p>All four arrays must be populated and every head must be {@code -1}
     * or a valid word index.
     *
     * @return the node whose head is {@code -1} (the last such node if there
     *         are several), or {@code null} if there is none
     */
    public DependencyTree toTree() {
        List<DependencyTree> nodes = new ArrayList<DependencyTree>(words.length);
        for (int j = 0; j < words.length; j++) {
            nodes.add(new DependencyTree(j, words[j], tags[j], relations[j]));
        }
        DependencyTree root = null;
        for (int j = 0; j < words.length; j++) {
            int head = heads[j];
            DependencyTree node = nodes.get(j);
            if (head == -1) {
                root = node;
            } else if (head > j) {
                nodes.get(head).addLeftChild(node);
            } else {
                nodes.get(head).addRightChild(node);
            }
        }
        return root;
    }

    /**
     * Returns the sentence as a single string built from {@link #words}.
     *
     * @return the result of {@code MyStrings.toString(words, "")} — presumably
     *         the words joined with an empty delimiter; confirm against
     *         {@code MyStrings}
     */
    public String getSentenceString() {
        return MyStrings.toString(words, "");
    }
}