FNLPSent.java example

Explorer
fudannlp-master
package edu.fudan.nlp.corpus.fnlp;

import java.util.ArrayList;
import java.util.List;

import edu.fudan.nlp.parser.dep.DependencyTree;
import edu.fudan.util.MyStrings;

/**
 * Standard FudanNLP sentence format: four parallel, per-token arrays holding
 * the word, its part-of-speech tag, the index of its dependency head and the
 * dependency relation label.
 * <p>
 * The arrays are public and mutable; they may be {@code null} (no-arg
 * constructor) or of different lengths (after {@link #put(String)}), so the
 * accessors below tolerate both situations.
 * @since FudanNLP 1.5
 */
public class FNLPSent {

	/**
	 * Separator accepted by {@link #put(String)}: ASCII whitespace, the
	 * ideographic (full-width) space U+3000 and the no-break space U+00A0.
	 * Written with Unicode escapes so the pattern does not depend on the
	 * encoding the source file is compiled with.
	 */
	private static final String WORD_SEPARATOR = "[\\s\\u3000\\u00A0]+";

	/** Words of the sentence, one entry per token. */
	public String[] words;
	/** Part-of-speech tag of each token; an entry is {@code null} when the input had no tag column. */
	public String[] tags;
	/** Head index of each token, stored relative to the {@code pos} passed to {@link #parse}. */
	public int[] heads;
	/** Dependency relation label of each token; an entry is {@code null} when the input had no such column. */
	public String[] relations;
	/**
	 * Index base handed to {@link #parse} by the list constructor; defaults to 0.
	 */
	private int start = 0;

	/**
	 * Wraps the given arrays without copying them.
	 *
	 * @param words word array
	 * @param tags part-of-speech tag array
	 * @param heads head (parent node) index array
	 * @param relations dependency relation label array
	 */
	public FNLPSent(String[] words,String[] tags, int[] heads, String[] relations) {
		this.words  = words;
		this.tags = tags;
		this.heads = heads;
		this.relations = relations;
	}

	/**
	 * Parses one sentence whose lines start with a sequence-number column,
	 * using the index base {@code start} (0).
	 *
	 * @param list one line per token, see {@link #parse(List, int, boolean)}
	 */
	public FNLPSent(List<String> list) {
		parse(list,start,true);
	}

	/**
	 * Creates an empty sentence: all four arrays stay {@code null} until
	 * {@link #parse}, {@link #put} or direct field assignment fills them.
	 */
	public FNLPSent() {
	}

	/**
	 * Creates a sentence with room for {@code len} tokens; every array is
	 * allocated but left at its default values.
	 *
	 * @param len number of tokens
	 */
	public FNLPSent(int len) {
		this.words  = new String[len];
		this.tags = new String[len];
		this.heads = new int[len];
		this.relations = new String[len];
	}

	/**
	 * Fills the four arrays from a list of whitespace-separated lines, one
	 * line per token. The columns are, in order: an optional sequence number,
	 * the word, the tag, the head index and the relation label, e.g.
	 * {@code "1	上海	NR	2	NMOD"}. Columns missing at the end of a line leave
	 * the corresponding array entry at its default ({@code null} or 0).
	 * Extra columns only trigger a warning on {@code System.err}.
	 *
	 * @param list the sentence, one token per line
	 * @param pos index base used in the input: the sequence number expected on
	 *            the first line, and the value subtracted from every head
	 *            column before it is stored
	 * @param b whether the first column of each line is a sequence number
	 * @throws NumberFormatException if a head column is not an integer
	 * @throws ArrayIndexOutOfBoundsException if {@code b} is true and a line
	 *            holds no token after the sequence number
	 */
	public void parse(List<String> list,int pos,boolean b) {

		int len = list.size();
		words = new String[len];
		tags = new String[len];
		heads = new int[len];
		relations = new String[len];
		// Column of the word: shifted right by one when a sequence-number column is present.
		int wordCol = b ? 1 : 0;
		for(int j=0;j<len;j++){

			// Trim first: a leading separator would otherwise produce an empty
			// first token and shift every following column by one.
			String[] toks = list.get(j).trim().split("\\s+");
			if(b){
				// Sanity check only (active with -ea): sequence numbers must be consecutive from pos.
				assert (j+pos) == Integer.parseInt(toks[0]);
			}

			words[j] = toks[wordCol];
			if(toks.length>wordCol+1)
				tags[j] = toks[wordCol+1];
			if(toks.length>wordCol+2)
				heads[j] = Integer.parseInt(toks[wordCol+2])-pos;
			if(toks.length>wordCol+3)
				relations[j] = toks[wordCol+3];
			if(toks.length>wordCol+4)
				System.err.println("格式列表太多！");
		}

	}

	/**
	 * Renders one line per non-null word: the token index, the word and then
	 * the tag, head and relation columns, tab-separated and terminated by a
	 * newline. A column is omitted when its array is {@code null} or shorter
	 * than {@code words}.
	 *
	 * @return the tabular form of the sentence, or an empty string when
	 *         {@code words} is {@code null}
	 */
	@Override
	public String toString(){
		StringBuilder sb = new StringBuilder();
		if(words==null)
			return sb.toString();
		for(int j=0;j<words.length;j++){
			if(words[j]==null)
				continue;
			sb.append(j);
			sb.append("\t");
			sb.append(words[j]);
			// put() may have replaced words with a longer array than the
			// other columns, hence the explicit length checks.
			if(tags!=null && j<tags.length){
				sb.append("\t");
				sb.append(tags[j]);
			}
			if(heads!=null && j<heads.length){
				sb.append("\t");
				sb.append(heads[j]);
			}
			if(relations!=null && j<relations.length){
				sb.append("\t");
				sb.append(relations[j]);
			}
			sb.append("\n");
		}
		return sb.toString();
	}

	/**
	 * @return the number of words, or 0 when {@code words} is {@code null}
	 */
	public int size() {
		return words==null ? 0 : words.length;
	}

	/**
	 * Loads a sentence that is only word-segmented: {@code words} is replaced
	 * by the pieces of {@code line} between runs of separators (ASCII
	 * whitespace, U+3000, U+00A0). The other arrays are left untouched.
	 *
	 * @param line the segmented sentence
	 */
	public void put(String line) {
		words = line.split(WORD_SEPARATOR);
	}

	/**
	 * Tells whether the sentence carries part-of-speech tags, judged by the
	 * first token only.
	 *
	 * @return {@code true} if {@code tags} is non-null, non-empty and its first
	 *         entry is non-null
	 */
	public boolean hasTag() {
		return tags!=null && tags.length>0 && tags[0]!=null;
	}

	/**
	 * Builds a dependency tree from the arrays. One {@code DependencyTree}
	 * node is created per token; each token whose head is not -1 is attached
	 * to its head node, as a left child when the head index is greater than
	 * the token's index and as a right child otherwise.
	 *
	 * @return the node whose head is -1 (the last one if there are several),
	 *         or {@code null} if no token has head -1
	 */
	public DependencyTree toTree() {
		List<DependencyTree> nodes = new ArrayList<DependencyTree>();
		DependencyTree root = null;
		// First pass: create every node so heads can refer to later tokens.
		for(int j=0;j<words.length;j++){
			nodes.add(new DependencyTree(j, words[j],tags[j], relations[j]));
		}
		// Second pass: link each node to its head.
		for(int j=0;j<words.length;j++){
			int head = heads[j];
			if(head==-1){
				root = nodes.get(j);
			}else if(head>j){
				nodes.get(head).addLeftChild(nodes.get(j));
			}else{
				nodes.get(head).addRightChild(nodes.get(j));
			}
		}
		return root;
	}

	/**
	 * @return the result of {@code MyStrings.toString(words, "")} —
	 *         presumably the words joined with an empty delimiter; confirm
	 *         against {@code MyStrings}
	 */
	public String getSentenceString() {
		return MyStrings.toString(words, "");
	}
}