PartsOfSpeechTag.java example

Explorer
fudannlp-master
package edu.fudan.example.nlp;

import java.util.ArrayList;

import edu.fudan.ml.types.Dictionary;
import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.nlp.cn.tag.POSTagger;

/**
 * 词性标注使用示例
 * @author xpqiu
 *
 */
public class PartsOfSpeechTag {
	
	static POSTagger tag;

	/**
	 * 主程序
	 * @param args
	 * @throws IOException 
	 * @throws  
	 */
	public static void main(String[] args) throws Exception {

		
		CWSTagger cws = new CWSTagger("./models/seg.m");
		tag = new POSTagger(cws,"models/pos.m");
		
		System.out.println("得到支持的词性标签集合");
		System.out.println(tag.getSupportedTags());
		System.out.println(tag.getSupportedTags().size());
		System.out.println("\n");
		
		String str = "媒体计算研究所成立了，高级数据挖掘很难。乐phone很好！";
		String s = tag.tag(str);
		System.out.println("处理未分词的句子");
		System.out.println(s);
		
		System.out.println("使用英文标签");
		tag.SetTagType("en");		
		System.out.println(tag.getSupportedTags());
		System.out.println(tag.getSupportedTags().size());
		s = tag.tag(str);
		System.out.println(s);		
		System.out.println();
		
		CWSTagger cws2 = new CWSTagger("./models/seg.m", new Dictionary("./models/dict.txt"));
		
		//bool值指定该dict是否用于cws分词（分词和词性可以使用不同的词典）
		tag = new POSTagger(cws2, "models/pos.m"
				, new Dictionary("./models/dict.txt"), true);//true就替换了之前的dict.txt
		tag.removeDictionary(false);//不移除分词的词典
		tag.setDictionary(new Dictionary("./models/dict.txt"), false);//设置POS词典，分词使用原来设置
		
		String str2 = "媒体计算研究所成立了，高级数据挖掘很难。乐phone很好！";
		String s2 = tag.tag(str2);
		System.out.println("处理未分词的句子，使用词典");
		System.out.println(s2);
		System.out.println();
		
		Dictionary dict = new Dictionary();
		dict.add("媒体计算","mypos1","mypos2");
		dict.add("乐phone","专有名");
		tag.setDictionary(dict, true);
		String s22 = tag.tag(str2);
		System.out.println(s22);
		System.out.println();
		
		POSTagger tag1 = new POSTagger("./models/pos.m");
		String str1 = "媒体计算 研究所 成立 了 , 高级 数据挖掘 很 难";
		String[] w = str1.split(" ");
		String[] s1 = tag1.tagSeged(w);
		System.out.println("直接处理分好词的句子");
		for(int i=0;i<s1.length;i++){
			System.out.print(w[i]+"/"+s1[i]+" ");
		}
		System.out.println("\n");
		
		POSTagger tag3 = new POSTagger("./models/pos.m", new Dictionary("./models/dict.txt"));
		String str3 = "媒体计算 研究所 成立 了 , 高级 数据挖掘 很 难 ";
		String[] w3 = str3.split(" ");
		String[] s3 = tag3.tagSeged(w3);
		System.out.println("直接处理分好词的句子，使用词典");
		for(int i=0;i<s3.length;i++){
			System.out.print(w3[i]+"/"+s3[i]+" ");
		}
		System.out.println("\n");
		
		
		System.out.println("重新构造");
		cws = new CWSTagger("./models/seg.m");
		tag = new POSTagger(cws,"models/pos.m");
		str = "媒体计算研究所成立了, 高级数据挖掘很难";
		System.out.println(tag.tag(str));
		String[][] sa = tag.tag2Array(str);
		for(int i = 0; i < sa.length; i++) {
			for(int j = 0; j < sa[i].length; j++) {
					System.out.print(sa[i][j] + " ");
			}
			System.out.println();
		}
		
		String s4 = tag.tagFile("./example-data/data-tag.txt");
		System.out.println("\n处理文件：");
		System.out.println(s4);
	}

}