package edu.fudan.nlp.cn.anaphora;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.regex.Pattern;

import edu.fudan.nlp.cn.tag.POSTagger;

/**
 * Detects entities and pronouns in a text.
 *
 * <p>The input is segmented and POS-tagged (either by the shared static
 * {@link POSTagger} or by the caller), adjacent words whose tags are accepted
 * by {@link #isPart(String, String)} are merged into candidate mentions, and
 * one {@link Entity} is produced per mention.
 *
 * <p>The tagger is stored in a static field and is therefore shared by every
 * instance of this class. The result list is an instance field that is
 * replaced on each {@code getEntiyList} call.
 *
 * @author jszhao
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class EntitiesGetter {

    /** POS tag compared against to recognise punctuation tokens. */
    private final static String PU = "标点";

    /** POS tag fragment used to recognise pronouns. */
    private static final String PN = "代词";

    /** Alternation of tag names treated as noun-like by {@link #isNN(String)}. */
    private final static String nn = "名词|代词|人名|地名|机构名|专有名";
    private final static Pattern isNN = Pattern.compile(nn);

    /** Entities collected by the most recent {@code getEntiyList} call. */
    private LinkedList<Entity> entityList;

    /** Shared tagger; stays {@code null} until one of the {@code initTagger} overloads is called. */
    static POSTagger tag;

    /** Tags whose words may take part in a mention. */
    Pattern pattern = Pattern.compile("形谓词|形容词|限定词|名词|人名|地名|机构名|专有名|序数词|代词|数字|量词");

    /** Tags that, when found while scanning left of a mention, mark it as "OBJ". */
    String obj = "副词|动词|介词|形谓词";
    Pattern isobj = Pattern.compile(obj);

    /**
     * Creates a getter without touching the shared tagger. Use
     * {@link #initTagger(POSTagger)} or pass pre-tagged input to
     * {@link #getEntiyList(String[][][], String)}.
     *
     * @throws Exception declared for backward compatibility; this body throws nothing
     */
    public EntitiesGetter() throws Exception {
    }

    /**
     * Creates a getter and initialises the shared tagger from the two model
     * paths if no tagger has been set yet.
     *
     * @param segmodel first argument forwarded to the {@link POSTagger} constructor
     * @param posmodel second argument forwarded to the {@link POSTagger} constructor
     * @throws Exception whatever the {@link POSTagger} constructor throws
     */
    public EntitiesGetter(String segmodel, String posmodel) throws Exception {
        initTagger(segmodel, posmodel);
    }

    /**
     * Replaces the shared tagger with the given instance.
     *
     * @param tagg tagger to use from now on
     */
    public static void initTagger(POSTagger tagg) {
        tag = tagg;
    }

    /**
     * Builds the shared tagger from the given models, but only when none is
     * set; an existing tagger is left untouched.
     *
     * @param segmodel first argument forwarded to the {@link POSTagger} constructor
     * @param posmodel second argument forwarded to the {@link POSTagger} constructor
     * @throws Exception whatever the {@link POSTagger} constructor throws
     */
    public static void initTagger(String segmodel, String posmodel) throws Exception {
        if (tag == null) {
            tag = new POSTagger(segmodel, posmodel);
        }
    }

    /**
     * Tells whether a tagged word may be part of a mention: either its tag
     * matches {@link #pattern}, or it is the word "的" tagged "结构助词".
     *
     * @param str  POS tag of the word
     * @param str2 the word itself
     */
    private boolean isPart(String str, String str2) {
        return pattern.matcher(str).find()
                || (str.equals("结构助词") && str2.equals("的"));
    }

    /**
     * Walks the tagged input and appends one {@link Entity} per detected
     * mention to {@link #entityList}.
     *
     * @param taggedstr per segment, {@code [0]} holds the words and {@code [1]}
     *                  the matching POS tags (presumably one segment per
     *                  sentence - confirm against the tagger)
     * @param str       the raw text the words were cut from; used to compute
     *                  character offsets
     */
    private void doIt(String[][][] taggedstr, String str) {
        // Running search position in str; deliberately carried across segments.
        int index = 0;

        for (int i = 0; i < taggedstr.length; i++) {
            String[] words = taggedstr[i][0];
            String[] pos = taggedstr[i][1];

            for (int j = 0; j < words.length; j++) {
                // NOTE(review): this counter is reset for every word, so it can only
                // be 0 or 1 and is 0 whenever the current word is not a "," / ":"
                // punctuation token. It looks like it was meant to count clauses
                // within the segment; left unchanged because consumers of
                // Entity.subDistance may rely on the current values - confirm.
                int subDistance = 0;
                if (pos[j].equals(PU) && (words[j].equals(",") || words[j].equals(":"))) {
                    subDistance++;
                }

                // "UNKONW" (sic) is a runtime value and is kept as-is.
                String isSing = "UNKONW";
                index = str.indexOf(words[j], index);

                if (!isPart(pos[j], words[j])) {
                    continue;
                }

                int id = j;
                String strdata = words[j];
                String strtag = pos[j];
                int flag = 0; // number of following words merged into the mention
                Entity ey = new Entity();

                // Always search from the running position. Searching from the start
                // of the text (as was done for j == 0) returned the offset of an
                // earlier occurrence of the word for every segment after the first.
                ey.setStart(str.indexOf(words[j], index));

                // Greedily merge directly adjacent words that may belong to the mention.
                while (j < words.length - 1) {
                    int cur = str.indexOf(words[j], index);
                    boolean isTogether = (cur + words[j].length()) == str.indexOf(words[j + 1], cur);
                    // A noun-like word followed by "的" ends the mention.
                    boolean isModify = !(isNN(pos[j]) && words[j + 1].equals("的"));
                    if (!(isModify && isTogether && isPart(pos[j + 1], words[j + 1]))) {
                        break;
                    }
                    if (pos[j].equals("数词")) {
                        boolean single = words[j].equals("一")
                                || words[j].equals("半")
                                || words[j].equals("1");
                        isSing = single ? "YES" : "NO";
                    }
                    strdata += words[j + 1];
                    strtag = pos[j + 1];
                    j++;
                    flag++;
                }

                // Pronouns and demonstrative phrases are the ones flagged via setIsResolution.
                boolean needsResolution = strtag.contains(PN)
                        || strdata.contains("这")
                        || strdata.contains("那")
                        || strdata.contains("该");
                ey.setIsResolution(needsResolution);

                // Cut words off the tail until the last remaining tag is noun-like.
                // NOTE(review): indexOf finds the FIRST occurrence of the word in the
                // mention, so a repeated word can cut more than the tail - confirm.
                int jj = j;
                while (!isNN(strtag) && jj >= 0) {
                    int ij = strdata.indexOf(words[jj]);
                    if (ij < 0) {
                        break;
                    }
                    strdata = strdata.substring(0, ij);
                    jj--;
                    flag--;
                    if (jj >= 0) {
                        strtag = pos[jj];
                    }
                }
                if (strdata.length() == 0) {
                    continue;
                }

                // Drop a leading "的" and shift the start offset accordingly.
                if (strdata.indexOf("的") == 0) {
                    strdata = strdata.substring(1);
                    ey.setStart(ey.getStart() + 1);
                }
                ey.setPosTag(strtag);
                ey.setData(strdata);

                if (isSingular(ey.getData())) {
                    isSing = "YES";
                } else if (isNotSingular(ey.getData())) {
                    isSing = "NO";
                }

                ey.setSex("UNKONW");
                if (this.isFemale(ey.getData())) {
                    ey.setSex("Female");
                } else if (this.isMale(ey.getData())) {
                    ey.setSex("Male");
                }

                // Scan leftwards from index (j - flag - 1) until punctuation or the
                // segment start; hitting a tag accepted by isObj marks the mention "OBJ".
                String graTag = "SUB";
                while ((j - flag - 1) >= 0 && !pos[j - flag - 1].equals(PU)) {
                    if (isObj(pos[j - flag - 1])) {
                        graTag = "OBJ";
                        break;
                    }
                    flag++;
                }

                // NOTE(review): every other tag in this class is a Chinese name
                // (e.g. "结构助词"); "DEG" may never match with this tagger - confirm.
                if (j < words.length - 1 && pos[j + 1].equals("DEG") && words[j + 1].equals("的")) {
                    graTag = "ADJ";
                }

                ey.setId(id);
                ey.setGraTag(graTag);
                ey.setSingleOrNot(isSing);
                ey.setDistance(i);
                ey.setSubDistance(subDistance);
                ey.setEnd(ey.getStart() + ey.getData().length());
                this.entityList.add(ey);
            }
        }
    }

    /**
     * Tells whether a POS tag contains one of the noun-like tag names
     * (noun, pronoun, person, place, organisation or proper name).
     *
     * @param strtag POS tag to test
     * @return {@code true} if any of the names occurs in {@code strtag}
     */
    public boolean isNN(String strtag) {
        return isNN.matcher(strtag).find();
    }

    /** Tells whether a POS tag contains one of the names listed in {@link #obj}. */
    private boolean isObj(String string) {
        return isobj.matcher(string).find();
    }

    /** Keyword heuristic: does the mention text look singular? */
    private boolean isSingular(String str) {
        return str.contains("这个") || str.contains("这种")
                || str.contains("每") || str.equals("他")
                || str.equals("它") || str.equals("她");
    }

    /** Keyword heuristic: does the mention text look plural? */
    private boolean isNotSingular(String str) {
        return str.startsWith("各") || str.contains("群")
                || str.contains("多") || str.startsWith("二者")
                || str.startsWith("全体") || str.startsWith("所有")
                || str.contains("们");
    }

    /** Keyword heuristic: does the mention text contain a female marker? */
    private boolean isFemale(String str) {
        return str.contains("娘") || str.contains("妻")
                || str.contains("媳") || str.contains("姑")
                || str.contains("夫人") || str.contains("她")
                || str.contains("小姐") || str.contains("女")
                || str.contains("母") || str.contains("妞")
                || str.contains("妈") || str.contains("妇")
                || str.contains("婆");
    }

    /** Keyword heuristic: does the mention text contain a male marker? */
    private boolean isMale(String str) {
        return str.contains("先生") || str.contains("男")
                || str.contains("丈夫") || str.contains("父")
                || str.contains("兄") || str.contains("儿子")
                || str.contains("哥");
    }

    /**
     * Tags {@code str} with the shared tagger and extracts its entities.
     *
     * @param str raw text
     * @return a fresh list of the entities found, in order of detection
     * @throws IllegalStateException if no tagger has been initialised
     * @throws Exception whatever the tagger throws
     */
    public LinkedList<Entity> getEntiyList(String str) throws Exception {
        if (tag == null) {
            throw new IllegalStateException(
                    "POSTagger is not initialised; call initTagger(...) or use the two-argument constructor");
        }
        entityList = new LinkedList<Entity>();
        String[][][] stringTag = tag.tag2DoubleArray(str);
        this.doIt(stringTag, str);
        return this.entityList;
    }

    /**
     * Extracts entities from input that is already segmented and tagged; the
     * shared tagger is not used.
     *
     * @param stringTag per segment, {@code [0]} the words and {@code [1]} the POS tags
     * @param str       the raw text the words were cut from
     * @return a fresh list of the entities found, in order of detection
     */
    public LinkedList<Entity> getEntiyList(String[][][] stringTag, String str) {
        entityList = new LinkedList<Entity>();
        this.doIt(stringTag, str);
        return this.entityList;
    }

    /**
     * Small demo: loads the models from {@code ./models}, runs the extractor
     * on a sample paragraph and prints text, resolution flag and grammatical
     * tag of every entity, tab-separated.
     */
    public static void main(String args[]) throws Exception {
        EntitiesGetter ep = new EntitiesGetter("./models/seg.m", "./models/pos.m");
        LinkedList<Entity> list = ep.getEntiyList("尽管美韩媒体提出种种猜测,"
                + "不过韩联社17日称,美国太平洋司令部司令塞缪尔·洛克利尔承认,"
                + "尚无法确认该导弹是真品还是仿制品,也难以评价。他同时强调,"
                + "若朝鲜进行第三次核试验,美军可能对朝鲜核试验基地进行精确打击。");
        Iterator<Entity> it = list.iterator();
        while (it.hasNext()) {
            Entity ey = it.next();
            System.out.print(ey.getData() + "\t" + ey.getIsResolution() + "\t" + ey.getGraTag() + "\n");
        }
    }
}