/**
* Created by jpbirdy on 14-11-23.
*/
package jpbirdy.segment;
import jpbirdy.segment.util.Jumper;
import jpbirdy.segment.util.PartOfSpeech;
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* @author jpbirdy
* @project Segmentation
* @class Segmenter
* @date 14-11-23 10:56
* @desc 分词器
*/
public class Segmenter {
public static final int minTokenFrequency = 2;
private static SegDict dict;
static {
try {
loadDictionary("main/resources/dict.txt");
}
catch (IOException e) {
e.printStackTrace();
}
}
/**
* 当一个分词既出现在用户词典也出现在通用词典中,则优先使用用户词典。
* 词典的格式为(每个分词一行):
* 分词文本 频率 词性
*
* @param file 加载的字典文件,多个用逗号分隔
* @return 字典结构
* @see jpbirdy.segment.SegDict
*/
public static SegDict loadDictionary(String file) throws IOException {
dict = new SegDict();
String[] files = file.split(",");
for (int i = 0; i < files.length; i++) {
System.out.println("正在载入字典 " + files[i]);
// File dictFile = new File(files[i]);
InputStream is = Segmenter.class.getClassLoader().getResourceAsStream(files[i]);
if (is == null) {
System.err.println("字典文件不存在!");
continue;
}
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
String line;
String text;
int frequency;
PartOfSpeech pos;
while ((line = br.readLine()) != null) {
// System.out.println("read line:" + line);
String[] lineSplit = line.split(" ");
if (lineSplit.length < 2)
continue;
if (lineSplit.length == 2) {
text = lineSplit[0];
System.out.println(lineSplit[0] + lineSplit[1]);
frequency = Integer.parseInt(lineSplit[1]);
pos = new PartOfSpeech();
}
else {
text = lineSplit[0];
frequency = Integer.parseInt(lineSplit[1]);
pos = new PartOfSpeech(lineSplit[2]);
}
// System.out.println(text + ' ' + frequency + pos);
if (frequency < minTokenFrequency) {
//词频过低忽略
continue;
}
//添加分词
List<String> words = splitTextToWords(text);
SegToken token = new SegToken(words, frequency, 0, pos);
dict.addToken(token);
}
}
// 计算每个分词的路径值
double logTotalFrequency = Math.log(dict.getTotalFrequency());
for (SegToken segToken : dict.getTokens()) {
segToken.setDistance(logTotalFrequency - Math.log(segToken.getFrequency()));
}
for (SegToken segToken : dict.getTokens()) {
List<Segment> segments = segmentWords(segToken.getText(), true);
// 计算需要添加的子分词数目
int numTokensToAdd = 0;
for (Segment segment : segments) {
//长度为1的分词自动过滤
if (segment.getToken().getText().size() > 1) {
numTokensToAdd++;
}
}
List<Segment> listTemp = new ArrayList<Segment>(Collections.nCopies(numTokensToAdd, new Segment()));
segToken.setSegments(listTemp);
//添加子分词
int iSegmentsToAdd = 0;
for (Segment segment : segments) {
if (segment.getToken().getText().size() > 1) {
segToken.getSegments().set(iSegmentsToAdd, segments.get(iSegmentsToAdd));
}
}
}
//词典添加完毕
System.out.println("词典添加完毕");
return dict;
}
public List<String> splitTextToWords2(String word) {
List<String> text = new ArrayList<String>();
for (int i = 0; i < word.length(); i++) {
text.add(word.charAt(i) + "");
}
return text;
}
public static List<String> splitTextToWords(String word) {
List<String> text = new ArrayList<String>();
int pattern = 0;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < word.length(); i++) {
char wordChar = word.charAt(i);
switch (pattern) {
//无状态
case 0:
if ((wordChar >= 'A' && wordChar <= 'Z') || (wordChar >= 'a' && wordChar <= 'z')) {
sb.append(wordChar);
pattern = 1;
}
else if ((wordChar >= '0' && wordChar <= '9')) {
sb.append(wordChar);
pattern = 2;
}
else {
sb.append(wordChar);
text.add(sb.toString());
sb.setLength(0);
pattern = 0;
}
break;
//前一个是字母
case 1:
if ((wordChar >= 'A' && wordChar <= 'Z') || (wordChar >= 'a' && wordChar <= 'z')) {
sb.append(wordChar);
}
else {
text.add(sb.toString());
sb.setLength(0);
pattern = 0;
i--;
}
break;
case 2:
if (Character.isDigit(wordChar)) {
sb.append(wordChar);
}
else {
if (wordChar == '%') {
sb.append(wordChar);
i++;
}
text.add(sb.toString());
sb.setLength(0);
pattern = 0;
i--;
}
break;
default:
pattern = 0;
sb.setLength(0);
break;
}
}
if (sb.length() > 0)
text.add(sb.toString());
return text;
}
// 更新跳转信息:
// 1. 当该位置从未被访问过时(jumper.minDistance为零的情况),或者
// 2. 当该位置的当前最短路径大于新的最短路径时
// 将当前位置的最短路径值更新为baseDistance加上新分词的概率
public static Jumper updateJumper(Jumper jumper, double baseDistance, SegToken token) {
double newDistance = baseDistance + token.getDistance();
if (jumper.getMinDistance() == 0 || jumper.getMinDistance() > newDistance) {
jumper.setMinDistance(newDistance);
jumper.setToken(token);
}
return jumper;
}
public static List<Segment> segmentWords(List<String> text, boolean searchMode) {
//搜索模式下该分词已无继续划分可能的情况
if (searchMode && text.size() == 1) {
return new ArrayList<Segment>();
}
List<Jumper> jumpers = new ArrayList<Jumper>(text.size());
for (int i = 0; i < text.size(); i++)
jumpers.add(new Jumper());
List<SegToken> tokens = new ArrayList<SegToken>(dict.getMaxLength());
for (int i = 0; i < dict.getMaxLength(); i++)
tokens.add(new SegToken());
for (int current = 0; current < text.size(); current++) {
double baseDistance = 0;
if (current == 0)
baseDistance = 0;
else
baseDistance = jumpers.get(current - 1).getMinDistance();
// 寻找所有以当前字元开头的分词
int numTokens = dict.lookupTokens(makeSub(text, current, current + dict.getMaxLength()), tokens);
// 对所有可能的分词,更新分词结束字元处的跳转信息
for (int iToken = 0; iToken < numTokens; iToken++) {
int location = current + tokens.get(iToken).getText().size() - 1;
if (!searchMode || current != 0 || location != text.size() - 1) {
jumpers.set(location, updateJumper(jumpers.get(location), baseDistance, tokens.get(iToken)));
}
}
// 当前字元没有对应分词时补加一个伪分词
if (numTokens == 0 || tokens.get(0).getText().size() > 1) {
jumpers.set(current, updateJumper(jumpers.get(current), baseDistance,
new SegToken(splitTextToWords(text.get(current)), 1, 32, new PartOfSpeech("x"))));
}
}
// 从后向前扫描第一遍得到需要添加的分词数目
int numSeg = 0;
for (int index = text.size() - 1; index >= 0; ) {
int location = index - jumpers.get(index).getToken().getText().size() + 1;
numSeg++;
index = location - 1;
}
// 从后向前扫描第二遍添加分词到最终结果
List<Segment> outputSegment = new ArrayList<Segment>(numSeg);
for (int i = 0; i < numSeg; i++)
outputSegment.add(new Segment());
for (int index = text.size() - 1; index >= 0; ) {
int location = index - jumpers.get(index).getToken().getText().size() + 1;
numSeg--;
outputSegment.get(numSeg).setToken(jumpers.get(index).getToken());
index = location - 1;
}
int bytePosition = 0;
for (Segment anOutputSegment : outputSegment) {
anOutputSegment.setStart(bytePosition);
bytePosition += textSliceByteLength(anOutputSegment.getToken().getText());
anOutputSegment.setEnd(bytePosition);
}
return outputSegment;
}
// 返回多个字元的字节总长度
public static int textSliceByteLength(List<String> words) {
int length = 0;
for (String anWord : words) {
length += anWord.length();
}
return length;
}
public static List<String> makeSub(List<String> text, int begin, int end) {
List<String> ret = new ArrayList<String>();
for (int i = begin; i < text.size() && i <= end; i++)
ret.add(text.get(i));
return ret;
}
public List<Segment> internalSegment(String string, boolean searchMode) {
if (string.length() == 0) {
return new ArrayList<Segment>();
}
List<String> text = splitTextToWords(string);
return segmentWords(text, searchMode);
}
public List<Segment> segment(String string) {
return internalSegment(string, false);
}
}