/**
* Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
* EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
* http://www.ewcms.com
*/
package com.ewcms.content.document.util.search;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import com.ewcms.common.io.HtmlFileUtil;
import com.ewcms.common.io.HtmlNumberUtil;
import com.ewcms.common.io.HtmlStringUtil;
import com.ewcms.content.document.util.analyzer.IKSegmentation;
import com.ewcms.content.document.util.analyzer.Lexeme;
import com.ewcms.content.document.util.analyzer.lucene.IKAnalyzer;
import com.ewcms.content.document.util.analyzer.lucene.IKQueryParser;
/**
 * Extracts keywords and a short summary from article content.
 *
 * @author 吴智俊
 */
public class ExtractKeywordAndSummary {

    /** Runs of two or more whitespace / ideographic-space characters, collapsed to one space. */
    private static final Pattern REPEATED_WHITESPACE = Pattern.compile("[\\s\\u0020\u3000]{2,}");

    /** Orders candidates by adjusted count, highest first, without relying on int subtraction. */
    private static final Comparator<WordCount> BY_COUNT_DESCENDING = new Comparator<WordCount>() {
        @Override
        public int compare(WordCount left, WordCount right) {
            if (left.count == right.count) {
                return 0;
            }
            return left.count > right.count ? -1 : 1;
        }
    };

    /** Text of {@code wordfilter.dic}; loaded on first use, empty if it could not be read. */
    private static String filterWords;

    /** Text of {@code charfilter.dic}; loaded on first use, empty if it could not be read. */
    private static String filterChars;

    /** A candidate keyword together with its adjusted occurrence count. */
    private static final class WordCount {
        final String word;
        final int count;

        WordCount(String word, int count) {
            this.word = word;
            this.count = count;
        }
    }

    /**
     * Converts an HTML fragment to plain text and collapses repeated whitespace.
     *
     * <p>The text comes from {@code HtmlStringUtil.getPureText}; when that yields an empty
     * result, {@code HtmlStringUtil.clearHtmlTag} is used instead.
     *
     * @param html the HTML to convert; {@code null} is treated as empty
     * @return the extracted text with every run of two or more whitespace characters replaced
     *         by a single space; never {@code null}
     */
    public static String getTextFromHtml(String html) {
        if (html == null) {
            return "";
        }
        String text = HtmlStringUtil.getPureText(html);
        if (HtmlStringUtil.isEmpty(text)) {
            text = HtmlStringUtil.clearHtmlTag(html);
        }
        if (text == null) {
            return "";
        }
        return REPEATED_WHITESPACE.matcher(text).replaceAll(" ");
    }

    /**
     * Picks keywords from the given (HTML) content.
     *
     * <p>The content is converted to plain text and tokenized with {@code IKSegmentation}.
     * Tokens are counted, tokens rejected by {@link #filter(String)} are dropped, and each
     * remaining token's count is reduced by the counts of all other tokens that contain it.
     * Candidates are then sorted by that adjusted count (highest first) and selected:
     * the first three whose count is not 1, a fourth only if its count exceeds 15, and a fifth
     * only if its count exceeds 20. If between one and three keywords were chosen, further
     * candidates among the first five ranked entries are added when their count is at least
     * one less than the reference count.
     *
     * <p>If tokenizing fails with an {@code IOException}, the stack trace is printed and an
     * empty array is returned.
     *
     * @param content the article content, possibly containing HTML
     * @return the selected keywords, in selection order; never {@code null}
     */
    public static String[] getKeyword(String content) {
        String text = getTextFromHtml(content);
        List<String> keywords = new ArrayList<String>();
        try {
            Map<String, Integer> frequencies = countWords(text);
            List<WordCount> ranked = rankCandidates(frequencies);
            selectKeywords(ranked, keywords);
        } catch (IOException e) {
            // Best effort: tokenizing failed, so no keywords are reported.
            e.printStackTrace();
        }
        return keywords.toArray(new String[keywords.size()]);
    }

    /** Tokenizes {@code text} and counts every token whose length is not exactly one, in first-seen order. */
    private static Map<String, Integer> countWords(String text) throws IOException {
        IKSegmentation segmentation = new IKSegmentation(new StringReader(text));
        Map<String, Integer> frequencies = new LinkedHashMap<String, Integer>();
        for (Lexeme lexeme = segmentation.next(); lexeme != null; lexeme = segmentation.next()) {
            String word = lexeme.getLexemeText();
            if (word == null || word.length() == 1) {
                continue;
            }
            Integer seen = frequencies.get(word);
            frequencies.put(word, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }
        return frequencies;
    }

    /**
     * Builds the candidate list: filtered words with their count reduced by the counts of all
     * other words containing them, sorted by adjusted count in descending order (stable).
     */
    private static List<WordCount> rankCandidates(Map<String, Integer> frequencies) {
        List<WordCount> ranked = new ArrayList<WordCount>();
        for (Map.Entry<String, Integer> candidate : frequencies.entrySet()) {
            String word = candidate.getKey();
            if (!filter(word)) {
                continue;
            }
            int adjusted = candidate.getValue().intValue();
            for (Map.Entry<String, Integer> other : frequencies.entrySet()) {
                String otherWord = other.getKey();
                // Map keys are unique, so "a different key" is the same as "a different entry".
                if (!otherWord.equals(word) && otherWord.indexOf(word) >= 0) {
                    adjusted -= other.getValue().intValue();
                }
            }
            ranked.add(new WordCount(word, adjusted));
        }
        Collections.sort(ranked, BY_COUNT_DESCENDING);
        return ranked;
    }

    /** Applies the selection thresholds and the fallback pass, appending chosen words to {@code keywords}. */
    private static void selectKeywords(List<WordCount> ranked, List<String> keywords) {
        for (WordCount candidate : ranked) {
            // NOTE(review): only a count of exactly 1 is skipped; candidates whose adjusted
            // count is 0 or negative are still accepted here. Confirm that is intended.
            if (candidate.count == 1 || keywords.contains(candidate.word)) {
                continue;
            }
            int chosen = keywords.size();
            if (chosen < 3) {
                keywords.add(candidate.word);
            } else if (chosen == 3) {
                if (candidate.count > 15) {
                    keywords.add(candidate.word);
                }
            } else if (chosen == 4) {
                if (candidate.count > 20) {
                    keywords.add(candidate.word);
                }
            } else {
                break;
            }
        }

        if (!keywords.isEmpty() && keywords.size() <= 3 && ranked.size() > keywords.size()) {
            // The reference count is read from the ranked entry at index (chosen - 1), i.e. this
            // assumes the chosen keywords are the leading ranked entries.
            int referenceCount = ranked.get(keywords.size() - 1).count;
            // The start index is fixed before the loop; the bound of 5 limits the ranked index,
            // not the number of keywords.
            for (int i = keywords.size(); i < 5 && i < ranked.size(); i++) {
                WordCount candidate = ranked.get(i);
                if (candidate.count >= referenceCount - 1 && !keywords.contains(candidate.word)) {
                    keywords.add(candidate.word);
                }
            }
        }
    }

    /**
     * Decides whether a word may be used as a keyword.
     *
     * <p>A word is rejected when it is {@code null}, shorter than two characters, numeric
     * according to {@code HtmlNumberUtil.isNumber}, occurs anywhere in the text of
     * {@code wordfilter.dic}, or when its first or last character occurs in the text of
     * {@code charfilter.dic}. Both dictionaries are searched as plain substrings.
     *
     * @param word the candidate word
     * @return {@code true} if the word passes all filters
     */
    private static boolean filter(String word) {
        if (word == null || word.length() < 2) {
            return false;
        }
        if (HtmlNumberUtil.isNumber(word)) {
            return false;
        }
        loadFilters();
        if (filterWords.indexOf(word) >= 0) {
            return false;
        }
        String first = word.substring(0, 1);
        String last = word.substring(word.length() - 1);
        return filterChars.indexOf(first) < 0 && filterChars.indexOf(last) < 0;
    }

    /** Loads whichever filter dictionary has not been loaded yet. */
    private static void loadFilters() {
        if (filterWords == null) {
            filterWords = readDictionary("wordfilter.dic");
        }
        if (filterChars == null) {
            filterChars = readDictionary("charfilter.dic");
        }
    }

    /**
     * Reads a dictionary resource located next to this class as UTF-8 text and closes the stream.
     *
     * @return the dictionary text, or an empty string (after printing the stack trace) when the
     *         resource is missing or cannot be read, so that filtering degrades instead of failing
     */
    private static String readDictionary(String resourceName) {
        try {
            InputStream in = ExtractKeywordAndSummary.class.getResourceAsStream(resourceName);
            if (in == null) {
                throw new IOException("Filter dictionary not found: " + resourceName);
            }
            try {
                String text = HtmlFileUtil.readText(in, "UTF-8");
                return text != null ? text : "";
            } finally {
                in.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Produces a short summary of the content.
     *
     * <p>The content is converted to plain text, then a Lucene {@code Highlighter} scored by a
     * query parsed from the title selects the best fragment (fragmenter size 200, no highlight
     * markup). If that step throws, the stack trace is printed and whatever content is available
     * at that point is used. The result is finally trimmed to sentence boundaries.
     *
     * @param title   the article title, used to build the scoring query
     * @param content the article content, possibly containing HTML
     * @return the summary text
     */
    @SuppressWarnings("resource")
    public static String getTextAbstract(String title, String content) {
        try {
            content = getTextFromHtml(content);
            org.apache.lucene.search.Query query = IKQueryParser.parse("CONTENT", title);
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("", "");
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(200));
            org.apache.lucene.analysis.TokenStream tokenStream = (new IKAnalyzer()).tokenStream("CONTENT", new StringReader(content));
            String fragment = highlighter.getBestFragment(tokenStream, content);
            if (HtmlStringUtil.isNotEmpty(fragment)) {
                content = fragment.trim();
            }
        } catch (Exception e) {
            // Best effort: fall back to the content as it stands.
            e.printStackTrace();
        }
        if (content == null) {
            return "";
        }
        return trimToSentences(content);
    }

    /**
     * Trims {@code content} so that it starts after leading punctuation or a very short leading
     * fragment and ends at the last sentence break; returns it unchanged if no usable break exists.
     */
    private static String trimToSentences(String content) {
        int start = 0;
        int end = 0;
        boolean seekingStart = true;
        int lastIndex = content.length() - 1;

        for (int i = 0; i <= lastIndex; i++) {
            char c = content.charAt(i);
            if (seekingStart) {
                if (isLeadingNoise(c)) {
                    continue;
                }
                start = i;
                seekingStart = false;
            }
            if (isSentenceBreak(c)) {
                // A break within the first 8 characters means the leading fragment is dropped.
                if (i < 8) {
                    start = i + 1;
                }
                end = i;
                // Keep a closing quote that directly follows the break.
                if (i != lastIndex && isClosingQuote(content.charAt(i + 1))) {
                    end = i + 1;
                }
            } else {
                if (isEarlySeparator(c) && i < 2) {
                    start = i + 1;
                }
                // NOTE(review): the original code also inspected the character after a closing
                // quote that is not the last character, but its condition required that
                // character to equal several different punctuation marks at once and so could
                // never be true. That branch is omitted; confirm what was intended.
                if (isClosingQuote(c) && i == lastIndex) {
                    end = i;
                }
            }
        }

        if (end <= start) {
            return content;
        }

        String summary = content.substring(start, end + 1).trim();

        // Drop a short leading fragment again, now relative to the trimmed text.
        int cut = 0;
        int scanLimit = Math.min(8, summary.length());
        for (int i = 0; i < scanLimit; i++) {
            if (isSentenceBreak(summary.charAt(i))) {
                cut = i + 1;
            }
        }
        if (cut != 0) {
            summary = summary.substring(cut);
        }

        // If the text does not end with sentence-final punctuation, cut it back to (and
        // excluding) the last clause separator, ignoring the first and last characters.
        int tail = 0;
        if (HtmlStringUtil.isNotEmpty(summary)
                && !isSentenceTerminator(summary.charAt(summary.length() - 1))) {
            for (int i = summary.length() - 2; i > 0; i--) {
                if (isTrailingSeparator(summary.charAt(i))) {
                    tail = i;
                    break;
                }
            }
        }
        if (tail != 0) {
            summary = summary.substring(0, tail);
        }
        return summary;
    }

    // Fullwidth punctuation below is written as Unicode escapes so that it cannot be confused
    // with, or silently converted to, its ASCII look-alike.

    /** Sentence-final punctuation: full stop, question mark and exclamation mark (ASCII and CJK forms). */
    private static boolean isSentenceTerminator(char c) {
        switch (c) {
        case '.':
        case '。':
        case '?':
        case '\uFF1F': // fullwidth question mark
        case '!':
        case '\uFF01': // fullwidth exclamation mark
            return true;
        default:
            return false;
        }
    }

    /** A sentence terminator or any Unicode space character (including no-break and ideographic space). */
    private static boolean isSentenceBreak(char c) {
        return isSentenceTerminator(c) || Character.isSpaceChar(c);
    }

    /** Right double or single quotation mark. */
    private static boolean isClosingQuote(char c) {
        return c == '”' || c == '’';
    }

    /** Characters skipped while looking for the first character of the summary. */
    private static boolean isLeadingNoise(char c) {
        if (Character.isWhitespace(c) || Character.isISOControl(c) || isSentenceBreak(c) || isClosingQuote(c)) {
            return true;
        }
        switch (c) {
        case ',':
        case '\uFF0C': // fullwidth comma
        case '>':
        case ';':
        case '\uFF1B': // fullwidth semicolon
        case ':':
        case '\uFF1A': // fullwidth colon
        case ']':
        case '\uFF3D': // fullwidth right square bracket
            return true;
        default:
            return false;
        }
    }

    /** Separators that, within the first two characters, push the start of the summary past them. */
    private static boolean isEarlySeparator(char c) {
        switch (c) {
        case ',':
        case '\uFF0C': // fullwidth comma
        case '>':
        case '》':
        case '、':
            return true;
        default:
            return false;
        }
    }

    /** Separators at which an unterminated summary is cut off. */
    private static boolean isTrailingSeparator(char c) {
        switch (c) {
        case ';':
        case '\uFF1B': // fullwidth semicolon
        case ',':
        case '\uFF0C': // fullwidth comma
        case '>':
        case '》':
            return true;
        default:
            return false;
        }
    }
}