package com.fpcms.common.webcrawler.htmlparser; import java.util.ArrayList; import java.util.Collections; import java.util.List; import org.apache.commons.collections.comparators.ReverseComparator; import org.apache.commons.lang.StringUtils; import com.fpcms.common.util.KeywordUtil; import com.fpcms.common.util.StringLengthComparator; import com.fpcms.common.util.TextLangUtil; import com.fpcms.common.webcrawler.htmlparser.HtmlPage.Anchor; public class HtmlPageTitleUtil { public static String smartGetTitle(Anchor anchor, String pageTitle) { if(StringUtils.isNotBlank(anchor.getText()) && anchor.getText().length() >= 6) { return extrectMainTitle(pageTitle,true,anchor.getText().length()); } return filterWithMaxLength(smartGetTitle(pageTitle)); } public static String filterWithMaxLength(String title) { if(TextLangUtil.hasChinese(title)) { List<String> keywordsList = KeywordUtil.toTokenizerList(title); Collections.sort(keywordsList,new ReverseComparator(new StringLengthComparator())); String maxLengthKeyword = keywordsList.get(0); return maxLengthKeyword; }else { return title; } } public static String smartGetTitle(String pageTitle) { String result = extrectMainTitle(pageTitle,true,0); //english if(pageTitle.matches("[\\s\\w-_:|]+")) { ArrayList<String> tokenizerList = KeywordUtil.toTokenizerList(result); if(tokenizerList.size() < 5) { return extrectMainTitle(pageTitle,false,pageTitle.length()); } }else { if(result.length() < 5) { return extrectMainTitle(pageTitle,false,pageTitle.length()); } } return result; } private static char[] titleSeperator = {'_','-',':','|','>',':','—','-','|'}; static String extrectMainTitle(String title,boolean isIndexOf,int fromIndex) { title = title.trim(); for(char c : titleSeperator) { int indexOf = isIndexOf ? title.indexOf(c,fromIndex) : title.lastIndexOf(c, fromIndex); if(indexOf >= 0) { return title.substring(0,indexOf).trim(); } } return title; } }