package com.fpcms.common.random_gen_article; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import org.apache.commons.lang.StringUtils; import org.springframework.util.Assert; import com.fpcms.common.util.ChineseSegmenterUtil; import com.fpcms.common.util.ChineseSegmenterUtil.TokenCount; import com.fpcms.common.util.Constants; import com.fpcms.common.util.GoogleTranslateUtil; import com.fpcms.common.util.KeywordUtil; import com.fpcms.common.util.RandomUtil; /** * 对文件进行:过滤,替换,分段落,增加<h1>标题等操作,然后生成随机文章. * * @author badqiu * */ public class ArticleContentProcesser { private String perfectKeyword; private String keyword; private String article; public ArticleContentProcesser(String keyword) { super(); this.keyword = keyword; } public String getPerfectKeyword() { return perfectKeyword; } public String getArticle() { return article; } private long pCount = 0; public void buildArticle(String content) { Assert.hasText(content,"content must be not empty"); Set<String> tokens = getValidTokens(content); // filterByChineseSegment(tokens); KeywordUtil.filterSensitiveKeyword(tokens); String translatedTokensString = GoogleTranslateUtil.fromEnglish2Chinese(GoogleTranslateUtil.fromChinese2English(StringUtils.join(tokens,","))); // System.out.println("translatedTokensString:"+translatedTokensString); Collection<String> translatedTokens = getValidTokens(translatedTokensString); article = NaipanArticleGeneratorUtil.transformArticle(toHtmlFormat(translatedTokens)); perfectKeyword = getPerfectKeyword(article, keyword); // if(StringUtils.isBlank(perfectKeyword)) { // perfectKeyword = getPerfectKeyword(StringUtils.join(tokens,","), keyword); // } // article = GoogleTranslateUtil.fromEnglish2Chinese(GoogleTranslateUtil.fromChinese2English(article)); } private String toHtmlFormat(Collection<String> tokens) { StringBuilder result = new StringBuilder(); for(String token : tokens) { if(pCount % 30 == 0) { result.append("<p>"); } boolean strongToken = isStrongToken(token) ; if(strongToken) { result.append("<h3>"); } result.append(token).append(KeywordUtil.getSymbol(token)); if(strongToken) { result.append("</h3>"); } if(pCount % 30 == 29) { result.append("</p>\n"); } pCount++; } return result.toString(); } int strongCount = 0; private boolean isStrongToken(String token) { if(isStrongToken0(token) && strongCount <= 5) { strongCount++; return true; } return false; } private boolean isStrongToken0(String token) { Assert.notNull(token,"token must be not null"); for(String strong : Constants.FAIPIAO_KEYWORDS) { if(token.indexOf(strong) >= 0) { return RandomUtil.randomTrue(40); } } if(token.contains(keyword)) { return true; } return false; } Set<String> getValidTokens(String string) { string = removeSearchEngineEmphasizeHtmlTag(string); Set<String> tokens = new HashSet<String>(); StringTokenizer tokenizer = new StringTokenizer(string,KeywordUtil.DELIMITERS); while(tokenizer.hasMoreElements()) { String token = tokenizer.nextToken(); if(isValidToken(token)) { tokens.add(token); } } return tokens; } /** * 删除搜索引擎对关键字的<em>关键字</em> 标记 * @param string * @return */ private String removeSearchEngineEmphasizeHtmlTag(String string) { //sogou string = StringUtils.remove(string,"<em><!--red_beg-->"); string = StringUtils.remove(string,"<!--red_end--></em>"); //google and baidu string = StringUtils.remove(string,"<em>"); string = StringUtils.remove(string,"</em>"); return string; } static String[] ignoreWords = {"搜狗","开","相关搜索","搜索","网页快照","类似结果"}; static boolean isValidToken(String token) { if(token.length() <= 6) { return false; } if(token.matches(".*\\d{11}.*")) { return false; } if(token.matches("\\d+")) { return false; } if(token.matches("\\d{4}年\\d{1,2}月\\d{1,2}日")) { return false; } if(token.matches("\\d{4}年\\d{1,2}月")) { return false; } for(String ignoreWord : ignoreWords) { if(token.contains(ignoreWord)) { return false; } } for(int i = 0; i < token.length(); i++) { char c = token.charAt(i); if(Character.isDigit(c)) { continue; } if((int)c < 1024) { return false; } } return true; } private String getPerfectKeyword(String transferedArticle, String keyword) { String perfectKeyword = KeywordUtil.getPerfectKeyword(transferedArticle,keyword); return perfectKeyword; // String result = null; // if(StringUtils.isBlank(result)) { // for(String faipiao : Constants.FAIPIAO_KEYWORDS) { // result = KeywordUtil.getPerfectKeyword(transferedArticle,faipiao); // if(StringUtils.isNotBlank(result)) { // if(result.matches(".*\\d{4}.*")) { // continue; // } // return result; // } // } // } // return result; } }