/** * Created by jpbirdy on 15-5-19. */ package jpbirdy.detection; import jpbirdy.segment.Segmenter; import java.io.*; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.*; /** * @author jialou.jp * @project Segmentation * @class NewWordDec * @date 15-5-19 10:44 * @desc */ public class NewWordDec { private String file; private Map<String, MapEntity> newWords; public NewWordDec() { newWords = new HashMap<String, MapEntity>(); } public void loadFile(String file) throws IOException { System.out.println("正在处理文件:" + file); File docFile = new File(file); if (!docFile.exists()) { System.err.println("目标文件不存在!"); return; } int totalLine = count(file); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(docFile),"UTF-8")); String line; long lineNum = 0; while ((line = br.readLine()) != null) { findNewWords(line); lineNum++; if (lineNum % 1000 == 0) { System.out.println((lineNum * 10000 / totalLine / 100.0) + "% success!"); } } // System.out.println(newWords); System.out.println("文件处理结束:" + file); } public static int count(String filename) throws IOException { InputStream is = new BufferedInputStream(new FileInputStream(filename)); byte[] c = new byte[1024]; int count = 0; int readChars = 0; while ((readChars = is.read(c)) != -1) { for (int i = 0; i < readChars; ++i) { if (c[i] == '\n') ++count; } } is.close(); return count; } public static class MapEntityComparator implements Comparator<MapEntity> { @Override public int compare(MapEntity o1, MapEntity o2) { int a = o1.getRepeatNum() / 100 + o1.getLeft().size() * o1.getRight().size(); int b = o2.getRepeatNum() / 100 + o2.getLeft().size() * o2.getRight().size(); return b - a; } } public void printNewWords() throws InterruptedException { final int MIN_REPEAT = 1; final int TOP = 500; List<MapEntity> list = new ArrayList<MapEntity>(); for (Map.Entry<String, MapEntity> entry : newWords.entrySet()) { if (entry.getValue().getRepeatNum() < MIN_REPEAT) continue; list.add(entry.getValue()); // System.out.println(entry.getKey()+"--->"+entry.getValue()); } Collections.sort(list, new MapEntityComparator()); for (int i = 0; i < TOP; i++) { MapEntity entity = list.get(i); System.out.println("word: " + entity.getWord() + " , left=" + entity.getLeft().size() + " , right=" + entity.getRight().size() + " ,repeatNum: " + entity.getRepeatNum()); } toFile(list); } public void toFile(List<MapEntity> list) throws InterruptedException { final int MAX_NEW_WORD = 100000; DateFormat dffile = new SimpleDateFormat("yyyyMMddHHmm"); File file = new File("weibo_newwords"); if (!file.exists()) { System.out.println("文件夹" + file.getName() + "不存在,正在创建!"); while (!file.mkdir()) { System.out.println("创建文件夹" + file.getName() + "失败,正在重试……"); Thread.sleep(1000); } System.out.println("文件夹创建成功!"); } String filename = dffile.format(new Date()); File targetFile = new File(file.getAbsolutePath() + "/" + filename + ".txt"); System.out.println("写入文件为:" + targetFile.getAbsolutePath() + targetFile.getName()); FileOutputStream fos = null; try { if (!targetFile.exists()) { targetFile.createNewFile(); } fos = new FileOutputStream(targetFile); for (int i = 0; i < Math.min(list.size(), MAX_NEW_WORD); i++) { MapEntity entity = list.get(i); fos.write((entity.getWord() + " " + entity.getRepeatNum() + " n \n").getBytes("UTF-8")); } } catch (IOException e) { e.printStackTrace(); } } public void loadDir(String dir) throws IOException { File dirFile = new File(dir); if (!dirFile.isDirectory()) { System.out.println("文件夹错误!"); } File[] files = dirFile.listFiles(); for (File file : files) { if (file.isDirectory()) loadDir(file.getPath()); else loadFile(file.getPath()); } } public void findNewWords(String line) { int len = line.length(); for (int i = 0; i < len; i++) { char left, right; left = right = 0; if (i > 0) left = line.charAt(i - 1); for (int j = 2; j <= 4; j++) { StringBuilder sb = new StringBuilder(); sb.setLength(0); for (int k = 0; k < j && (i + k) < len; k++) { sb.append(line.charAt(i + k)); } if (sb.length() < 2) continue; if (i + j < len) right = line.charAt(i + j); String newWord = sb.toString(); if (StopWord.hasStopWord(newWord)) continue; if (newWords.containsKey(newWord)) { MapEntity entity = newWords.get(newWord); entity.setWord(newWord); entity.addOne(); } else { MapEntity entity = new MapEntity(); entity.setWord(newWord); entity.addOne(); newWords.put(newWord, entity); } if (left > 0) { if (!newWords.get(newWord).getLeft().contains(left + "")) newWords.get(newWord).getLeft().add(left + ""); } if (right > 0) { if (!newWords.get(newWord).getRight().contains(right + "")) newWords.get(newWord).getRight().add(right + ""); } } } } public static void main(String[] args) throws Exception { StopWord.seg = new Segmenter(); NewWordDec dec = new NewWordDec(); dec.loadDir("weibo_spider"); dec.printNewWords(); } }