package edu.fudan.util; /* * 文件名:WordCount.java * 版权:Copyright 2008-20012 复旦大学 All Rights Reserved. * 描述:程序总入口 * 修改人:xpqiu * 修改时间:2009-1-5 * 修改内容:新增 * * 修改人:〈修改人〉 * 修改时间:YYYY-MM-DD * 跟踪单号:〈跟踪单号〉 * 修改单号:〈修改单号〉 * 修改内容:〈修改内容〉 */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; /** * pmi(x,y) = ln(pxy/px*py)/-ln(pxy) * @author Administrator * @version 1.0 * @since 1.0 */ public class PMI { HashMap<String, Integer> unigram; HashMap<String, Integer> bigram; HashMap<String, Float> pmi; boolean isSpace = false; private int count=0; public PMI() { unigram = new HashMap<String, Integer>(); bigram = new HashMap<String, Integer>(); pmi = new HashMap<String, Float>(); } /** * @param args */ public static void main(String[] args) { PMI fm = new PMI(); String fileName = "D:/xpqiu/项目/自选/CLP2010/CWS/Training-Unlabelled-A.txt"; fm.read(fileName); fm.calcPMI(); fm.save("pmi.txt", true); System.out.println("Done"); } private void calcPMI() { System.out.println("bi count: "+bigram.size()); Iterator<String> it = bigram.keySet().iterator(); while(it.hasNext()){ String key = it.next(); float c1 = unigram.get(String.valueOf(key.charAt(0))); float c2 = unigram.get(String.valueOf(key.charAt(1))); float c3 = bigram.get(key); float s = (float) ((Math.log((c1*c2)/count/count)/Math.log(c3/count))-1); pmi.put(key, s); } System.out.println("bi count: "+pmi.size()); } /** * @param fileName */ public void read(String fileName) { File f = new File(fileName); if (f.isDirectory()) { File[] files = f.listFiles(); for (int i = 0; i < files.length; i++) { read(files[i].toString()); } } else { try { InputStreamReader read = new InputStreamReader( new FileInputStream(fileName), "utf-8"); BufferedReader bin = new BufferedReader(read); String sent; while ((sent = bin.readLine()) != null) { calc(sent); } } catch (Exception e) { } } } /** * @param filename * @param bcount 是否输出词频 */ public void save(String filename, boolean bcount) { try { FileOutputStream fos = new FileOutputStream(filename); BufferedWriter bout = new BufferedWriter(new OutputStreamWriter( fos, "UTF-8")); Map.Entry<String,Double>[] entries= getSortedHashtableByValue(pmi); for(int i=0;i<entries.length;i++){ bout.write(entries[i].getKey()); if (bcount) { bout.write(" "); bout.write(entries[i].getValue().toString()); } bout.write("\n"); } bout.close(); } catch (Exception e) { } } /** * * @param sent * @param b * @return */ private void calc(String str) { // str = str.replaceAll("[\\[\\]0-9a-zA-Z/\\.<> =]+", " ").trim(); String[] wordarray; if(isSpace){ wordarray = str.split("\\s"); }else{ wordarray = new String[str.length()]; for(int i=0;i<str.length();i++){ wordarray[i] = String.valueOf(str.charAt(i)); } } count += wordarray.length; for (int i = 0; i < wordarray.length; i++) { String w = wordarray[i].trim(); if (w.length() == 0) continue; if (unigram.containsKey(w)) { unigram.put(w, unigram.get(w) + 1); } else { unigram.put(w, 1); } if(i<wordarray.length-1){ String s = wordarray[i]+wordarray[i+1]; if(s.trim().length()<2) continue; // System.out.println(s); if (bigram.containsKey(s)) { bigram.put(s, bigram.get(s) + 1); } else { bigram.put(s, 1); } } } } public static Map.Entry[] getSortedHashtableByValue(Map map) { Set set = map.entrySet(); Map.Entry[] entries = (Map.Entry[]) set.toArray(new Map.Entry[set.size()]); Arrays.sort(entries, new Comparator() { public int compare(Object arg0, Object arg1) { Double key1 = Double.valueOf(((Map.Entry) arg0).getValue().toString()); Double key2 = Double.valueOf(((Map.Entry) arg1).getValue().toString()); return -key1.compareTo(key2); } }); return entries; } }