package com.haogrgr.test.topn;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 题目:
* 有一文件,大小约为2.5G
* 里面每一行为随机的字符串, 长度5-10之间
* 约1亿行,可能重复
* 求出现次数最多的前100个字符串
*/
public class TopNFinder {
static long start, end;
static final int HASH_TABLE_SIZE = 32;
static final String FILE_PATH = "C:/tmp/src.txt";
static final String FILE_TEMP_PERFIX = "C:/tmp/temp"+HASH_TABLE_SIZE+"/temp_";
static final BufferedWriter[] writers = new BufferedWriter[HASH_TABLE_SIZE];
static final BufferedReader[] readers = new BufferedReader[HASH_TABLE_SIZE];
static final Object[] results = new Object[HASH_TABLE_SIZE];
static final ExecutorService executor = Executors.newFixedThreadPool(4);
public static void main(String[] args) throws Exception {
start = System.currentTimeMillis();
// hashToFile();
end = System.currentTimeMillis();
System.out.println("hash cost :" + (end - start));
topN(10);
end = System.currentTimeMillis();
System.out.println("total cost : " + (end - start));
}
@SuppressWarnings("unused")
private static void hashToFile() throws FileNotFoundException, IOException {
initWriter(HASH_TABLE_SIZE);
FileReader r = new FileReader(FILE_PATH);
BufferedReader br = new BufferedReader(r);
String line = br.readLine();
while (line != null) {
BufferedWriter w = writers[index(line, HASH_TABLE_SIZE)];
w.write(line + "\n");
line = br.readLine();
}
br.close();
r.close();
for (int i = 0; i < HASH_TABLE_SIZE; i++) {
writers[i].close();
}
}
@SuppressWarnings({"rawtypes", "unchecked"})
public static HashMap<String, Integer> topN(final int topSize) throws Exception {
initReader(HASH_TABLE_SIZE);
HashMap<String, Integer> result = new HashMap<String, Integer>();
final AtomicInteger inc1 = new AtomicInteger();
final AtomicInteger inc2 = new AtomicInteger();
Future[] submits = new Future[HASH_TABLE_SIZE];
for (final BufferedReader reader : readers) {
Future<?> submit = executor.submit(new Runnable() {
@Override
public void run() {
try {
int index = inc1.getAndIncrement();
HashMap<String, Integer> map = new HashMap<String, Integer>(200000000 / HASH_TABLE_SIZE);
String line = reader.readLine();
while (line != null) {
inc(map, line);
line = reader.readLine();
}
reader.close();
TopNListContainer<Entry<String, Integer>> tops = TopNListContainer.getTopNContainer(topSize);
for (Entry<String, Integer> entry : map.entrySet()) {
tops.add(entry);
}
results[index] = tops;
} catch (Exception e) {
e.printStackTrace();
}
}
});
submits[inc2.getAndIncrement()] = submit;
}
for (Future future : submits) {
future.get();
}
TopNListContainer<Entry<String, Integer>> tops = TopNListContainer.getTopNContainer(topSize);
for (int i = 0; i < results.length; i++) {
TopNListContainer<Entry<String, Integer>> object = (TopNListContainer<Entry<String, Integer>>) results[i];
for (Entry<String, Integer> entry : object) {
tops.add(entry);
}
}
System.out.println(tops);
return result;
}
public static void initWriter(int size) {
for (int i = 0; i < size; i++) {
try {
String fileName = FILE_TEMP_PERFIX + i;
writers[i] = new BufferedWriter(new FileWriter(fileName));
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static void initReader(int size) {
for (int i = 0; i < size; i++) {
try {
String fileName = FILE_TEMP_PERFIX + i;
readers[i] = new BufferedReader(new FileReader(fileName));
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static void inc(HashMap<String, Integer> map, String key) {
Integer count = map.get(key);
if (count == null) {
map.put(key, 1);
} else {
map.put(key, count + 1);
}
}
public static int index(String str, int tableSize) {
int hash = hash(str);
int index = Math.abs(hash % tableSize);
return index;
}
public static int hash(String str) {
return Objects.hash(str);
}
}