/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.apache.commons.io.IOUtils;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/**
 * Get occurrence counts for words by iterating compressed Google ngram files.
 *
 * <p>Command line usage: {@code OccurrenceAdder <wordfile> <dir>}. The word file is read as
 * UTF-8 text with one word per line. Every file in {@code dir} is read as a gzip-compressed,
 * UTF-8 encoded, tab-separated text file in which column 0 is the word and column 2 is its
 * occurrence count. The counts of all words listed in the word file are summed up over all
 * files and printed as {@code count<TAB>word}, in the order of the word file.
 */
public class OccurrenceAdder {

  private static final int BUFFER_SIZE = 16384;
  /** Index of the tab-separated column that contains the word. */
  private static final int WORD_COLUMN = 0;
  /** Index of the tab-separated column that contains the occurrence count. */
  private static final int COUNT_COLUMN = 2;

  /**
   * Adds up the occurrences found in all files of the given directory.
   *
   * @param map words to count, mapped to their running total; only existing keys are updated
   * @param dir directory whose entries are all treated as gzip-compressed ngram files
   * @throws IOException if {@code dir} cannot be listed or one of its files cannot be read
   */
  private void run(Map<String, Long> map, File dir) throws IOException {
    File[] files = dir.listFiles();
    if (files == null) {
      // listFiles() signals "not a directory" or an I/O error with null instead of an exception
      throw new IOException("Not a directory or not readable: " + dir);
    }
    for (File file : files) {
      runOnFile(map, file);
    }
  }

  /**
   * Adds the occurrences from a single gzip-compressed file to the totals in {@code map}.
   *
   * @param map words to count, mapped to their running total; words not in the map are ignored
   * @param file gzip-compressed, tab-separated file to read
   * @throws IOException if the file cannot be read, or if a line for one of the requested
   *     words has no valid count column
   */
  private void runOnFile(Map<String, Long> map, File file) throws IOException {
    System.out.println("Working on " + file);
    try (
      InputStream fileStream = new FileInputStream(file);
      InputStream gzipStream = new GZIPInputStream(fileStream, BUFFER_SIZE);
      Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
      BufferedReader buffered = new BufferedReader(decoder, BUFFER_SIZE)
    ) {
      String line;
      long lineNumber = 0;
      while ((line = buffered.readLine()) != null) {
        lineNumber++;
        String[] parts = line.split("\t");
        if (parts.length <= WORD_COLUMN) {
          // split() drops trailing empty strings, so a line made only of tabs has no columns
          continue;
        }
        String word = parts[WORD_COLUMN];
        Long val = map.get(word);
        if (val == null) {
          // not one of the requested words, so don't bother parsing the count
          continue;
        }
        map.put(word, val + parseCount(parts, file, lineNumber));
      }
    }
  }

  /**
   * Extracts the occurrence count from the columns of one line.
   *
   * @param parts the tab-separated columns of the line
   * @param file the file being read, used for the error message only
   * @param lineNumber 1-based number of the line, used for the error message only
   * @return the value of the count column
   * @throws IOException if the count column is missing or not a number
   */
  private static long parseCount(String[] parts, File file, long lineNumber) throws IOException {
    if (parts.length <= COUNT_COLUMN) {
      throw new IOException("Expected at least " + (COUNT_COLUMN + 1)
          + " tab-separated columns in " + file + ", line " + lineNumber);
    }
    try {
      return Long.parseLong(parts[COUNT_COLUMN]);
    } catch (NumberFormatException e) {
      throw new IOException("Invalid count '" + parts[COUNT_COLUMN] + "' in " + file
          + ", line " + lineNumber, e);
    }
  }

  /**
   * Reads the word list, sums up the occurrences of its words and prints the result.
   *
   * @param args the path of the word file followed by the path of the ngram directory
   * @throws IOException if the word file or the ngram files cannot be read
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: " + OccurrenceAdder.class.getName() + " <wordfile> <dir>");
      System.exit(1);
    }
    OccurrenceAdder occurrenceAdder = new OccurrenceAdder();
    // keeps the order of the word file so the output lines up with the input;
    // totals are long because sums over many ngram lines can exceed the int range
    Map<String, Long> map = new LinkedHashMap<>();
    try (BufferedReader wordReader = new BufferedReader(
        new InputStreamReader(new FileInputStream(args[0]), StandardCharsets.UTF_8))) {
      String word;
      while ((word = wordReader.readLine()) != null) {
        map.put(word, 0L);
      }
    }
    occurrenceAdder.run(map, new File(args[1]));
    System.out.println("-------------------------");
    for (Map.Entry<String, Long> entry : map.entrySet()) {
      System.out.println(entry.getValue() + "\t" + entry.getKey());
    }
  }

}