package edu.uncc.cs.watsonsim.index;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Stream;

import org.apache.log4j.Logger;

import edu.stanford.nlp.util.IterableIterator;
import edu.uncc.cs.watsonsim.Passage;

/**
 * Count the unigrams and bigrams in all passages for entropy based scorers.
 *
 * <p>Counts are accumulated in memory and appended to two files named
 * {@code unigrams} and {@code bigrams} inside the output directory, one
 * {@code "<key> <count>"} entry per line. Because every flush appends a fresh
 * batch, the same key may appear on several lines of a file; a consumer has to
 * sum the counts per key.
 *
 * <p>{@link #accept(Passage)} may be called from several threads at once; the
 * flush itself is serialized.
 *
 * @author Sean Gallaghers
 */
public class Bigrams implements Segment {
    /** Number of distinct keys in either map that triggers a flush. */
    private static final int FLUSH_THRESHOLD = 1_000_000;
    /** Initial capacity of each count map. */
    private static final int INITIAL_CAPACITY = 1_000_000;
    private static final float LOAD_FACTOR = 0.75f;
    private static final int CONCURRENCY_LEVEL = 50;
    /** Directory used by the no-argument constructor. */
    private static final String DEFAULT_OUTPUT_DIR = "/mnt/NCDS/sean";

    // volatile: flush() swaps these references while other threads are inside
    // accept(), which reads them without holding the lock.
    private volatile ConcurrentHashMap<String, Integer> unigrams = newCountMap();
    private volatile ConcurrentHashMap<String, Integer> bigrams = newCountMap();
    private final Path outputDir;
    private final Logger log = Logger.getLogger(getClass());

    /** Creates a counter that writes to the default output directory. */
    public Bigrams() {
        this(Paths.get(DEFAULT_OUTPUT_DIR));
    }

    /**
     * Creates a counter that writes its count files into the given directory.
     *
     * @param outputDir directory that receives the {@code unigrams} and
     *                  {@code bigrams} files; must not be {@code null}
     * @throws NullPointerException if {@code outputDir} is {@code null}
     */
    public Bigrams(Path outputDir) {
        this.outputDir = Objects.requireNonNull(outputDir, "outputDir");
    }

    /**
     * Writes out whatever counts are still held in memory.
     *
     * @throws IOException if writing either count file fails
     */
    @Override
    public void close() throws IOException {
        flush();
    }

    /**
     * Appends the counts gathered so far to the count files and starts over
     * with empty maps.
     *
     * <p>Each map is replaced by a fresh one before its old contents are
     * written, so if a write fails the counts of the map being written are
     * discarded. If writing the unigrams fails, the bigrams are left in memory
     * untouched.
     *
     * @throws IOException if writing either count file fails
     */
    public synchronized void flush() throws IOException {
        Map<String, Integer> finishedUnigrams = unigrams;
        unigrams = newCountMap();
        appendCounts(finishedUnigrams, "unigrams");

        Map<String, Integer> finishedBigrams = bigrams;
        bigrams = newCountMap();
        appendCounts(finishedBigrams, "bigrams");
    }

    /**
     * Counts every token of the passage as a unigram and every pair of
     * adjacent tokens (joined by a single space) as a bigram, then flushes to
     * disk if either map has grown past the threshold.
     *
     * <p>A failed flush is logged and not rethrown.
     *
     * @param t the passage whose tokens are counted
     */
    @Override
    public void accept(Passage t) {
        // Fetch the token list once instead of on every access.
        List<String> tokens = t.getTokens();
        String previous = null;
        for (String token : tokens) {
            unigrams.merge(token, 1, Integer::sum);
            if (previous != null) {
                bigrams.merge(previous + " " + token, 1, Integer::sum);
            }
            previous = token;
        }

        // Try to keep it from absorbing all available memory
        if (overThreshold()) {
            try {
                synchronized (this) {
                    // Re-check under the lock: another thread may have flushed
                    // while this one was waiting.
                    if (overThreshold()) {
                        flush();
                    }
                }
            } catch (IOException failed_flush) {
                // Pass the exception as the Throwable so the stack trace is logged.
                log.error("Failed to flush n-gram counts to " + outputDir, failed_flush);
            }
        }
    }

    /** Returns true when either map holds more keys than the flush threshold. */
    private boolean overThreshold() {
        return unigrams.size() > FLUSH_THRESHOLD || bigrams.size() > FLUSH_THRESHOLD;
    }

    /** Appends one space-separated "key count" line per entry to the named file in the output directory. */
    private void appendCounts(Map<String, Integer> counts, String fileName) throws IOException {
        // Make space-separated lines
        Stream<String> lines = counts.entrySet().stream()
                .map(pair -> pair.getKey() + " " + pair.getValue());
        Files.write(
                outputDir.resolve(fileName),
                new IterableIterator<String>(lines.iterator()),
                StandardOpenOption.CREATE,
                StandardOpenOption.WRITE,
                StandardOpenOption.APPEND);
    }

    /** Creates an empty count map sized for the expected number of keys. */
    private static ConcurrentHashMap<String, Integer> newCountMap() {
        return new ConcurrentHashMap<>(INITIAL_CAPACITY, LOAD_FACTOR, CONCURRENCY_LEVEL);
    }
}