/**
 * Copyright (c) 2014, the Temporal Random Indexing AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tri.script.gbooks;

import di.uniba.it.tri.occ.*;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import di.uniba.it.tri.tokenizer.Filter;
import di.uniba.it.tri.tokenizer.StopWordFilter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 * Builds co-occurrence counts from gzip-compressed n-gram files.
 *
 * <p>Each regular file ending in {@code .gz} in the input directory is read
 * line by line. The first tab-separated field of a line is split on
 * whitespace into tokens and the second field is parsed as an integer count.
 * For every such file one {@code <prefix>.occ.gz} file is written to the
 * output directory, where {@code <prefix>} is the part of the input file name
 * before its first dot.</p>
 *
 * @author pierpaolo
 */
public class GBooksOccurrence {

    private File outputDir = new File("./");

    private static final Logger LOGGER = Logger.getLogger(GBooksOccurrence.class.getName());

    private boolean toLowerCase = false;

    private Filter swFilter = null;

    private String tokenRegExp = "^.+$";

    /**
     * Get the regular expression a token must fully match to be kept.
     *
     * @return The RegExp
     */
    public String getTokenRegExp() {
        return tokenRegExp;
    }

    /**
     * Set the regular expression a token must fully match to be kept.
     *
     * @param tokenRegExp The RegExp
     */
    public void setTokenRegExp(String tokenRegExp) {
        this.tokenRegExp = tokenRegExp;
    }

    /**
     * Get the output directory
     *
     * @return The output directory
     */
    public File getOutputDir() {
        return outputDir;
    }

    /**
     * Set the output directory
     *
     * @param outputDir The output directory
     */
    public void setOutputDir(File outputDir) {
        this.outputDir = outputDir;
    }

    /**
     * Get the filter applied to the token list of every line.
     *
     * @return The filter, or {@code null} when no filter is applied
     */
    public Filter getSwFilter() {
        return swFilter;
    }

    /**
     * Set the filter applied to the token list of every line.
     *
     * @param swFilter The filter, or {@code null} to disable filtering
     */
    public void setSwFilter(Filter swFilter) {
        this.swFilter = swFilter;
    }

    /**
     * Tell whether the n-gram text is lower-cased before tokenization.
     *
     * @return {@code true} if lower-casing is enabled
     */
    public boolean isToLowerCase() {
        return toLowerCase;
    }

    /**
     * Enable or disable lower-casing of the n-gram text before tokenization.
     *
     * @param toLowerCase {@code true} to lower-case the text
     */
    public void setToLowerCase(boolean toLowerCase) {
        this.toLowerCase = toLowerCase;
    }

    /**
     * Count the co-occurrences found in a single gzip-compressed file.
     *
     * <p>Every token of a line co-occurs with every other token position of
     * the same line, weighted by the count stored in the second field of the
     * line. Positions are compared, not token text, so a token repeated on a
     * line co-occurs with itself.</p>
     *
     * @param file The gzip-compressed input file
     * @return The term dictionary and the co-occurrence counts keyed by term id
     * @throws Exception if the file cannot be read, a line has no count field,
     * the count is not an integer or the filter fails
     */
    private OccOutput count(File file) throws Exception {
        Map<Integer, Multiset<Integer>> map = new HashMap<>();
        BiMap<String, Integer> dict = HashBiMap.create();
        int id = 0;
        // Compile once per file instead of once per token (String.matches recompiles each call).
        Pattern tokenPattern = Pattern.compile(tokenRegExp);
        LOGGER.log(Level.INFO, "Counting file: {0}", file.getName());
        // The stream is decoded with the platform default charset (no charset is specified).
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))))) {
            String line;
            int lineNumber = 0;
            // readLine() == null is the end-of-stream signal; ready() only says
            // whether a read would block and does not exclude a null line.
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                String[] values = line.split("\\t");
                if (values.length < 2) {
                    throw new IOException("Malformed line " + lineNumber + " in " + file.getName()
                            + ": expected a tab-separated n-gram and count");
                }
                String ngram = isToLowerCase() ? values[0].toLowerCase() : values[0];
                List<String> tokens = new ArrayList<>(Arrays.asList(ngram.split("\\s")));
                if (swFilter != null) {
                    swFilter.filter(tokens);
                }
                // Walk backwards so removals do not shift the indices still to be visited.
                for (int i = tokens.size() - 1; i >= 0; i--) {
                    if (!tokenPattern.matcher(tokens.get(i)).matches()) {
                        tokens.remove(i);
                    }
                }
                int c = Integer.parseInt(values[1]);
                if (c <= 0 || tokens.isEmpty()) {
                    // Nothing to count: no dictionary entry is created for such lines.
                    continue;
                }
                // Resolve (or assign) the ids of all tokens in line order.
                List<Integer> ids = new ArrayList<>(tokens.size());
                for (String token : tokens) {
                    Integer tid = dict.get(token);
                    if (tid == null) {
                        tid = id;
                        dict.put(token, tid);
                        id++;
                    }
                    ids.add(tid);
                }
                for (int i = 0; i < ids.size(); i++) {
                    Integer tid = ids.get(i);
                    Multiset<Integer> multiset = map.get(tid);
                    if (multiset == null) {
                        multiset = HashMultiset.create();
                        map.put(tid, multiset);
                    }
                    for (int j = 0; j < ids.size(); j++) {
                        if (j != i) {
                            // Add the whole weight at once instead of looping c times.
                            multiset.add(ids.get(j), c);
                        }
                    }
                }
            }
        }
        return new OccOutput(map, dict);
    }

    /**
     * Build the co-occurrences matrix for every {@code .gz} file of a
     * directory and store each result in the output directory.
     *
     * @param startingDir The corpus directory containing the {@code .gz} files
     * @throws Exception if the directory cannot be listed or a file cannot be
     * processed
     */
    public void process(File startingDir) throws Exception {
        LOGGER.log(Level.INFO, "Starting dir: {0}", startingDir.getAbsolutePath());
        LOGGER.log(Level.INFO, "Output dir: {0}", outputDir.getAbsolutePath());
        LOGGER.log(Level.INFO, "Lower case: {0}", isToLowerCase());
        LOGGER.log(Level.INFO, "Token regexp: {0}", tokenRegExp);
        File[] files = startingDir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            throw new IOException("Cannot list files in: " + startingDir.getAbsolutePath());
        }
        for (File file : files) {
            if (file.isFile() && file.getName().endsWith(".gz")) {
                OccOutput count = count(file);
                String[] splitname = file.getName().split("\\.");
                String filename = splitname[0] + ".occ.gz";
                save(count, filename);
            }
        }
    }

    /**
     * Write the co-occurrence counts to a gzip-compressed file in the output
     * directory.
     *
     * <p>One line is written for each dictionary term that has a
     * co-occurrence multiset: the term, followed by a tab-separated
     * {@code term count} pair for every co-occurring term.</p>
     *
     * @param count The counts to store
     * @param filename The name of the file created in the output directory
     * @throws IOException if the file cannot be written
     */
    private void save(OccOutput count, String filename) throws IOException {
        File outputFile = new File(outputDir.getAbsolutePath() + "/" + filename);
        BiMap<Integer, String> idToTerm = count.getDict().inverse();
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(outputFile))))) {
            for (Map.Entry<String, Integer> term : count.getDict().entrySet()) {
                Multiset<Integer> mset = count.getOcc().get(term.getValue());
                if (mset != null) {
                    writer.append(term.getKey());
                    for (Entry<Integer> entry : mset.entrySet()) {
                        writer.append("\t").append(idToTerm.get(entry.getElement())).append("\t").append(String.valueOf(entry.getCount()));
                    }
                    writer.newLine();
                }
            }
        }
    }

    static Options options;

    static CommandLineParser cmdParser = new BasicParser();

    static {
        options = new Options();
        options.addOption("in", true, "The corpus directory containing ngrams")
                .addOption("out", true, "Output directory where output will be stored")
                .addOption("r", true, "Regular expression used to filter tokens (optional, default \".+\")")
                .addOption("sw", true, "Stop word file (optional)")
                .addOption("lower", true, "Enable lower case (default=false)");
    }

    /**
     * Build the co-occurrences matrix given the set of files with ngrams.
     *
     * <p>Options {@code -in} and {@code -out} are required; when either is
     * missing the usage help is printed. Failures are logged, not rethrown.</p>
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        try {
            CommandLine cmd = cmdParser.parse(options, args);
            if (cmd.hasOption("in") && cmd.hasOption("out")) {
                try {
                    GBooksOccurrence builder = new GBooksOccurrence();
                    builder.setOutputDir(new File(cmd.getOptionValue("out")));
                    // The option is registered as "sw"; querying "s" never matched it.
                    if (cmd.hasOption("sw")) {
                        LOGGER.info("Load stop word...");
                        builder.setSwFilter(new StopWordFilter(OccUtils.loadSet(new File(cmd.getOptionValue("sw")))));
                    }
                    builder.setTokenRegExp(cmd.getOptionValue("r", "^.+$"));
                    builder.setToLowerCase(Boolean.parseBoolean(cmd.getOptionValue("lower", "false")));
                    builder.process(new File(cmd.getOptionValue("in")));
                } catch (Exception ex) {
                    LOGGER.log(Level.SEVERE, null, ex);
                }
            } else {
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.printHelp("Build the co-occurrences matrix given the set of files with ngrams", options, true);
            }
        } catch (ParseException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Result of counting one file: the term dictionary and, for each term id,
     * the multiset of co-occurring term ids.
     */
    static class OccOutput {

        private Map<Integer, Multiset<Integer>> occ;

        private BiMap<String, Integer> dict;

        /**
         * Create a result holder.
         *
         * @param occ Co-occurrence multisets keyed by term id
         * @param dict Bidirectional mapping between terms and their ids
         */
        public OccOutput(Map<Integer, Multiset<Integer>> occ, BiMap<String, Integer> dict) {
            this.occ = occ;
            this.dict = dict;
        }

        /**
         * @return Co-occurrence multisets keyed by term id
         */
        public Map<Integer, Multiset<Integer>> getOcc() {
            return occ;
        }

        /**
         * @param occ Co-occurrence multisets keyed by term id
         */
        public void setOcc(Map<Integer, Multiset<Integer>> occ) {
            this.occ = occ;
        }

        /**
         * @return Bidirectional mapping between terms and their ids
         */
        public BiMap<String, Integer> getDict() {
            return dict;
        }

        /**
         * @param dict Bidirectional mapping between terms and their ids
         */
        public void setDict(BiMap<String, Integer> dict) {
            this.dict = dict;
        }
    }
}