GBooksOccurrence.java example

Explorer

tri-master
- src
  - main
    - java
      - di
        uniba
        it
        tri
        aan
        AAN2file.java
        Paper.java
        package-info.java
        api
        Tri.java
        TriResultObject.java
        rest
        ResponseAllowOriginFilter.java
        ServerConfig.java
        SimpleTriServerWrapper.java
        SimpleTriService.java
        TimeScore.java
        v1
        SimpleTriRest.java
        changepoint
        ComputeCPD.java
        MeanShiftCPD.java
        data
        DictionaryEntry.java
        package-info.java
        extractor
        AANExtractor.java
        Extractor.java
        GutenbergExtractor.java
        IterableExtractor.java
        TextFileIterableExtractor.java
        TxtExtractor.java
        package-info.java
        ir
        SearchResult.java
        Searcher.java
        occ
        BuildOccurrence.java
        OccUtils.java
        package-info.java
        package-info.java
        script
        AnalyzeTimeSeries.java
        BuildOccStatistics.java
        BuildSimStatistics.java
        FindNoChangeWords.java
        FindVariation.java
        TimeWord.java
        WCScorer.java
        gbooks
        GBooks2Plain.java
        GBooksFreqFile.java
        GBooksOccurrence.java
        GBooksUtils.java
        Ngram.java
        shell
        CommandWrapper.java
        TriShell.java
        gui
        ChartDialog.java
        DocDialog.java
        GetDialog.java
        OptionsDialog.java
        ProgressDialog.java
        SimsDialog.java
        TimeSetupDialog.java
        TriShellGUI.java
        WordListDialog.java
        data
        ChartUtils.java
        Options.java
        TimePeriod.java
        WordEntry.java
        space
        SpaceBuilder.java
        SpaceBuilderReflective.java
        TemporalSpaceUtils.java
        clustering
        ClusterComparator.java
        Clusters.java
        test
        TestReadStore.java
        tokenizer
        BasicLatinFilter.java
        EnglishNoStemAnalyzer.java
        Filter.java
        ItalianNoStemAnalyzer.java
        KeywordFinder.java
        LetterFilter.java
        StandardFilter.java
        StopWordFilter.java
        TestKeywordFinder.java
        TriEnStandardTokenizer.java
        TriItStandardTokenizer.java
        TriStandardTokenizer.java
        TriTokenizer.java
        TriTwitterTokenizer.java
        TriWhitespaceTokenizer.java
        Twokenize.java
        vectors
        FileVectorReader.java
        IncompatibleVectorsException.java
        MapVectorReader.java
        MemoryVectorReader.java
        ObjectVector.java
        PermutationUtils.java
        RealVector.java
        RealVectorUtils.java
        ReverseObjectVectorComparator.java
        Vector.java
        VectorCache.java
        VectorFactory.java
        VectorReader.java
        VectorStoreUtils.java
        VectorType.java
        VectorUtils.java
        ZeroVectorException.java

/**
 * Copyright (c) 2014, the Temporal Random Indexing AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tri.script.gbooks;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import di.uniba.it.tri.occ.*;
import di.uniba.it.tri.tokenizer.Filter;
import di.uniba.it.tri.tokenizer.StopWordFilter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 * Builds token co-occurrence counts from gzip-compressed n-gram count files.
 *
 * <p>
 * Every <code>.gz</code> file found directly inside the input directory is
 * read line by line. Each line is split on TAB: the first field is the n-gram
 * (tokens separated by whitespace) and the second field is an integer count.
 * For every ordered pair of distinct positions in the n-gram the pair's
 * co-occurrence counter is increased by that count. The result for an input
 * file <code>NAME.xxx.gz</code> is written to <code>NAME.occ.gz</code> in the
 * output directory, one line per token:
 * <code>token TAB neighbour TAB count TAB neighbour TAB count ...</code>
 *
 * @author pierpaolo
 */
public class GBooksOccurrence {

    private static final Logger LOGGER = Logger.getLogger(GBooksOccurrence.class.getName());

    /**
     * Regular expression used for tokens when none is configured.
     */
    private static final String DEFAULT_TOKEN_REGEXP = "^.+$";

    private File outputDir = new File("./");

    private boolean toLowerCase = false;

    private Filter swFilter = null;

    private String tokenRegExp = DEFAULT_TOKEN_REGEXP;

    /**
     * Get the RegExp used to filter tokens: only tokens that entirely match
     * it are counted
     *
     * @return The RegExp
     */
    public String getTokenRegExp() {
        return tokenRegExp;
    }

    /**
     * Set the RegExp used to filter tokens: only tokens that entirely match
     * it are counted
     *
     * @param tokenRegExp The RegExp
     */
    public void setTokenRegExp(String tokenRegExp) {
        this.tokenRegExp = tokenRegExp;
    }

    /**
     * Get the output directory
     *
     * @return The output directory
     */
    public File getOutputDir() {
        return outputDir;
    }

    /**
     * Set the output directory
     *
     * @param outputDir The output directory
     */
    public void setOutputDir(File outputDir) {
        this.outputDir = outputDir;
    }

    /**
     * Get the filter applied to the token list of every n-gram
     *
     * @return The filter, or null when no filter is applied
     */
    public Filter getSwFilter() {
        return swFilter;
    }

    /**
     * Set the filter applied to the token list of every n-gram
     *
     * @param swFilter The filter, or null to disable filtering
     */
    public void setSwFilter(Filter swFilter) {
        this.swFilter = swFilter;
    }

    /**
     * Tell whether n-grams are converted to lower case before tokenization
     *
     * @return true if lower casing is enabled
     */
    public boolean isToLowerCase() {
        return toLowerCase;
    }

    /**
     * Enable or disable the lower casing of n-grams before tokenization
     *
     * @param toLowerCase true to enable lower casing
     */
    public void setToLowerCase(boolean toLowerCase) {
        this.toLowerCase = toLowerCase;
    }

    /**
     * Count the co-occurrences stored in a single gzip-compressed file.
     *
     * Lines with fewer than two TAB-separated fields, or whose second field
     * is not an integer, are skipped and reported at WARNING level. Lines
     * whose count is not positive contribute nothing.
     *
     * @param file The gzip-compressed n-gram file (read as UTF-8)
     * @return The co-occurrence counts together with the token dictionary
     * @throws Exception if the file cannot be read or the filter fails
     */
    private OccOutput count(File file) throws Exception {
        Map<Integer, Multiset<Integer>> occ = new HashMap<>();
        BiMap<String, Integer> dict = HashBiMap.create();
        // Compile once per file instead of once per token.
        Pattern tokenPattern = Pattern.compile(tokenRegExp);
        LOGGER.log(Level.INFO, "Counting file: {0}", file.getName());
        // The file stream is a resource of its own so that it is closed even
        // when the GZIP header cannot be read.
        try (FileInputStream fileIn = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn), StandardCharsets.UTF_8))) {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                String[] values = line.split("\\t");
                if (values.length < 2) {
                    LOGGER.log(Level.WARNING, "Skipping malformed line {0} of {1}", new Object[]{String.valueOf(lineNumber), file.getName()});
                    continue;
                }
                int c;
                try {
                    c = Integer.parseInt(values[1]);
                } catch (NumberFormatException ex) {
                    LOGGER.log(Level.WARNING, "Skipping line {0} of {1}: invalid count", new Object[]{String.valueOf(lineNumber), file.getName()});
                    continue;
                }
                if (c <= 0) {
                    // Nothing to add; also keeps unseen tokens out of the dictionary.
                    continue;
                }
                addCooccurrences(tokenize(values[0], tokenPattern), c, dict, occ);
            }
        }
        return new OccOutput(occ, dict);
    }

    /**
     * Split an n-gram into tokens, then apply the optional filter and the
     * token regular expression.
     *
     * @param ngram The n-gram text
     * @param tokenPattern The compiled token regular expression
     * @return The surviving tokens, in their original order
     * @throws Exception if the filter fails
     */
    private List<String> tokenize(String ngram, Pattern tokenPattern) throws Exception {
        String text = isToLowerCase() ? ngram.toLowerCase() : ngram;
        // Copy into an ArrayList: the view returned by Arrays.asList has a fixed size.
        List<String> tokens = new ArrayList<>(Arrays.asList(text.split("\\s")));
        if (swFilter != null) {
            swFilter.filter(tokens);
        }
        for (Iterator<String> it = tokens.iterator(); it.hasNext();) {
            if (!tokenPattern.matcher(it.next()).matches()) {
                it.remove();
            }
        }
        return tokens;
    }

    /**
     * Add to each token of an n-gram the tokens found at the other positions
     * of the same n-gram, each weighted by the n-gram count.
     *
     * @param tokens The tokens of the n-gram
     * @param weight The n-gram count (must be positive)
     * @param dict The token dictionary, extended with unseen tokens
     * @param occ The co-occurrence counts indexed by token id
     */
    private static void addCooccurrences(List<String> tokens, int weight, BiMap<String, Integer> dict, Map<Integer, Multiset<Integer>> occ) {
        for (int i = 0; i < tokens.size(); i++) {
            Integer tid = idOf(dict, tokens.get(i));
            Multiset<Integer> neighbours = occ.get(tid);
            if (neighbours == null) {
                neighbours = HashMultiset.create();
                occ.put(tid, neighbours);
            }
            for (int j = 0; j < tokens.size(); j++) {
                if (j != i) {
                    // One weighted insertion replaces "weight" single insertions.
                    neighbours.add(idOf(dict, tokens.get(j)), weight);
                }
            }
        }
    }

    /**
     * Return the id of a token, assigning the next free id to unseen tokens.
     *
     * @param dict The token dictionary
     * @param token The token
     * @return The id of the token
     */
    private static Integer idOf(BiMap<String, Integer> dict, String token) {
        Integer id = dict.get(token);
        if (id == null) {
            // Ids are handed out sequentially from 0, so the size is the next free id.
            id = dict.size();
            dict.put(token, id);
        }
        return id;
    }

    /**
     * Build the co-occurrences matrix of every <code>.gz</code> file stored
     * directly inside the given directory. The output file name is the part
     * of the input file name that precedes the first dot, followed by
     * <code>.occ.gz</code>.
     *
     * @param startingDir The corpus directory containing the n-gram files
     * @throws Exception if the directory cannot be listed or a file cannot
     * be processed
     */
    public void process(File startingDir) throws Exception {
        LOGGER.log(Level.INFO, "Starting dir: {0}", startingDir.getAbsolutePath());
        LOGGER.log(Level.INFO, "Output dir: {0}", outputDir.getAbsolutePath());
        LOGGER.log(Level.INFO, "Lower case: {0}", isToLowerCase());
        LOGGER.log(Level.INFO, "Token regexp: {0}", tokenRegExp);
        File[] files = startingDir.listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a plain file or an I/O error.
            throw new IOException("Cannot list files in: " + startingDir.getAbsolutePath());
        }
        for (File file : files) {
            if (file.isFile() && file.getName().endsWith(".gz")) {
                OccOutput count = count(file);
                String[] splitname = file.getName().split("\\.");
                String filename = splitname[0] + ".occ.gz";
                save(count, filename);
            }
        }
    }

    /**
     * Write the co-occurrence counts to a gzip-compressed, UTF-8 encoded
     * file in the output directory. Tokens without an entry in the counts
     * map are not written.
     *
     * @param count The counts and the dictionary to store
     * @param filename The name of the file created in the output directory
     * @throws IOException if the file cannot be written
     */
    private void save(OccOutput count, String filename) throws IOException {
        BiMap<String, Integer> dict = count.getDict();
        BiMap<Integer, String> tokenOf = dict.inverse();
        Map<Integer, Multiset<Integer>> occ = count.getOcc();
        try (FileOutputStream fileOut = new FileOutputStream(new File(outputDir, filename));
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(fileOut), StandardCharsets.UTF_8))) {
            for (Map.Entry<String, Integer> dictEntry : dict.entrySet()) {
                Multiset<Integer> mset = occ.get(dictEntry.getValue());
                if (mset != null) {
                    writer.append(dictEntry.getKey());
                    for (Entry<Integer> entry : mset.entrySet()) {
                        writer.append("\t").append(tokenOf.get(entry.getElement())).append("\t").append(String.valueOf(entry.getCount()));
                    }
                    writer.newLine();
                }
            }
        }
    }

    /**
     * Command line options accepted by {@link #main(String[])}.
     */
    static Options options;

    /**
     * Parser used to read the command line.
     */
    static CommandLineParser cmdParser = new BasicParser();

    static {
        options = new Options();
        options.addOption("in", true, "The corpus directory containing ngrams")
                .addOption("out", true, "Output directory where output will be stored")
                .addOption("r", true, "Regular expression used to filter tokens (optional, default \".+\")")
                .addOption("sw", true, "Stop word file (optional)")
                .addOption("lower", true, "Enable lower case (default=false)");
    }

    /**
     * Build the co-occurrences matrix given a directory of n-gram files.
     * Options: <code>-in</code> and <code>-out</code> are required; when one
     * is missing the usage message is printed. <code>-r</code>,
     * <code>-sw</code> and <code>-lower</code> are optional.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        try {
            CommandLine cmd = cmdParser.parse(options, args);
            if (cmd.hasOption("in") && cmd.hasOption("out")) {
                try {
                    GBooksOccurrence builder = new GBooksOccurrence();
                    builder.setOutputDir(new File(cmd.getOptionValue("out")));
                    // The option is registered as "sw"; querying any other name
                    // never matches and the stop word file would be ignored.
                    if (cmd.hasOption("sw")) {
                        LOGGER.info("Load stop word...");
                        builder.setSwFilter(new StopWordFilter(OccUtils.loadSet(new File(cmd.getOptionValue("sw")))));
                    }
                    builder.setTokenRegExp(cmd.getOptionValue("r", DEFAULT_TOKEN_REGEXP));
                    builder.setToLowerCase(Boolean.parseBoolean(cmd.getOptionValue("lower", "false")));
                    builder.process(new File(cmd.getOptionValue("in")));
                } catch (Exception ex) {
                    LOGGER.log(Level.SEVERE, null, ex);
                }
            } else {
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.printHelp("Build the co-occurrences matrix given the set of files with ngrams", options, true);
            }
        } catch (ParseException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Result of counting one file: the token dictionary and, for each token
     * id, the multiset of the ids of the tokens it co-occurs with.
     */
    static class OccOutput {

        private Map<Integer, Multiset<Integer>> occ;

        private BiMap<String, Integer> dict;

        /**
         * @param occ The co-occurrence counts indexed by token id
         * @param dict The bidirectional mapping between tokens and ids
         */
        public OccOutput(Map<Integer, Multiset<Integer>> occ, BiMap<String, Integer> dict) {
            this.occ = occ;
            this.dict = dict;
        }

        /**
         * @return The co-occurrence counts indexed by token id
         */
        public Map<Integer, Multiset<Integer>> getOcc() {
            return occ;
        }

        /**
         * @param occ The co-occurrence counts indexed by token id
         */
        public void setOcc(Map<Integer, Multiset<Integer>> occ) {
            this.occ = occ;
        }

        /**
         * @return The bidirectional mapping between tokens and ids
         */
        public BiMap<String, Integer> getDict() {
            return dict;
        }

        /**
         * @param dict The bidirectional mapping between tokens and ids
         */
        public void setDict(BiMap<String, Integer> dict) {
            this.dict = dict;
        }

    }

}