/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.languagetool.JLanguageTool;
import org.languagetool.rules.ConfusionSet;
import org.languagetool.rules.ConfusionSetLoader;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
 * Generate URLs to download the parts of Google's ngram corpus (v2)
 * that we need to cover the confusion set.
 *
 * <p>{@link #main(String[])} prints the URLs of all 4gram part files, while
 * {@link #mainDownloadSome(String[])} prints only the URLs of those 2gram part
 * files whose name matches the beginning of a key of the English homophone
 * confusion set.
 * @since 2.7
 */
@SuppressWarnings("DynamicRegexReplaceableByCompiledPattern")
final class NGramUrlGenerator {

  private NGramUrlGenerator() {}

  /**
   * Prints one URL per line to stdout for every 4gram part file: the digits
   * {@code 0} to {@code 9}, every two-character combination of a letter
   * {@code a-z} followed by a letter {@code a-z} or an underscore, and finally
   * {@code punctuation}.
   *
   * @param args ignored
   * @throws IOException never thrown by this implementation, kept for compatibility
   */
  public static void main(String[] args) throws IOException {
    String url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-4gram-20120701-<XX>.gz";
    String chars = "abcdefghijklmnopqrstuvwxyz";
    // the second character may additionally be an underscore:
    String chars2 = "abcdefghijklmnopqrstuvwxyz_";
    for (int i = 0; i <= 9; i++) {
      System.out.println(fileUrl(url, String.valueOf(i)));
    }
    for (int i = 0; i < chars.length(); i++) {
      for (int j = 0; j < chars2.length(); j++) {
        String name = String.valueOf(chars.charAt(i)) + chars2.charAt(j);
        System.out.println(fileUrl(url, name));
      }
    }
    System.out.println(fileUrl(url, "punctuation"));
  }

  /**
   * Loads the confusion set from {@code /en/homophones.txt} and prints, sorted
   * and without duplicates, one 2gram URL per line to stdout for the lowercased
   * beginning of each key: the first two characters, or the single character
   * followed by an underscore for keys of length one. The number of URLs is
   * printed to stderr.
   *
   * @param args ignored
   * @throws IOException if loading the confusion set or closing the stream fails
   */
  public static void mainDownloadSome(String[] args) throws IOException {
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    Map<String,List<ConfusionSet>> map;
    // try-with-resources so the resource stream is always closed, even on failure:
    try (InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt")) {
      map = confusionSetLoader.loadConfusionSet(inputStream);
    }
    String url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-2gram-20120701-<XX>.gz";
    // TreeSet removes duplicates and keeps the names in natural (sorted) order:
    Set<String> names = new TreeSet<>();
    for (String s : map.keySet()) {
      if (s.isEmpty()) {
        // an empty key has no prefix to derive a file name from
        continue;
      }
      // Locale.ROOT keeps the lowercasing independent of the JVM's default
      // locale (e.g. Turkish would map "I" to a dotless "ı"):
      if (s.length() < 2) {
        names.add(s.substring(0, 1).toLowerCase(Locale.ROOT) + "_");
      } else {
        names.add(s.substring(0, 2).toLowerCase(Locale.ROOT));
      }
    }
    for (String name : names) {
      System.out.println(fileUrl(url, name));
    }
    System.err.println("Number of files: " + names.size());
  }

  /** Returns {@code urlTemplate} with its {@code <XX>} placeholder replaced by {@code part}. */
  private static String fileUrl(String urlTemplate, String part) {
    return urlTemplate.replace("<XX>", part);
  }

}