/* LanguageTool, a natural language style checker * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.tools; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Properties; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jetbrains.annotations.Nullable; import morfologik.tools.DictCompile; import morfologik.tools.FSACompile; import morfologik.tools.SerializationFormat; /** * Create a Morfologik binary dictionary from plain text data. */ class DictionaryBuilder { private final Properties props = new Properties(); private static final int FREQ_RANGES_IN = 256; private static final int FREQ_RANGES_OUT = 26; // (A-Z) private static final int FIRST_RANGE_CODE = 65; // character 'A', less frequent words private static final SerializationFormat serializationFormat = SerializationFormat.CFSA2; private final Map<String, Integer> freqList = new HashMap<>(); private final Pattern pFreqEntry = Pattern.compile(".*<w f=\"(\\d+)\" flags=\"(.*)\">(.+)</w>.*"); // Valid for tagger dictionaries (wordform_TAB_lemma_TAB_postag) or spelling dictionaries (wordform) private final Pattern pTaggerEntry = Pattern.compile("^([^\t]+).*$"); private String outputFilename; protected DictionaryBuilder(File infoFile) throws IOException { props.load(new FileInputStream(infoFile)); } protected void setOutputFilename(String outputFilename) { this.outputFilename = outputFilename; } protected String getOutputFilename() { return outputFilename; } protected File buildDict(File inputFile) throws Exception { File outputFile = new File(outputFilename); String infoPath = inputFile.toString().replaceAll("\\.txt$", ".info"); File resultFile = new File (inputFile.toString().replaceAll("\\.txt$", ".dict")); File infoFile = new File(infoPath); // save info file in the same path of input text file and with the same name props.store(new FileOutputStream(infoFile), ""); String[] buildOptions = {"--exit", "false", "-i", inputFile.toString(), "-f", serializationFormat.toString()}; System.out.println("Running Morfologik DictCompile.main with these options: " + Arrays.toString(buildOptions)); DictCompile.main(buildOptions); // move output file to the desired path and name Files.move(resultFile.toPath(), outputFile.toPath(), StandardCopyOption.REPLACE_EXISTING); System.out.println("Done. The binary dictionary has been written to " + outputFile.getAbsolutePath()); return outputFile; } protected File buildFSA(File inputFile) throws Exception { File resultFile = new File(outputFilename); String[] buildOptions = { "--exit", "false", "-i", inputFile.toString(), "-o", resultFile.toString(), "-f", serializationFormat.toString()}; System.out.println("Running Morfologik FSACompile.main with these options: " + Arrays.toString(buildOptions)); FSACompile.main(buildOptions); System.out.println("Done. The binary dictionary has been written to " + resultFile.getAbsolutePath()); return resultFile; } @Nullable protected String getOption(String option) { String property = props.getProperty(option); if (property == null) { return null; } return property.trim(); } protected boolean hasOption(String option) { return props.getProperty(option) != null; } protected boolean isOptionTrue(String option) { return hasOption(option) && "true".equals(getOption(option)); } protected void readFreqList(File freqListFile) { try ( FileInputStream fis = new FileInputStream(freqListFile.getAbsoluteFile()); InputStreamReader reader = new InputStreamReader(fis, "utf-8"); BufferedReader br = new BufferedReader(reader) ) { String line; while ((line = br.readLine()) != null) { Matcher m = pFreqEntry.matcher(line); if (m.matches()) { freqList.put(m.group(3), Integer.parseInt(m.group(1))); } } } catch (IOException e) { throw new RuntimeException("Cannot read file: " + freqListFile.getAbsolutePath()); } } protected File addFreqData(File dictFile, boolean useSeparator) throws IOException { if (!isOptionTrue("fsa.dict.frequency-included")) { throw new IOException("In order to use frequency data add the line 'fsa.dict.frequency-included=true' to the dictionary info file."); } String separator = getOption("fsa.dict.separator"); if (separator == null || separator.trim().isEmpty()) { throw new IOException("A separator character (fsa.dict.separator) must be defined in the dictionary info file."); } File tempFile = File.createTempFile(DictionaryBuilder.class.getSimpleName(), "WithFrequencies.txt"); String encoding = getOption("fsa.dict.encoding"); int freqValuesApplied = 0; try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempFile.getAbsoluteFile()), encoding)); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dictFile.getAbsoluteFile()), encoding))) { String line; int maxFreq = Collections.max(freqList.values()); double maxFreqLog = Math.log(maxFreq); while ((line = br.readLine()) != null) { Matcher m = pTaggerEntry.matcher(line); if (m.matches()) { int freq = 0; String key = m.group(1); if (freqList.containsKey(key)) { freq = freqList.get(key); freqValuesApplied++; } int normalizedFreq = freq; if (freq > 0 && maxFreq > 255) { double freqZeroToOne = Math.log(freq) / maxFreqLog; // spread number better over the range normalizedFreq = (int) (freqZeroToOne * (FREQ_RANGES_IN-1)); // 0 to 255 } if (normalizedFreq < 0 || normalizedFreq > 255) { throw new RuntimeException("Frequency out of range (0-255): " + normalizedFreq + " in word " + key); } // Convert integers 0-255 to ranges A-Z, and write output String freqChar = Character.toString((char) (FIRST_RANGE_CODE + normalizedFreq*FREQ_RANGES_OUT/FREQ_RANGES_IN)); //add separator only in speller dictionaries if (useSeparator) { bw.write(line + separator + freqChar + "\n"); } else { bw.write(line + freqChar + "\n"); } } } System.out.println(freqList.size() + " frequency values applied in " + freqValuesApplied + " word forms."); } catch (IOException e) { throw new RuntimeException("Cannot read file: " + dictFile.getAbsolutePath()); } tempFile.deleteOnExit(); return tempFile; } protected File convertTabToSeparator(File inputFile) throws RuntimeException, IOException { File outputFile = File.createTempFile( DictionaryBuilder.class.getSimpleName() + "_separator", ".txt"); String separator = getOption("fsa.dict.separator"); if (separator == null || separator.trim().isEmpty()) { throw new IOException( "A separator character (fsa.dict.separator) must be defined in the dictionary info file."); } String encoding = getOption("fsa.dict.encoding"); try (Scanner scanner = new Scanner(inputFile, encoding); Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoding))) { while (scanner.hasNextLine()) { String line = scanner.nextLine(); String[] parts = line.split("\t"); if (parts.length == 3) { out.write(parts[1] + separator + parts[0] + separator + parts[2]); out.write("\n"); } else { System.err .println("Invalid input, expected three tab-separated columns in " + inputFile + ": " + line + " => ignoring"); } } scanner.close(); } return outputFile; } }