/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Created by bgalbs on Jan 30, 2003 at 11:38:39 PM */
package com.swabunga.spell.engine;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Vector;
/**
* An implementation of <code>SpellDictionary</code> that doesn't cache any words in memory. Avoids the huge footprint of
* <code>SpellDictionaryHashMap</code> at the cost of relatively minor latency. A later version of this class might implement some
* caching strategies, if there's any demand for it.
* <p>
* This class makes use of the "classic" Java IO library (java.io). However, it could probably benefit from the new IO APIs (java.nio) and
* it is anticipated that a future version of this class, probably called <code>SpellDictionaryDiskNIO</code> will appear at some point.
*
* @author Ben Galbraith (ben@galbraiths.org)
* @version 0.1
* @since 0.5
*/
public final class SpellDictionaryDisk extends SpellDictionaryASpell {
/** Name of the sub-directory of the base directory that holds the plain word-list files. */
private static final String DIRECTORY_WORDS = "words";
/** Name of the sub-directory of the base directory that holds the generated database files. */
private static final String DIRECTORY_DB = "db";
/** Name of the db file listing "[filename],[size]" for every word file used by the last build. */
private static final String FILE_CONTENTS = "contents";
/** Name of the db file holding one "code,word" line per word, sorted by code. */
private static final String FILE_DB = "words.db";
/** Name of the db file holding one "indexCode,position,length" line per index entry. */
private static final String FILE_INDEX = "words.idx";
/* maximum number of words an index entry can represent */
private static final int INDEX_SIZE_MAX = 200;
/** Base directory handed to the constructor. */
private final File base;
/** Directory containing the word-list files (base/words). */
private final File words;
/** Directory containing the generated database files (base/db). */
private final File db;
/*
 * maps an index code to { byte position, byte length } of its region in the db file.
 * volatile because it may be assigned by the background build thread and read by callers.
 */
private volatile Map<String, int[]> index;
/*
 * true once the index has been loaded. volatile because it may be set by the background
 * build thread while other threads poll isReady().
 */
protected volatile boolean ready;
/* used at time of creation of index to speed up determining the number of words per index entry */
private List<String> indexCodeCache = null;
/**
 * Opens a disk-backed dictionary rooted at <code>base</code>, rebuilding the word database first if the word files have
 * changed since the last build.
 * <p>
 * NOTE: Do *not* create two instances of this class pointing to the same <code>File</code> unless you are sure that a new dictionary
 * does not have to be created. In the future, some sort of external locking mechanism may be created that handles this scenario
 * gracefully.
 *
 * @param base
 *            the base directory in which <code>SpellDictionaryDisk</code> can expect to find its necessary files
 * @param phonetic
 *            passed unchanged to the <code>SpellDictionaryASpell</code> constructor (presumably the phonetic rules file --
 *            see that class for details)
 * @param block
 *            if a new word db needs to be created, there can be a considerable delay before the constructor returns. If block is true,
 *            this method will block while the db is created and return when done. If block is false, this method will create a thread
 *            to create the new dictionary and return immediately; poll {@link #isReady()} to find out when it has finished.
 * @throws FileNotFoundException
 *             if the base directory or its word directory does not exist
 * @throws IOException
 *             if the db directory cannot be created, or the database cannot be built or loaded
 */
public SpellDictionaryDisk(File base, File phonetic, boolean block) throws FileNotFoundException, IOException {
    super(phonetic);
    this.ready = false;
    this.base = base;
    this.words = new File(base, DIRECTORY_WORDS);
    this.db = new File(base, DIRECTORY_DB);
    if (!this.base.exists()) {
        throw new FileNotFoundException("Couldn't find required path '" + this.base + "'");
    }
    if (!this.words.exists()) {
        throw new FileNotFoundException("Couldn't find required path '" + this.words + "'");
    }
    // mkdirs() reports failure through its return value only; the isDirectory() re-check
    // tolerates the directory having been created concurrently by someone else.
    if (!this.db.exists() && !this.db.mkdirs() && !this.db.isDirectory()) {
        throw new IOException("Couldn't create required path '" + this.db + "'");
    }
    if (!newDictionaryFiles()) {
        // the existing database is up to date: just load its index
        loadIndex();
        ready = true;
        return;
    }
    if (block) {
        buildNewDictionaryDatabase();
        loadIndex();
        ready = true;
    } else {
        Thread builder = new Thread() {
            @Override
            public void run() {
                try {
                    buildNewDictionaryDatabase();
                    loadIndex();
                    ready = true;
                } catch (Exception e) {
                    // nobody to report to on this thread; ready stays false
                    e.printStackTrace();
                }
            }
        };
        builder.start();
    }
}
/**
 * Rebuilds the on-disk database from the word files: merges them into one sorted temporary file, writes the code db and
 * its index from that file, then records the word files used in the contents file.
 *
 * @throws FileNotFoundException
 *             if a required file cannot be opened
 * @throws IOException
 *             if reading or writing any of the files fails
 */
protected void buildNewDictionaryDatabase() throws FileNotFoundException, IOException {
    /* combine all dictionary files into one sorted file */
    File sortedFile = buildSortedFile();
    try {
        /* create the db for the sorted file */
        buildCodeDb(sortedFile);
    } finally {
        // always remove the temporary file, even when building the db failed
        if (!sortedFile.delete()) {
            sortedFile.deleteOnExit();
        }
    }
    /* build contents file */
    buildContentsFile();
}
/**
 * Not supported by this dictionary.
 *
 * @param word
 *            ignored
 * @throws UnsupportedOperationException
 *             always
 */
@Override
public void addWord(String word) {
    final String message = "addWord not yet implemented (sorry)";
    throw new UnsupportedOperationException(message);
}
/**
 * Returns all words stored in the db whose code equals <code>code</code>.
 * <p>
 * Only the db region that the index associates with the code (or with its longest indexed prefix) is read. Failures while
 * reading are printed to standard error and whatever was collected so far (possibly nothing) is returned.
 *
 * @param code
 *            the code to look up
 * @return the matching words; empty if there are none or the index has no entry for the code
 */
@Override
public List<String> getWords(String code) {
    List<String> words = new Vector<String>();
    int[] posLen = getStartPosAndLen(code);
    if (posLen == null) {
        return words;
    }
    try {
        String data = readDbText(posLen[0], posLen[1]);
        String[] lines = split(data, "\n");
        for (int i = 0; i < lines.length; i++) {
            // each line is "code,word"; skip anything that does not have both parts
            String[] s = split(lines[i], ",");
            if (s.length >= 2 && s[0].equals(code)) {
                words.add(s[1]);
            }
        }
    } catch (Exception e) {
        // deliberate best effort: a lookup failure must not break spell checking
        e.printStackTrace();
    }
    return words;
}

/*
 * Reads up to length bytes of the db file starting at byte offset position and decodes them with the
 * platform default charset (the same default that getBytes() uses when the db is written).
 * If the file ends early, only the bytes actually available are decoded.
 */
private String readDbText(int position, int length) throws IOException {
    byte[] bytes = new byte[length];
    int filled = 0;
    InputStream input = new FileInputStream(new File(db, FILE_DB));
    try {
        // skip() may skip fewer bytes than requested, so keep going until we are in place
        long remaining = position;
        boolean eof = false;
        while (remaining > 0 && !eof) {
            long skipped = input.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (input.read() < 0) {
                eof = true;
            } else {
                remaining--;
            }
        }
        // read() may likewise return fewer bytes than requested
        while (!eof && filled < length) {
            int n = input.read(bytes, filled, length - filled);
            if (n < 0) {
                eof = true;
            } else {
                filled += n;
            }
        }
    } finally {
        input.close();
    }
    return new String(bytes, 0, filled);
}
/**
 * Reports the value of the <code>ready</code> flag, which the constructor (or its background build thread) sets once the
 * index has been loaded.
 *
 * @return the current value of the <code>ready</code> flag
 */
public boolean isReady() {
    return this.ready;
}
/**
 * Decides whether the database has to be (re)built.
 * <p>
 * A rebuild is needed when the db or index file is missing, when the contents file cannot be parsed, or when the names and
 * sizes recorded in the contents file differ from the files currently present in the word directory.
 *
 * @return <code>true</code> if the database must be rebuilt
 * @throws FileNotFoundException
 *             if the contents file disappears between the existence check and opening it
 * @throws IOException
 *             if the contents file cannot be read or the word directory cannot be listed
 */
private boolean newDictionaryFiles() throws FileNotFoundException, IOException {
    /* without both generated files there is nothing usable to load */
    if (!new File(db, FILE_DB).exists() || !new File(db, FILE_INDEX).exists()) {
        return true;
    }
    /* load in contents file, which indicates the files and sizes of the last db build */
    List<FileSize> contents = new ArrayList<FileSize>();
    File c = new File(db, FILE_CONTENTS);
    if (c.exists()) {
        BufferedReader reader = new BufferedReader(new FileReader(c));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // format of file should be [filename],[size]
                String[] s = split(line, ",");
                if (s.length != 2) {
                    // unreadable record: the previous build cannot be trusted
                    return true;
                }
                try {
                    // sizes are written from File.length(), which is a long
                    contents.add(new FileSize(s[0], Long.parseLong(s[1])));
                } catch (NumberFormatException ignored) {
                    // unparsable size: same treatment as any other malformed record
                    return true;
                }
            }
        } finally {
            reader.close();
        }
    }
    /* compare this to the actual directory */
    File[] wordFiles = words.listFiles();
    if (wordFiles == null) {
        throw new IOException("Couldn't list word files in '" + words + "'");
    }
    if (contents.size() != wordFiles.length) {
        // if the size of the contents list and the number of word files are different, it
        // means we've definitely got to reindex
        return true;
    }
    // check and make sure that all the word files haven't changed on us
    for (int i = 0; i < wordFiles.length; i++) {
        FileSize fs = new FileSize(wordFiles[i].getName(), wordFiles[i].length());
        if (!contents.contains(fs)) {
            return true;
        }
    }
    return false;
}
/**
 * Merges every file in the word directory into a single temporary file containing the distinct, trimmed, non-empty lines
 * in sorted order, one word per line.
 *
 * @return the temporary file; the caller is responsible for deleting it
 * @throws FileNotFoundException
 *             if a word file cannot be opened
 * @throws IOException
 *             if the word directory cannot be listed or reading/writing fails
 */
private File buildSortedFile() throws FileNotFoundException, IOException {
    List<String> w = new ArrayList<String>();
    /*
     * read every single word into the list. eeek. if this causes problems,
     * we may wish to explore disk-based sorting or more efficient memory-based storage
     */
    File[] wordFiles = words.listFiles();
    if (wordFiles == null) {
        throw new IOException("Couldn't list word files in '" + words + "'");
    }
    for (int i = 0; i < wordFiles.length; i++) {
        BufferedReader r = new BufferedReader(new FileReader(wordFiles[i]));
        try {
            String line;
            while ((line = r.readLine()) != null) {
                // trim before the emptiness test so whitespace-only lines are dropped too
                String word = line.trim();
                if (word.length() > 0) {
                    w.add(word);
                }
            }
        } finally {
            r.close();
        }
    }
    Collections.sort(w);
    // FIXME - error handling for running out of disk space would be nice.
    File file = File.createTempFile("jazzy", "sorted");
    boolean written = false;
    try {
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        try {
            // the list is sorted, so duplicates are adjacent and can be dropped by comparing neighbours
            String prev = null;
            for (int i = 0; i < w.size(); i++) {
                String word = w.get(i);
                if (prev == null || !prev.equals(word)) {
                    writer.write(word);
                    writer.newLine();
                }
                prev = word;
            }
        } finally {
            writer.close();
        }
        written = true;
    } finally {
        if (!written) {
            // do not leave a half-written temp file behind
            file.delete();
        }
    }
    return file;
}
/**
 * Writes the code db and its index from a file of words (one per line).
 * <p>
 * Every word is paired with its code, the pairs are sorted by code and written to the db file as "code,word" lines.
 * Consecutive lines that share the same index code (see <code>getIndexCode</code>) form one region; for every region an
 * "indexCode,position,length" line (byte offsets into the db file) is written to the index file.
 *
 * @param sortedWords
 *            the file to read the words from
 * @throws FileNotFoundException
 *             if one of the files cannot be opened
 * @throws IOException
 *             if reading or writing fails
 */
private void buildCodeDb(File sortedWords) throws FileNotFoundException, IOException {
    // start from an empty prefix cache so nothing from an earlier build leaks into this one
    indexCodeCache = null;
    try {
        List<CodeWord> codeList = new ArrayList<CodeWord>();
        BufferedReader reader = new BufferedReader(new FileReader(sortedWords));
        try {
            String word;
            while ((word = reader.readLine()) != null) {
                codeList.add(new CodeWord(this.getCode(word), word));
            }
        } finally {
            reader.close();
        }
        Collections.sort(codeList);

        List<String> indexLines = new ArrayList<String>();
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(new File(db, FILE_DB)));
        try {
            String currentCode = null;
            int currentPosition = 0;
            int currentLength = 0;
            for (int i = 0; i < codeList.size(); i++) {
                CodeWord cw = codeList.get(i);
                String thisCode = getIndexCode(cw.getCode(), codeList);
                // encoded with the platform default charset; lengths in the index are byte counts
                byte[] bytes = (cw.getCode() + "," + cw.getWord() + "\n").getBytes();
                if (currentCode == null) {
                    currentCode = thisCode;
                }
                if (!currentCode.equals(thisCode)) {
                    // the region for currentCode is complete: record it and start the next one
                    indexLines.add(currentCode + "," + currentPosition + "," + currentLength);
                    currentPosition += currentLength;
                    currentLength = bytes.length;
                    currentCode = thisCode;
                } else {
                    currentLength += bytes.length;
                }
                out.write(bytes);
            }
            // Output the last region. It legitimately starts at position 0 when the whole
            // dictionary fits into a single region, so the position must not be tested here.
            if (currentCode != null && currentLength != 0) {
                indexLines.add(currentCode + "," + currentPosition + "," + currentLength);
            }
        } finally {
            out.close();
        }

        BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_INDEX)));
        try {
            for (int i = 0; i < indexLines.size(); i++) {
                writer.write(indexLines.get(i));
                writer.newLine();
            }
        } finally {
            writer.close();
        }
    } finally {
        // the cache is only useful while building; let it be garbage collected
        indexCodeCache = null;
    }
}
/**
 * Records the name and size of every file in the word directory in the contents file, one "[filename],[size]" line per
 * file. If the word directory is empty the contents file is deleted instead.
 *
 * @throws IOException
 *             if the word directory cannot be listed or the contents file cannot be written
 */
private void buildContentsFile() throws IOException {
    File[] wordFiles = words.listFiles();
    if (wordFiles == null) {
        throw new IOException("Couldn't list word files in '" + words + "'");
    }
    File contentsFile = new File(db, FILE_CONTENTS);
    if (wordFiles.length == 0) {
        contentsFile.delete();
        return;
    }
    BufferedWriter writer = new BufferedWriter(new FileWriter(contentsFile));
    try {
        for (int i = 0; i < wordFiles.length; i++) {
            writer.write(wordFiles[i].getName());
            writer.write(",");
            writer.write(String.valueOf(wordFiles[i].length()));
            writer.newLine();
        }
    } finally {
        writer.close();
    }
}
/**
 * Reads the index file into memory. Each line has the form "indexCode,position,length".
 * <p>
 * The map is filled completely before it is assigned to the <code>index</code> field, so a failed or partial load never
 * replaces a previously loaded index.
 *
 * @throws IOException
 *             if the index file cannot be read or contains a malformed line
 */
protected void loadIndex() throws IOException {
    Map<String, int[]> loaded = new HashMap<String, int[]>();
    File idx = new File(db, FILE_INDEX);
    BufferedReader reader = new BufferedReader(new FileReader(idx));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith(",")) {
                // entry for an empty code: lookups never query the empty string, so it is unreachable anyway
                continue;
            }
            String[] fields = split(line, ",");
            if (fields.length != 3) {
                throw new IOException("Malformed index entry '" + line + "' in '" + idx + "'");
            }
            try {
                loaded.put(fields[0], new int[] { Integer.parseInt(fields[1]), Integer.parseInt(fields[2]) });
            } catch (NumberFormatException e) {
                throw new IOException("Malformed index entry '" + line + "' in '" + idx + "'", e);
            }
        }
    } finally {
        reader.close();
    }
    index = loaded;
}
/**
 * Finds the index entry for the longest prefix of <code>code</code> (including the code itself) that is present in the
 * index.
 *
 * @param code
 *            the code to look up
 * @return a two-element array { position, length }, or <code>null</code> if no prefix of the code is indexed, the code is
 *         <code>null</code>, or no index has been loaded yet
 */
private int[] getStartPosAndLen(String code) {
    // read the field once: it may still be unset while a non-blocking build is in progress
    Map<String, int[]> current = index;
    if (current == null || code == null) {
        return null;
    }
    for (int end = code.length(); end > 0; end--) {
        int[] posLen = current.get(code.substring(0, end));
        if (posLen != null) {
            return posLen;
        }
    }
    return null;
}
/**
 * Determines the index code under which <code>code</code> is filed: the shortest proper prefix of the code that is shared
 * by at most <code>INDEX_SIZE_MAX</code> entries of <code>codes</code>, or the code itself if no proper prefix is that
 * selective. Codes of length 0 or 1 are their own index code.
 * <p>
 * Prefixes that were found this way are remembered in <code>indexCodeCache</code> so later codes starting with them are
 * answered without counting again.
 *
 * @param code
 *            the code to find the index code for
 * @param codes
 *            all code/word pairs, sorted by code (binary search is used on this list)
 * @return the index code for <code>code</code>
 */
private String getIndexCode(String code, List<CodeWord> codes) {
    if (indexCodeCache == null) {
        indexCodeCache = new ArrayList<String>();
    }
    if (code.length() <= 1) {
        return code;
    }
    for (int i = 0; i < indexCodeCache.size(); i++) {
        String cached = indexCodeCache.get(i);
        if (code.startsWith(cached)) {
            return cached;
        }
    }
    for (int len = 1; len < code.length(); len++) {
        String prefix = code.substring(0, len);
        if (countCodesWithPrefix(prefix, codes) <= INDEX_SIZE_MAX) {
            indexCodeCache.add(prefix);
            return prefix;
        }
    }
    // no proper prefix is selective enough; the full code is used and deliberately not cached
    return code;
}

/*
 * Counts the entries of codes (sorted by code) whose code starts with prefix. Counting stops as soon as the
 * count exceeds INDEX_SIZE_MAX, so the result is only exact up to INDEX_SIZE_MAX + 1.
 */
private int countCodesWithPrefix(String prefix, List<CodeWord> codes) {
    int start = Collections.binarySearch(codes, new CodeWord(prefix, ""));
    if (start < 0) {
        // not found: the insertion point is where the first code >= prefix lives
        start = -start - 1;
    } else {
        // found: binarySearch may land anywhere in a run of equal codes, so rewind to the first one
        while (start > 0 && codes.get(start - 1).getCode().equals(prefix)) {
            start--;
        }
    }
    int count = 0;
    for (int i = start; i < codes.size(); i++) {
        // codes sharing the prefix are contiguous in a sorted list; the first miss ends the run
        if (!codes.get(i).getCode().startsWith(prefix)) {
            break;
        }
        count++;
        if (count > INDEX_SIZE_MAX) {
            break;
        }
    }
    return count;
}
/**
 * Breaks <code>input</code> into tokens using a <code>StringTokenizer</code>, so every character of
 * <code>delimiter</code> acts as a separator and empty tokens are never produced.
 *
 * @param input
 *            the text to split
 * @param delimiter
 *            the delimiter characters
 * @return the tokens in the order they occur; an empty array if there are none
 */
private static String[] split(String input, String delimiter) {
    StringTokenizer tokenizer = new StringTokenizer(input, delimiter);
    List<String> tokens = new ArrayList<String>();
    while (tokenizer.hasMoreTokens()) {
        tokens.add(tokenizer.nextToken());
    }
    return tokens.toArray(new String[tokens.size()]);
}
private class CodeWord implements Comparable<CodeWord> {
private String code;
private String word;
public CodeWord(String code, String word) {
this.code = code;
this.word = word;
}
public String getCode() {
return code;
}
public String getWord() {
return word;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof CodeWord)) {
return false;
}
final CodeWord codeWord = (CodeWord) o;
if (!word.equals(codeWord.word)) {
return false;
}
return true;
}
@Override
public int hashCode() {
return word.hashCode();
}
@Override
public int compareTo(CodeWord o) {
return code.compareTo(o.getCode());
}
}
private class FileSize {
private String filename;
private long size;
public FileSize(String filename, long size) {
this.filename = filename;
this.size = size;
}
public String getFilename() {
return filename;
}
public long getSize() {
return size;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof FileSize)) {
return false;
}
final FileSize fileSize = (FileSize) o;
if (size != fileSize.size) {
return false;
}
if (!filename.equals(fileSize.filename)) {
return false;
}
return true;
}
@Override
public int hashCode() {
int result;
result = filename.hashCode();
result = (int) (29 * result + size);
return result;
}
}
}