/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2010 Alex Buloichik
2015 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.gui.glossary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.omegat.util.EncodingDetector;
/**
* Reader for comma-separated values (CSV) glossaries.
*
* @author Keith Godfrey
* @author Maxym Mykhalchuk
* @author Alex Buloichik <alex73mail@gmail.com>
* @author Aaron Madlon-Kay
*/
public final class GlossaryReaderCSV {
    /** Fields separator. Can be dependent of regional options. */
    protected static final char SEPARATOR = ',';

    /** Utility class; not meant to be instantiated. */
    private GlossaryReaderCSV() {
    }

    /**
     * Reads glossary entries from a CSV file, one entry per line.
     * <p>
     * The file encoding is obtained from
     * {@code EncodingDetector.detectEncodingDefault}, with UTF-8 passed as the
     * default. A leading byte order mark is skipped. Lines starting with
     * {@code #} are ignored, as are lines that have fewer than two fields or
     * an empty first field. The first two fields and the optional third field
     * (empty string when absent) are passed to the {@code GlossaryEntry}
     * constructor -- presumably source term, translation and comment; confirm
     * against {@code GlossaryEntry}. Duplicate entries are not filtered out.
     *
     * @param file
     *            CSV glossary file to read
     * @param priorityGlossary
     *            flag passed through unchanged to every created entry
     * @return entries in file order; empty if no line yields an entry
     * @throws IOException
     *             if the file cannot be opened or read, or the detected
     *             encoding is not supported
     */
    public static List<GlossaryEntry> read(final File file, boolean priorityGlossary) throws IOException {
        String encoding = EncodingDetector.detectEncodingDefault(file, StandardCharsets.UTF_8.name());
        List<GlossaryEntry> result = new ArrayList<>();
        // The stream is a separate resource so that it is closed even when the
        // InputStreamReader constructor rejects the encoding name.
        try (FileInputStream fis = new FileInputStream(file);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis, encoding))) {
            // BOM (byte order mark) bugfix: consume the first char only if it
            // is U+FEFF, otherwise rewind so it is read as part of line one.
            in.mark(1);
            int ch = in.read();
            if (ch != 0xFEFF) {
                in.reset();
            }
            for (String s = in.readLine(); s != null; s = in.readLine()) {
                // skip lines that start with '#'
                if (s.startsWith("#")) {
                    continue;
                }
                // divide the line into fields on the separator
                String[] tokens = parseLine(s);
                // an entry needs at least two fields and a non-empty first one
                if (tokens.length < 2 || tokens[0].isEmpty()) {
                    continue;
                }
                // create the glossary entry and add it to the result list
                // (even if an identical one is already there!)
                String comment = "";
                if (tokens.length >= 3) {
                    comment = tokens[2];
                }
                result.add(new GlossaryEntry(tokens[0], tokens[1], comment, priorityGlossary));
            }
        }
        return result;
    }

    /**
     * Splits one line into fields on {@link #SEPARATOR}.
     * <p>
     * A double quote at the very start of a field opens a quoted field, in
     * which separators are kept as literal text. Two consecutive double quotes
     * produce one literal quote. Any other double quote ends the quoted state
     * and is dropped. The line is processed by code point, so supplementary
     * characters are kept intact.
     *
     * @param line
     *            a single line without its line terminator
     * @return the fields of the line; always at least one element (a single
     *         empty string for an empty line)
     */
    private static String[] parseLine(String line) {
        List<String> result = new ArrayList<>();
        StringBuilder w = new StringBuilder();
        boolean fopened = false; // field opened by "
        int i = 0;
        while (i < line.length()) {
            int cp = line.codePointAt(i);
            int next = i + Character.charCount(cp);
            // look ahead one code point; 0 means "end of line"
            int cpn = next < line.length() ? line.codePointAt(next) : 0;
            switch (cp) {
            case '"':
                if (w.length() == 0 && !fopened) {
                    // first " in field
                    fopened = true;
                } else if (cpn == '"') {
                    // double " - add one and skip the second (always one char)
                    w.appendCodePoint(cp);
                    next++;
                } else {
                    // last " in field
                    fopened = false;
                }
                break;
            case SEPARATOR:
                if (fopened) {
                    // separator inside a quoted field is literal text
                    w.appendCodePoint(cp);
                } else {
                    result.add(w.toString());
                    w.setLength(0);
                }
                break;
            default:
                w.appendCodePoint(cp);
                break;
            }
            i = next;
        }
        // the last field has no trailing separator
        result.add(w.toString());
        return result.toArray(new String[0]);
    }
}