/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.lemmatizer.parser; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import no.trank.openpipe.lemmatizer.model.LemmaSuffix; import no.trank.openpipe.lemmatizer.model.LemmatizeModel; /** * @version $Revision$ */ public class TextParser implements Parser { private static final Logger log = LoggerFactory.getLogger(TextParser.class); @Override public void parse(Reader in, LemmatizeModel model) throws IOException { try { final LineIterator lineIt = new LineIterator(new FastBufferedReader(in)); while (lineIt.hasNext()) { final MutableString line = lineIt.next().trim(); if (line.length() > 0 && Character.isLetterOrDigit(line.charAt(0))) { final int tEndIdx = line.indexOf('\t'); if (tEndIdx > 0) { final CharSequence term = line.subSequence(0, tEndIdx); try { model.add(term, parseSuffixes(line, tEndIdx + 1)); } catch (Exception e) { log.error("Trouble with line '" + line + '\'', e); } } } } } finally { try { in.close(); } catch (IOException e) { // Ignoring } } } public static List<LemmaSuffix> parseSuffixes(CharSequence line, int idx) { final int len = line.length(); final List<LemmaSuffix> suffixes = new ArrayList<LemmaSuffix>(); while (idx < len) { char c = line.charAt(idx++); int cut = c - '0'; while (idx < len && isDigit(c = line.charAt(idx++))) { cut += cut * 10 + c - '0'; } final int sIdx = isDigit(c) ? idx : idx - 1; while (idx < len && c != '\t') { c = line.charAt(idx++); } suffixes.add(new LemmaSuffix(cut, line.subSequence(sIdx, c == '\t' ? idx - 1 : idx))); } return suffixes; } private static boolean isDigit(final char c) { return c >= '0' && c <= '9'; } @SuppressWarnings({"UseOfSystemOutOrSystemErr"}) public static void main(String[] args) throws IOException { if (args.length < 2) { System.err.println("Uasge: TextParser <input> <output>"); System.exit(-1); } final LemmatizeModel model = new LemmatizeModel(); new TextParser().parse(createReader(args[0]), model); model.log(); final FileOutputStream fout = new FileOutputStream(args[1]); final OutputStream out; if (isGzip(args[1])) { out = new GZIPOutputStream(fout); } else { out = fout; } try { model.write(out); } finally { try { out.close(); } catch (IOException e) { // Ignoring } } } private static Reader createReader(String fileName) throws IOException { if (isGzip(fileName)) { return new InputStreamReader(new GZIPInputStream(new FileInputStream(fileName))); } return new FileReader(fileName); } private static boolean isGzip(String fileName) { return fileName.endsWith(".gz"); } }