// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package marmot.morph.io;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import marmot.core.Sequence;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.util.Converter;
import marmot.util.LineIterator;

/**
 * Iterates over the sentences of a column-formatted input described by a
 * {@link FileOptions} object.
 *
 * <p>
 * Rows are pulled from a {@link LineIterator}; every row is a list of column
 * strings and an empty row terminates the current sentence. The columns that
 * hold the form, lemma, tag, morphological tag and token features are selected
 * by the indexes stored in the options.
 */
public class SentenceReader implements Iterable<Sequence> {

    private final FileOptions options_;

    /**
     * Creates a reader from an option string, which is handed to the
     * {@link FileOptions} constructor.
     *
     * @param option_string textual description of the input
     */
    public SentenceReader(String option_string) {
        this(new FileOptions(option_string));
    }

    /**
     * Creates a reader from already constructed options.
     *
     * @param options the options describing the input
     */
    public SentenceReader(FileOptions options) {
        options_ = options;
    }

    /**
     * Returns a fresh iterator that opens the input stream of the options and
     * yields one {@link Sentence} per call to {@code next()}. The iterator
     * does not support {@code remove()}.
     */
    @Override
    public Iterator<Sequence> iterator() {
        return new Iterator<Sequence>() {

            /** Number of tokens returned so far; compared against the limit. */
            int number_ = 0;

            final LineIterator line_iterator_ = new LineIterator(
                    options_.getInputStream());

            /**
             * Reads rows until an empty row or the end of the input and
             * returns them as one sentence. If no rows were collected a
             * warning is printed to standard error and an empty sentence is
             * returned.
             *
             * @throws NoSuchElementException if {@link #hasNext()} is false
             * @throws RuntimeException if a required column is missing from a
             *         row or a feature weight cannot be parsed
             */
            @Override
            public Sequence next() {
                int form_index = options_.getFormIndex();
                int lemma_index = options_.getLemmaIndex();
                int tag_index = options_.getTagIndex();
                int morph_index = options_.getMorphIndex();
                List<Integer> token_feature_indexes = options_
                        .getTokenFeatureIndex();

                if (!hasNext()) {
                    throw new NoSuchElementException();
                }

                List<Word> tokens = new LinkedList<Word>();

                while (line_iterator_.hasNext()) {
                    List<String> row = line_iterator_.next();

                    // An empty row marks the end of the current sentence.
                    if (row.isEmpty()) {
                        break;
                    }

                    // Only the form column is mandatory; the other columns
                    // yield null when their index is negative.
                    String word = check_index(form_index, "form_index", row,
                            true);
                    String lemma = check_index(lemma_index, "lemma_index",
                            row, false);
                    String tag = check_index(tag_index, "tag_index", row,
                            false);
                    String morph = check_index(morph_index, "morph_index",
                            row, false);

                    // The lists are created lazily so that tokens without
                    // features pass null on to the Converter.
                    List<String> token_feature_list = null;
                    List<String> weighted_token_feature_list = null;
                    List<Double> weighted_token_feature_weight_list = null;

                    for (int token_feature_index : token_feature_indexes) {
                        // Feature columns that are out of range for this row
                        // are skipped silently.
                        if (token_feature_index < 0
                                || token_feature_index >= row.size()) {
                            continue;
                        }

                        String[] token_features = row.get(token_feature_index)
                                .split("#");

                        for (String token_feature : token_features) {
                            // "name:weight" denotes a weighted feature. The
                            // split happens at the first colon; a colon at
                            // position 0 is not treated as a separator.
                            int colon_index = token_feature.indexOf(':');
                            Double weight = null;

                            if (colon_index > 0) {
                                try {
                                    weight = Double.parseDouble(token_feature
                                            .substring(colon_index + 1));
                                    token_feature = token_feature.substring(0,
                                            colon_index);
                                } catch (NumberFormatException e) {
                                    // Keep the original exception as the
                                    // cause so the parse failure stays
                                    // visible in the stack trace.
                                    throw new RuntimeException(
                                            "Cannot parse double. If this wasn't meant to be a float feature then replace the colon: "
                                                    + token_feature, e);
                                }
                            }

                            if (weight != null) {
                                if (weighted_token_feature_list == null) {
                                    weighted_token_feature_list = new LinkedList<String>();
                                    weighted_token_feature_weight_list = new LinkedList<Double>();
                                }
                                weighted_token_feature_list.add(token_feature);
                                weighted_token_feature_weight_list.add(weight);
                            } else {
                                if (token_feature_list == null) {
                                    token_feature_list = new LinkedList<String>();
                                }
                                token_feature_list.add(token_feature);
                            }
                        }
                    }

                    tokens.add(new Word(
                            word,
                            lemma,
                            tag,
                            morph,
                            Converter.toStringArray(token_feature_list),
                            Converter.toStringArray(weighted_token_feature_list),
                            Converter.toDoubleArray(weighted_token_feature_weight_list)));
                }

                if (tokens.isEmpty()) {
                    System.err.println("Warning: Found empty sentence!");
                }

                number_ += tokens.size();
                return new Sentence(tokens);
            }

            /**
             * Looks up a column of a row.
             *
             * @param index column index; negative means "not configured"
             * @param string name of the index, used in the error message
             * @param row the columns of the current line
             * @param check_zero whether a negative index is an error
             * @return the column value, or null if the index is negative and
             *         {@code check_zero} is false
             * @throws RuntimeException if the index is beyond the end of the
             *         row, or negative while {@code check_zero} is true
             */
            private String check_index(int index, String string,
                    List<String> row, boolean check_zero) {
                if ((index < 0 && check_zero) || index >= row.size()) {
                    throw new RuntimeException(String.format(
                            "%s out of range: %d : %s\n", string, index, row));
                }

                if (index < 0) {
                    return null;
                }

                return row.get(index);
            }

            /**
             * Returns false once the number of tokens read so far is strictly
             * greater than a non-negative limit from the options; a negative
             * limit disables the check. Otherwise reports whether the
             * underlying line iterator has more rows.
             */
            @Override
            public boolean hasNext() {
                int limit = options_.getLimit();
                if (limit >= 0 && number_ > limit) {
                    return false;
                }
                return line_iterator_.hasNext();
            }

            /**
             * Not supported.
             *
             * @throws UnsupportedOperationException always
             */
            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Returns the options this reader was created with.
     */
    public FileOptions getFileOptions() {
        return options_;
    }

}