/*
* This file is part of ELKI:
* Environment for Developing KDD-Applications Supported by Index-Structures
*
* Copyright (C) 2017
* ELKI Development Team
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.lmu.ifi.dbs.elki.datasource.parser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.LabelList;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
/**
 * Parser that loads a text file for use with string similarity measures.
 *
 * Every non-empty, non-comment line of the input becomes one object. The
 * parser produces two relations: the first of type String, the second of type
 * label list, which contains the same data for convenience.
 *
 * @author Felix Stahlberg
 * @author Erich Schubert
 * @since 0.6.0
 */
@Title("String Parser")
@Description("Parses new line separated strings")
public class StringParser implements Parser {
  /**
   * Matcher for comment lines, reused for every line; {@code null} when no
   * comment pattern was given, in which case no line is treated as a comment.
   */
  Matcher comment;

  /**
   * Flag to trim whitespace.
   */
  boolean trimWhitespace;

  /**
   * Constructor.
   *
   * @param comment Pattern for comments, may be {@code null} to disable
   *        comment detection.
   * @param trimWhitespace Trim leading and trailing whitespace.
   */
  public StringParser(Pattern comment, boolean trimWhitespace) {
    super();
    this.comment = (comment != null) ? comment.matcher("") : null;
    this.trimWhitespace = trimWhitespace;
  }

  /**
   * Read the stream line by line, and build a bundle with one string (and one
   * single-element label list holding the same string) per retained line.
   *
   * Lines of length zero and lines that match the comment pattern as a whole
   * are skipped. Both checks are applied to the raw line, i.e., before any
   * trimming takes place.
   *
   * @param in Input stream to read from; it is not closed by this method.
   * @return Bundle with a string column and a label list column.
   * @throws IllegalArgumentException when reading from the stream fails; the
   *         underlying {@link IOException} is attached as cause.
   */
  @Override
  public MultipleObjectsBundle parse(InputStream in) {
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    int lineNumber = 0;
    List<String> data = new ArrayList<>();
    List<LabelList> labels = new ArrayList<>();
    // Scratch list, refilled for every line and handed to LabelList.make.
    ArrayList<String> ll = new ArrayList<>(1);
    try {
      for(String line; (line = reader.readLine()) != null; lineNumber++) {
        // Skip empty lines and comments
        if(line.length() <= 0 || (comment != null && comment.reset(line).matches())) {
          continue;
        }
        final String val = trimWhitespace ? line.trim() : line;
        data.add(val);
        ll.clear();
        ll.add(val);
        labels.add(LabelList.make(ll));
      }
    }
    catch(IOException e) {
      // Keep the original failure as cause, so it is not lost to the caller.
      throw new IllegalArgumentException("Error while parsing line " + lineNumber + ".", e);
    }
    return MultipleObjectsBundle.makeSimple(TypeUtil.STRING, data, TypeUtil.LABELLIST, labels);
  }

  /**
   * Reset the comment matcher to an empty input, so that it does not keep
   * referencing the last line that was parsed. Does nothing when no comment
   * pattern was configured.
   */
  @Override
  public void cleanup() {
    // The matcher is null when the parser was built without comment pattern.
    if(comment != null) {
      comment.reset("");
    }
  }

  /**
   * Parameterization class.
   *
   * Reads the comment pattern and the trim flag from the configuration, and
   * builds the parser from them.
   *
   * @author Felix Stahlberg
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Flag to trim whitespace.
     */
    public static final OptionID TRIM_ID = new OptionID("string.trim", "Remove leading and trailing whitespace from each line.");

    /**
     * Comment pattern; stays {@code null} if the parameter could not be
     * grabbed from the configuration.
     */
    Pattern comment = null;

    /**
     * Flag to trim whitespace.
     */
    boolean trimWhitespace = false;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      // Comment pattern; the second argument is the parameter's default value.
      PatternParameter commentP = new PatternParameter(CSVReaderFormat.Parameterizer.COMMENT_ID, "^\\s*#.*$");
      if(config.grab(commentP)) {
        comment = commentP.getValue();
      }
      Flag trimP = new Flag(TRIM_ID);
      if(config.grab(trimP)) {
        trimWhitespace = trimP.isTrue();
      }
    }

    @Override
    protected StringParser makeInstance() {
      return new StringParser(comment, trimWhitespace);
    }
  }
}