// XwavesLabelfileReader.java example
// Source tree: marytts-master
/**
 * Copyright 2010 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.util.data.text;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.signalproc.analysis.Label;
import marytts.signalproc.analysis.Labels;

/**
 * A class to read and parse labels in a text file. The file format should conform to that used by ESPS Xwaves and the many other
 * labeling programs which support that format.
 * 
 * @author Ingmar Steiner
 */
public class XwavesLabelfileReader {
	/**
	 * Regular expression matching one label line. Legend:
	 *
	 * <pre>
	 * ^              start of line
	 * \\s*           leading whitespace
	 * (              start of first captured group (time)
	 *   \\d+         one or more digits
	 *   (?:\\.\\d+)? optionally followed by a period and one or more digits
	 * )              end of first captured group
	 * \\s+           whitespace
	 * .+?            second column, which is ignored (not captured)
	 * \\s+?          whitespace
	 * (.*)           second captured group (label)
	 * $              end of line
	 * </pre>
	 *
	 * Compiled once, since the pattern is the same for every file.
	 */
	private static final Pattern LINE_PATTERN = Pattern.compile("^\\s*(\\d+(?:\\.\\d+)?)\\s+.+?\\s+?(.*)$");

	// main class variables (reader, times, labels, header lines)
	protected BufferedReader reader;
	protected Double[] times;
	protected String[] labels;
	protected String[] header;

	/**
	 * Read data from a Label file.
	 * 
	 * @param filename
	 *            Label filename as a String
	 * @throws IOException
	 *             if the file cannot be opened or read, or if it contains a malformed line after the header
	 */
	public XwavesLabelfileReader(String filename) throws IOException {
		this(new FileReader(filename));
	}

	/**
	 * Read data from a Label file. The given reader is wrapped in a BufferedReader, consumed completely, and closed before this
	 * constructor returns, whether or not parsing succeeds.
	 * 
	 * @param reader
	 *            Label file as a Reader
	 * @throws IOException
	 *             if reading fails, or if the data contains a malformed line after the header
	 */
	public XwavesLabelfileReader(Reader reader) throws IOException {
		this.reader = new BufferedReader(reader);
		try {
			parseLabels();
		} finally {
			// closing the BufferedReader also closes the wrapped reader; doing it in finally avoids leaking
			// the underlying stream when parsing fails
			this.reader.close();
		}
	}

	/**
	 * Read lines from the label file and parse them. For each line that matches the label line pattern, the end time and the
	 * label are appended to {@link #times} and {@link #labels}. Non-matching lines seen before the first label line and before
	 * the first line starting with <code>#</code> are collected in {@link #header}; the <code>#</code> line itself ends the
	 * header and is not stored.
	 * 
	 * @throws IOException
	 *             if reading fails, or if a line that cannot be parsed is found after the header is complete
	 */
	private void parseLabels() throws IOException {
		List<Double> timesList = new ArrayList<Double>();
		List<String> labelsList = new ArrayList<String>();
		List<String> headersList = new ArrayList<String>();
		boolean headerComplete = false;
		// 1-based number of the line currently being processed, used for error reporting only
		int lineNumber = 0;

		String line;
		while ((line = reader.readLine()) != null) {
			lineNumber++;
			Matcher lineMatcher = LINE_PATTERN.matcher(line);

			if (lineMatcher.matches()) {
				// some label files might be headerless;
				// in that case, a well-formed line indicates that we are already seeing label data
				headerComplete = true;
				// first group is the label's end time; the pattern only admits digits with an optional
				// fractional part, so the conversion is not expected to fail
				timesList.add(Double.valueOf(lineMatcher.group(1)));
				// second group is the label itself
				labelsList.add(lineMatcher.group(2));
			} else if (!headerComplete) {
				if (line.trim().startsWith("#")) {
					// hash line signals end of header (but is not itself part of the header)
					headerComplete = true;
				} else {
					// no hash line seen so far, line seems to be part of header
					headersList.add(line);
				}
			} else {
				// header was already complete, or we are dealing with a headerless label file,
				// but we found a line that could not be parsed
				throw new IOException("Malformed line found outside of header (line " + lineNumber + "): " + line);
			}
		}

		// times and labels are always appended pairwise above
		assert timesList.size() == labelsList.size() : "number of times and labels differ";

		times = timesList.toArray(new Double[timesList.size()]);
		labels = labelsList.toArray(new String[labelsList.size()]);
		header = headersList.toArray(new String[headersList.size()]);
	}

	/**
	 * getter method for times
	 * 
	 * @return label end times as an array of Doubles, in file order (the internal array, not a copy)
	 */
	public Double[] getTimes() {
		return times;
	}

	/**
	 * getter method for labels
	 * 
	 * @return label strings as an array of Strings, in file order (the internal array, not a copy)
	 */
	public String[] getLabelSymbols() {
		return labels;
	}

	/**
	 * Combine the parsed times and label strings into a Labels object, creating one Label per parsed line from the time and
	 * label string at the same index.
	 * 
	 * @return a new Labels object built from all parsed label lines
	 */
	public Labels getLabels() {
		assert times.length == labels.length;
		Label[] items = new Label[labels.length];
		for (int i = 0; i < items.length; i++) {
			items[i] = new Label(times[i], labels[i]);
		}
		return new Labels(items);
	}

	/**
	 * getter method for header
	 * 
	 * @return header lines as an array of Strings, in file order (the internal array, not a copy)
	 */
	public String[] getHeader() {
		return header;
	}

}