// This file is part of Penn TotalRecall <http://memory.psych.upenn.edu/TotalRecall>. // // TotalRecall is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3 only. // // TotalRecall is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with TotalRecall. If not, see <http://www.gnu.org/licenses/>. package components.wordpool; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Parser for the wordpool files. * * Parses both wordpool documents and the narrower word list documents for one audio file (called LST files in PyParse). * Words must contain letters or the parser will skip them. * The word consists of the entire line it is found on. * Indexes are relative to the list of words considered words by this parser, not line numbers or any other standard. * * @author Yuvi Masory * */ public class WordpoolFileParser { /** * Private constructor to prevent instantiation. */ private WordpoolFileParser() { } /** * Parses the wordpool file, traversing it line by line. * * @param file The file to be parsed * @return A List containing the WordpoolWords in the same order they appear in the list * @throws IOException In the event of i/o problems while reading the File */ public static List<WordpoolWord> parse(File file, boolean suppressLineNumbers) throws IOException { BufferedReader br; br = new BufferedReader(new FileReader(file)); ArrayList<WordpoolWord> words = new ArrayList<WordpoolWord>(); String line; int lineNum = 0; line = br.readLine(); while(line != null) { lineNum++; //PyParse goes 1-indexes the wordpool words and goes by line num not word num Matcher whiteSpace = Pattern.compile("\\s*").matcher(line); if(whiteSpace.matches()) { System.err.println("line #" + lineNum + " not a valid wordpool word: " + line); } else { line = line.trim(); if(suppressLineNumbers == false) { words.add(new WordpoolWord(line, lineNum)); } else { words.add(new WordpoolWord(line, -1)); } } line = br.readLine(); } return words; } }