package com.cognitionis.nlp_files;
import java.io.*;
import java.util.regex.Pattern;
/**
* A TokenizedPerSentenceFile consists of space-separated tokens, with one sentence per line, e.g.
* word1 word2, word1/tag word2/tag, or word1|<tag> ...
*
* @author Héctor Llorens
* @since 2011
*/
public class TokenizedPerSentenceFile extends NLPFile {

    /**
     * Literal string that separates a word from its tag inside a token
     * (e.g. "/" or "|"); {@code null} means the file is untagged.
     */
    private final String tagSeparator;

    /**
     * Creates an untagged tokenized file (no word/tag separator).
     *
     * @param filename name of the file, passed to the {@code NLPFile} constructor
     */
    public TokenizedPerSentenceFile(String filename) {
        this(filename, null); // null --> untagged
    }

    /**
     * Creates a tokenized file whose tokens have the form word + separator + tag.
     *
     * @param filename  name of the file, passed to the {@code NLPFile} constructor
     * @param separator literal separator between word and tag, or {@code null} for an untagged file
     */
    public TokenizedPerSentenceFile(String filename, String separator) {
        super(filename);
        isWellFormatted = false;
        tagSeparator = separator;
    }

    /**
     * Checks the format of the file and stores the result in the {@code isWellFormatted} field.
     *
     * <p>When a tag separator was given, every whitespace-separated token of every non-empty
     * (trimmed) line must be a non-empty word, the separator exactly once, and a non-empty tag.
     * The separator is compared literally, so regex metacharacters such as "|" are safe.
     * When no separator was given, the file only has to be readable.
     *
     * <p>Any problem (no file loaded, I/O error, badly formatted line) is reported on
     * {@code System.err}. If the system property {@code DEBUG} equals "true" (ignoring case),
     * the stack trace is printed and the JVM exits with status 1 instead of returning.
     *
     * @return {@code true} if no problem was found, {@code false} otherwise
     */
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            try (BufferedReader reader = new BufferedReader(new FileReader(this.f))) {
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    linen++;
                    // Blank lines are always accepted; untagged files need no per-token check.
                    if (tagSeparator != null && line.length() != 0 && !allTokensTagged(line, tagSeparator)) {
                        throw new Exception("Line " + linen + " ("+line+"): Does not have the format word"+tagSeparator+"tag in all the words");
                    }
                }
            }
        } catch (Exception e) {
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }

    /**
     * Not implemented for this file type.
     *
     * @param filename unused
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public String toPlain(String filename) {
        throw new UnsupportedOperationException("This will consist just to remove the tokens"); //To change body of generated methods, choose Tools | Templates.
    }

    /** Returns true when every whitespace-separated token of the (already trimmed, non-empty) line is word+separator+tag. */
    private static boolean allTokensTagged(String line, String separator) {
        for (String token : line.split("\\s+")) {
            if (!isTaggedToken(token, separator)) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when the token contains the literal separator exactly once, with non-empty text on both sides. */
    private static boolean isTaggedToken(String token, String separator) {
        int sepStart = token.indexOf(separator);
        if (sepStart <= 0) {
            return false; // separator missing, or the word part is empty
        }
        int tagStart = sepStart + separator.length();
        if (tagStart >= token.length()) {
            return false; // the tag part is empty
        }
        // A second separator would make the word/tag boundary ambiguous.
        return token.indexOf(separator, tagStart) < 0;
    }
}