package com.cognitionis.nlp_files;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
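/**
 * Iterates over the n-grams of a {@link TokenizedFile} (one token per line,
 * blank line between sentences), padding each sentence start with "*" and
 * closing it with "STOP". A minimal usage sketch, assuming a token/tag file
 * (the TokenizedFile constructor shown here is hypothetical):
 *
 * <pre>
 * TokenizedFile tf = new TokenizedFile("corpus.tok"); // hypothetical constructor
 * for (List&lt;String[]&gt; ngram : new NgramHandler(tf, 3)) {
 *     for (String[] token : ngram) {
 *         System.out.print(token[0] + " "); // first column: the word form
 *     }
 *     System.out.println();
 * }
 * </pre>
 */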
public class NgramHandler implements Iterable<List<String[]>> {
private final TokenizedFile tokfile;
private final int ngram_size;
// which fields (columns) are included; 0==skip, 1==include (default: all)
private int[] included_fields;
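// e.g., included_fields == {1, 0, 1} keeps columns 0 and 2 of a 3-column file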
private final BufferedReader reader;
private final List<String[]> previous;
private final String[] pre_ngram, post_ngram;
public NgramHandler(TokenizedFile tf, int n) throws Exception {
this(tf, n, null);
}
// In the future, allow building n-grams of different cardinality (number
// of tab-separated columns) and also selecting which columns...
/**
* Build an n-gram handler/iterator.
*
* @param tf
* TokenizedFile with one word per line and optional features
* (columns)
* @param n
* n-gram size
* @param f
* included_fields (array of int of the same size as the number of
* columns, where 0==skip and 1==include). By default all columns
* are included.
* @throws Exception
*/
public NgramHandler(TokenizedFile tf, int n, int[] f) throws Exception {
if (n < 1) { // validate before opening the reader so it cannot leak
throw new Exception("Error: Ngram size n must be > 0");
}
tokfile = tf;
ngram_size = n;
reader = new BufferedReader(new FileReader(tf.getFile()));
previous = new ArrayList<>();
// Build pre and post ngrams to add them when needed------------
pre_ngram = new String[tf.getNumFields()];
post_ngram = new String[tf.getNumFields()];
for (int i = 0; i < tf.getLastDescColumn(); i++) {
pre_ngram[i] = post_ngram[i] = "_none_";
}
pre_ngram[tf.getLastDescColumn()] = "*";
post_ngram[tf.getLastDescColumn()] = "STOP";
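// e.g., with getNumFields()==3 and getLastDescColumn()==2:
// pre_ngram == {"_none_", "_none_", "*"} and post_ngram == {"_none_", "_none_", "STOP"}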
// --------------------------------------------------------------
if (f == null) {
included_fields = new int[tf.getNumFields()];
Arrays.fill(included_fields, 1); // include all columns by default
} else {
included_fields = f;
if (included_fields.length > tf.getNumFields()) {
throw new IOException("Selected numfields (" + included_fields.length
+ ") is greater than the number of fields in tok file ("
+ tf.getNumFields() + ")");
}
}
// In Python it is easier to concatenate generators/iterators; here it
// makes more sense to read the file word by word and add STOP when
// needed.
/*
 * (we need to read sentence by sentence and add the special tags "*" and
 * "STOP" in the last field, and "_none_" in the rest of the fields)
 */
}
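// Example with ngram_size == 2 over the sentence "a b": the iterator
// below yields [*, a], [a, b], [b, STOP] (each element is a String[] row).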
@Override
public Iterator<List<String[]>> iterator() {
return new Iterator<List<String[]>>() {
@Override
public boolean hasNext() {
try {
// Peek one character to detect EOF; even at EOF a final
// "STOP" n-gram is still pending while previous isn't empty
reader.mark(1);
if (reader.read() < 0 && previous.isEmpty()) {
return false;
}
reader.reset();
return true;
} catch (final IOException e) {
return false;
}
}
@Override
public List<String[]> next() {
final List<String[]> ngram = new ArrayList<>();
try {
String line = reader.readLine();
line = (line == null) ? "" : line.trim();
// Skip any leading blank lines
if (previous.isEmpty() && line.isEmpty()) {
while ((line = reader.readLine()) != null) {
line = line.trim();
if (!line.isEmpty()) {
break;
}
}
if (line == null) { // the file contained only blank lines
line = "";
}
}
// Skip intermediate blank lines and emit the last n-gram of
// the sentence, closed with STOP
if (line.isEmpty()) {
if (previous.isEmpty()) {
throw new Exception("Empty file: " + tokfile.getFile());
}
// Mark so reset() can rewind to the first token of the
// next sentence; 1000 chars is the read-ahead limit
reader.mark(1000);
while ((line = reader.readLine()) != null) {
line = line.trim();
if (!line.isEmpty()) {
reader.reset();
break;
}
}
// Keep only the last ngram_size-1 context tokens (none for
// unigrams, where previous holds a single in-sentence marker)
ngram.addAll(previous.subList(previous.size() - (ngram_size - 1), previous.size()));
ngram.add(post_ngram);
previous.clear();
} else {
if (previous.isEmpty()) { // first token of a sentence:
// pad with ngram_size-1 pre_ngrams ("*")
for (int i = 1; i < ngram_size; i++) {
previous.add(pre_ngram);
}
}
final String[] linearr = line
.split(tokfile.getFieldSeparatorRE());
// Emit the last ngram_size-1 context tokens plus the current one
ngram.addAll(previous.subList(previous.size() - (ngram_size - 1), previous.size()));
ngram.add(linearr);
// Slide the context window; for unigrams keep a single marker
// so the STOP branch above still fires at sentence end
if (ngram_size > 1) {
previous.remove(0);
previous.add(linearr);
} else if (previous.isEmpty()) {
previous.add(linearr);
}
}
return ngram;
} catch (final Exception e) {
if (System.getProperty("DEBUG") != null
&& System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
} else {
System.err.println("Errors found ("
+ this.getClass().getSimpleName() + "):\n\t"
+ e.toString() + "\n");
}
return null;
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
protected void finalize() throws Throwable {
try {
reader.close();
} finally {
super.finalize();
}
}
/**
* Get n-grams from a space-separated string.
*
* @param str space-separated input string
* @param n n-gram size (must be >= 1 and <= the number of words)
* @return list of space-joined n-grams (empty on invalid input)
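*
* <pre>
* getNgrams("the quick brown fox", 2)
* // returns ["the quick", "quick brown", "brown fox"]
* </pre>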
*/
public static List<String> getNgrams(String str, int n) {
final List<String> ngrams = new ArrayList<>();
if (n < 1) {
System.err.println("Error: Ngram size n must be > 0");
return ngrams;
}
if (n > str.split(" ").length) {
System.err.println("Error: Ngram size greater than input string length");
return ngrams;
}
final String[] words = str.split(" ");
for (int ngram_start = 0; ngram_start < words.length - n + 1; ngram_start++) {
final StringBuilder ngram = new StringBuilder();
final int ngram_end = ngram_start + n;
for (int ngram_word_index = ngram_start; ngram_word_index < ngram_end; ngram_word_index++) {
ngram.append(ngram_word_index > ngram_start ? " " : "")
.append(words[ngram_word_index]).toString();
}
ngrams.add(ngram.toString());
}
return ngrams;
}
/*
 * See the link in the Chrome browser for the two easy Java solutions...
 * normal (only good for small files), on-demand (iterator, generator,
 * lazy). Implement and document this in the docs.
 *
 * public Ngram get_ngram(int n){ what that implementation does is read
 * sentence by sentence; for each sentence it reads "token tag" pairs }
 *
 * ngram_iterator =
 * get_ngrams(get_sentence_lazy(get_token_tag_lazy(annotated_input)),
 * self.n)
 *
 * Then from the outside you can do: for ngram in ngram_iterator:
 *
 * n-gram is the number you have selected, e.g., 3 --> [[a,o]
 * [b,start-tag] [c,o]] then you can have an algorithm to do the
 * calculation of smaller n-grams...
 *
 * Then the counts are put into HashMaps of counters to later generate
 * the .model file.
*/
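// A minimal sketch of the counting step described above, assuming the
// iterator form of this class and joining on the first field only
// (kept as a comment; Map/HashMap would need java.util imports):
//
// Map<String, Integer> counts = new HashMap<>();
// for (List<String[]> ngram : new NgramHandler(tf, 2)) {
//     final StringBuilder key = new StringBuilder();
//     for (String[] token : ngram) {
//         if (key.length() > 0) key.append(' ');
//         key.append(token[0]);
//     }
//     counts.merge(key.toString(), 1, Integer::sum);
// }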
}