package syntaxLearner.corpus;
import syntaxLearner.corpus.source.*;
import syntaxLearner.UI.*;
import syntaxLearner.*;
import java.io.File;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Pattern;
/**
 * A named text corpus. Collects input sources, reads them sentence by sentence
 * and feeds every word, together with its left/right neighbour context, into a
 * {@link Vocabulary}. Typically, only one exists for any language.
 *
 * @author Omer Shapira
 */
public class Corpus implements Serializable{

    private static final long serialVersionUID = 864168299858422048L;

    /*
     * Sentence clean-up patterns, compiled once instead of once per sentence.
     * The expressions are kept exactly as they were when passed to
     * String.replaceAll; note that the reluctant quantifiers make each of them
     * consume the minimum number of characters per match.
     */
    /** A digit. */
    private static final Pattern DIGITS = Pattern.compile("\\d+?");
    /** A whitespace character followed by a non-word character. */
    private static final Pattern SPACE_THEN_NONWORD = Pattern.compile("(\\s+?\\W+?)");
    /** A non-word character followed by a whitespace character. */
    private static final Pattern NONWORD_THEN_SPACE = Pattern.compile("(\\W+?\\s+?)");
    /** Hyphens, slashes, square brackets and parentheses. */
    private static final Pattern SEPARATORS = Pattern.compile("[-/\\[\\]\\(\\)]");

    /* Properties */
    /** Name of this corpus; used as a prefix/tag in console messages. */
    public String name;
    /** Number of tokens indexed so far (absolute corpus size, with repetitions). */
    public long tokenCount=0;
    private Vocabulary vocab;
    private final Learner learner;

    /* Data Structures */
    /** Sources that were added and are waiting for the next {@link #buildDB()}. */
    private List<CorpusSource> sources;

    /* Parameters */
    /**
     * Word separator: a run of whitespace. Kept as an instance field so the
     * serialized form of the class does not change.
     */
    private Pattern DELIMITER=Pattern.compile("\\s+");

    /* Debug */
    //private boolean _DEBUG = false;

    /* Constructor */
    /**
     * Creates an empty corpus with a fresh vocabulary bound to it.
     *
     * @param name name of the corpus, used in console messages
     * @param l    the learner this corpus belongs to; returned by {@link #getLearner()}
     */
    public Corpus(String name, Learner l){
        this.name = name;
        this.learner = l;
        sources = new LinkedList<CorpusSource>();
        vocab = new Vocabulary(this);
    }

    /**
     * Queues a plain text file for the next {@link #buildDB()}.
     *
     * @param fileFullPath path handed to the {@code PlainTextFile} constructor
     */
    public void addPlainTextFile(String fileFullPath){
        PlainTextFile f = new PlainTextFile(fileFullPath);
        register(f, f.name);
    }

    /**
     * Queues a plain text file for the next {@link #buildDB()}.
     *
     * @param file file handed to the {@code PlainTextFile} constructor
     */
    public void addPlainTextFile(File file){
        PlainTextFile f = new PlainTextFile(file);
        register(f, f.name);
    }

    /**
     * Queues a wiki dump for the next {@link #buildDB()}.
     *
     * @param file file handed to the {@code WikiDump} constructor
     */
    public void addWikiDump(File file){
        WikiDump f = new WikiDump(file);
        register(f, f.name);
    }

    /** Appends a source to the pending list and reports the inclusion on the console. */
    private void register(CorpusSource source, String sourceName){
        sources.add(source);
        Console.line(name+" : Included "+sourceName);
    }

    /**
     * To be called rarely. Builds the n-gram database and vocabulary.
     * <p>
     * Every pending source that has not been read yet is opened and consumed
     * sentence by sentence. A source that cannot be opened is reported as an
     * error and skipped. Each source is closed even if reading it fails with a
     * runtime exception, then marked as read; finally the pending list is
     * emptied.
     */
    public void buildDB(){
        Console.line("Building Database\n**********************\n");
        for (CorpusSource source : sources){
            if (!source.wasRead()){
                Console.text(".");
                try {
                    if (source.open()){
                        while (source.hasNext()){
                            indexSentence(source.readSentence());
                        }
                    } else {
                        Console.line("");
                        Console.error("Can't open "+source.name, name);
                    }
                } finally {
                    // Release the source even when reading throws half way through.
                    source.close();
                }
            }
            source.markAsRead();
        }
        Console.line("Done building database\n");
        sources.clear();
    }

    /**
     * Cleans one raw sentence, adds its words to the vocabulary and records the
     * (previous word, next word) context of every token.
     */
    private void indexSentence(String rawSentence){
        //sentence = sentence.replaceAll("[\\W\\d]+", " ").trim(); //^a-zA-Z
        String sentence = DIGITS.matcher(rawSentence).replaceAll(" ");
        sentence = SPACE_THEN_NONWORD.matcher(sentence).replaceAll(" ");
        sentence = NONWORD_THEN_SPACE.matcher(sentence).replaceAll(" ");
        sentence = SEPARATORS.matcher(sentence).replaceAll(" ");
        if (sentence.length()==0){
            return;
        }
        String[] words = toWords(sentence);
        // First pass: register every word, so that the index of a *following*
        // word can already be looked up while the contexts are built below.
        for (String word : words){
            vocab.add(word);
        }
        int last = words.length-1;
        for (int i=0;i<words.length;i++) {
            // Neighbours of the token; the sentence borders use the "$START" and
            // "$END" markers (presumably pre-registered by Vocabulary - confirm).
            VocabularyContext cont = new VocabularyContext(
                    vocab.getIndex(i==0 ? "$START" : words[i-1]),
                    vocab.getIndex(i==last ? "$END" : words[i+1])
                    );
            //add it to the word
            vocab.getWord(vocab.getIndex(words[i])).addContext(cont);
            //add 1 to the token count (absolute corpus size, with repetitions)
            tokenCount++;
        }
    }

    /**
     * Splits a cleaned sentence into lower-cased words.
     * <p>
     * Runs in a single pass. Tokens are trimmed <em>before</em> the emptiness
     * check, so a token that consists only of characters removed by
     * {@link String#trim()} never reaches the vocabulary as an empty word.
     *
     * @param sentence the sentence to split
     * @return the words in their original order; never contains an empty string
     */
    private String[] toWords(String sentence) {
        List<String> kept = new LinkedList<String>();
        for (String piece : DELIMITER.split(sentence)){
            String word = piece.trim().toLowerCase();
            if (word.length()!=0){
                kept.add(word);
            }
            //TODO Complete
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * @return the vocabulary owned by this corpus
     */
    public Vocabulary getVocabulary(){
        return vocab;
    }

    /**
     * Prints every recorded context of a word as
     * {@code <left> <word> <right> : <count>}, one per console line.
     * <p>
     * NOTE(review): what happens for a word that is not in the vocabulary
     * depends on {@code Vocabulary.getIndex}/{@code getWord}, which are not
     * visible here - confirm before calling with arbitrary input.
     *
     * @param s the word whose contexts are printed
     */
    public void nGramPrinter(String s){
        Word w = vocab.getWord(vocab.getIndex(s));
        for (Entry<VocabularyContext, Integer> e : w.vocabContexts.entrySet()){
            Context c = e.getKey();
            Console.line(vocab.getWord(c.type1).name+" "+
                    s+" "+vocab.getWord(c.type2).name+" : "+e.getValue());
        }
    }

    /**
     * @return the learner passed to the constructor
     */
    public Learner getLearner(){
        return learner;
    }
}