/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.peregrine;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.erasmusmc.math.CRC32;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
import org.erasmusmc.utilities.AbstractNormaliser;
//import org.erasmusmc.utilities.LVGNormaliser;
import org.erasmusmc.utilities.StringUtilities;
/** Identifies words (and, optionally, longer n-grams) in the text.
 * Each detected term is given an identifier: the CRC32 of its (optionally normalised or
 * lowercased) string is used as the concept id.*/
public class WordPeregrine extends AbstractPeregrine{
    /** If true, words are first normalised before the CRC32 is calculated.
     * <br><br>The default value is True.*/
    public boolean normaliseWords = true;
    /** If true, words are first converted to lowercase before the CRC32 is calculated.
     * Only consulted when {@link #normaliseWords} is false.
     * <br><br>The default value is True.*/
    public boolean lowercaseWords = true;
    /** Construct an ontology during indexation.
     * This ontology can be used to translate the CRC32 codes back to words.
     * <br><br>The default value is False.*/
    public boolean buildOntology = false;
    /** If true, Peregrine will keep an internal list of created conceptIDs and will not check the ontology whether
     * a concept already exists. This will speed up the process.
     * <br><br>The default value is False.*/
    public boolean assumeEmptyOntology = false;
    /**
     * If true, Peregrine will always insert the concept into the ontology, and will not check whether it exists.
     * <br><br>The default value is False.
     */
    public boolean alwaysInsertIntoOntology = false;
    /** Defines which n-grams will be returned by Peregrine. Values smaller than 1 are ignored.
     * <br><br>The default value is {1}, meaning that only single words are detected */
    public int[] ngrams = {1};
    /** For n-grams with n greater than 1, breakChars defines the characters that a n-gram cannot cross.
     * This serves to limit the number of different (spurious) n-grams.
     * <br><br>The default value is the set {@code , ; ( ) : ' "}*/
    public Set<Character> breakChars = new TreeSet<Character>();

    /** CRC32 implementation used to turn a term string into a concept id. */
    protected CRC32 crc32= new CRC32();

    /** The text handed to the most recent {@link #index(String)} call ("" when that argument was null). */
    private String text;
    /** Concept id to result concept, rebuilt on every {@link #index(String)} call. */
    private Map<Integer, ResultConcept> id2concept;
    /** Concept id to the term string it was computed from, rebuilt on every {@link #index(String)} call. */
    private Map<Integer, String> id2string;
    //private SortedIntListSet conceptIDs = new SortedIntListSet(20000000);
    /** Concept ids already handed to the ontology; only used when {@link #assumeEmptyOntology} is true. */
    private Set<Integer> conceptIDs = new HashSet<Integer>(20000000);

    /** Creates a WordPeregrine using an {@code LVGNormaliser}, an {@code SBDtokenizer}
     * and the default break characters. */
    public WordPeregrine() {
        normaliser = new LVGNormaliser();
        tokenizer = new SBDtokenizer();
        addDefaultBreakChars();
    }

    /** Creates a WordPeregrine using the given normaliser and the default break characters.
     * If no tokenizer has been set at this point, an {@code SBDtokenizer} is installed so that
     * {@link #index(String)} can be called directly.
     * @param normaliser the normaliser to use */
    public WordPeregrine(AbstractNormaliser normaliser){
        this.normaliser = normaliser;
        // Keep both constructors consistent: previously this one left the tokenizer
        // untouched and the break-character set empty, contrary to the documented default.
        if (tokenizer == null)
            tokenizer = new SBDtokenizer();
        addDefaultBreakChars();
    }

    /** Fills {@link #breakChars} with the documented default characters. */
    private void addDefaultBreakChars() {
        breakChars.add(',');
        breakChars.add(';');
        breakChars.add('(');
        breakChars.add(')');
        breakChars.add(':');
        breakChars.add('\'');
        breakChars.add('"');
    }

    /** Nothing to release for this implementation. */
    public void release() {
    }

    /** Tokenizes the given string and records every configured n-gram as a result term and concept.
     * N-grams do not extend over a stopword, and do not start before a token that is preceded by a
     * break character or that is listed in the tokenizer's {@code endOfSentence} collection.
     * When {@link #buildOntology} is true, the found concepts are added to the ontology afterwards.
     * @param string the text to index; null is treated as the empty string */
    public void index(String string) {
        initializeIndex(string);
        resultConcepts.clear();
        id2concept = new TreeMap<Integer, ResultConcept>();
        id2string = new TreeMap<Integer, String>();

        // Index of the first token that an n-gram ending at the current token may include.
        int start = 0;
        for (int i = 0; i < tokenizer.tokens.size(); i++){
            String token = tokenizer.tokens.get(i);
            if (token.equals(""))
                continue;
            if (precededByBreakChar(i) || tokenizer.endOfSentence.contains(i))
                start = i;
            //if ((!StringUtilities.isAbbr(token) || token.length() == 1) && stopwords.contains(token.toLowerCase())) {
            if (stopwords.contains(StringUtilities.firstLetterToLowerCase(token))) {
                // A stopword is never part of a term: the next n-gram can start after it at the earliest.
                start = i+1;
                continue;
            }
            for (int n : ngrams){
                // n == 0 would yield an empty term and n < 0 an inverted token range.
                if (n < 1)
                    continue;
                int firstWord = i-n+1;
                if (firstWord >= start)
                    addTerm(firstWord, i);
            }
        }
        if (buildOntology) addNewWordsToOntology();
    }

    /** Checks whether a break character occurs in the run of non-alphanumeric characters
     * directly in front of the given token.
     * @param tokenpos index of the token in the tokenizer's token list
     * @return true if one of {@link #breakChars} is found before a letter or digit is reached */
    private boolean precededByBreakChar(int tokenpos) {
        int pos = tokenizer.startpositions.get(tokenpos)-1;
        // Scan backwards down to and including index 0, so a break character that is the
        // very first character of the text is not overlooked.
        while (pos >= 0 && !Character.isLetterOrDigit(text.charAt(pos))){
            if (breakChars.contains(text.charAt(pos)))
                return true;
            pos--;
        }
        return false;
    }

    /** Registers the tokens {@code start..end} (inclusive) as one term.
     * The tokens are joined with single spaces; the CRC32 of that string is the concept id.
     * @param start index of the first token of the term
     * @param end index of the last token of the term */
    private void addTerm(int start, int end) {
        String string = StringUtilities.join(tokenizer.tokens.subList(start, end+1), " ");
        int hashcode = crc32.crc32(string);

        ResultTerm resultTerm = new ResultTerm();
        resultTerm.words = new int[end-start+1];
        for (int i = start; i <= end; i++){
            resultTerm.words[i-start] = i;
        }

        // One ResultConcept per distinct hashcode within a single index() call.
        ResultConcept resultconcept = id2concept.get(hashcode);
        if (resultconcept == null) {
            resultconcept = new ResultConcept();
            resultconcept.conceptId = hashcode;
            id2concept.put(hashcode, resultconcept);
            resultConcepts.add(resultconcept);
        }
        resultconcept.terms.add(resultTerm);
        resultTerms.add(resultTerm);
        id2string.put(hashcode, string);
    }

    /** Adds a concept to the ontology for every result concept of the last index() call,
     * named after the string its id was computed from. Whether an existing concept is
     * overwritten depends on {@link #alwaysInsertIntoOntology} and {@link #assumeEmptyOntology}. */
    private void addNewWordsToOntology() {
        for (ResultConcept resultConcept : resultConcepts){
            boolean addConcept;
            if (alwaysInsertIntoOntology) {
                addConcept = true;
            } else if (assumeEmptyOntology) {
                // Set.add returns true only when the id was not yet present.
                addConcept = conceptIDs.add(resultConcept.conceptId);
            } else {
                //do not assume an empty ontology, so check whether concept already in ontology:
                addConcept = (ontology.getConcept(resultConcept.conceptId) == null);
            }
            if (addConcept){
                Concept concept = new Concept(resultConcept.conceptId);
                concept.setName(id2string.get(resultConcept.conceptId));
                ontology.setConcept(concept);
            }
        }
    }

    /** Resets the result terms, tokenizes the text and normalises or lowercases the tokens
     * according to {@link #normaliseWords} and {@link #lowercaseWords}.
     * @param string the text to index; null is treated as the empty string */
    private void initializeIndex(String string){
        resultTerms.clear();
        if (string == null)
            string = "";
        // Store the null-safe value: precededByBreakChar reads characters from this field.
        text = string;
        tokenizer.tokenize(string);
        if (normaliseWords)
            tokenizer.tokens = normalise(tokenizer.tokens);
        else if (lowercaseWords)
            tokenizer.tokens = toLowercase(tokenizer.tokens);
    }
}