/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.utilities;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * Base class for word normalisers that keep a cache of already normalised
 * words.
 * <br>
 * <br>
 * Subclasses supply the actual normalisation in
 * {@link #externalnormalise(String)}; this class takes care of lowercasing,
 * abbreviation handling, possessive removal, cache lookup and cache
 * persistence (text or binary).
 * <br>
 * <br>
 * The cache is a plain {@link HashMap} and no synchronisation is performed on
 * it in this class.
 * <br>
 * <br>
 * Serialization note: the custom serialized form contains only the cache. The
 * public option fields are not written to the stream, so they have to be set
 * again on an instance obtained through deserialization.
 */
public abstract class AbstractNormaliser implements Serializable {

    private static final long serialVersionUID = -7895782040241683680L;

    /** Character encoding used for both reading and writing the text cache file. */
    private static final String CACHE_FILE_ENCODING = "UTF-8";

    /** Buffer size (in characters) of the buffered reader/writer for the text cache file. */
    private static final int IO_BUFFER_SIZE = 1000000;

    /**
     * If true, the cache will grow when a new word is normalised. <br>
     * <br>
     * The default value is False.
     */
    public boolean growCache = false;

    /**
     * If false, abbreviations (tokens for which
     * <code>StringUtilities.isAbbr</code> returns true; described by the
     * original authors as tokens with a majority of uppercase letters) will be
     * normalised like any other word. If true, they are left as they are,
     * apart from the optional plural handling controlled by
     * {@link #convertPluralAbbreviations}.
     * <br>
     * <br>
     * The default value is True.
     */
    public boolean doNotNormaliseAbbreviations = true;

    /**
     * If true, for abbreviations that consist of all uppercase letters, except the last
     * letter which is an 's', the last letter is removed.<br>
     * For example: ADRs becomes ADR.
     * <br>
     * This option only has an effect while {@link #doNotNormaliseAbbreviations}
     * is true, because otherwise abbreviations take the regular normalisation
     * path.
     * <br>
     * <br>
     * The default value is True.
     */
    public boolean convertPluralAbbreviations = true;

    /** Maps a lowercased word (possessive suffix removed) to its normalised form. */
    protected Map<String, String> cache = new HashMap<String, String>();

    /**
     * Normalises a list of tokens.
     *
     * @param tokens
     *            The tokens to be normalised.
     * @return A new list holding the normalised form of each token, in the same
     *         order as the input.
     */
    public List<String> normalise(List<String> tokens) {
        List<String> result = new ArrayList<String>(tokens.size());
        for (String token : tokens) {
            result.add(normWord(token));
        }
        return result;
    }

    /**
     * Converts the input string to a string of normalised words in alphabetical
     * order.
     *
     * @param string
     *            The input string to be normalised.
     * @return The normalised words in sorted order, joined by single spaces; the
     *         empty string if the input is empty.
     */
    public String normalise(String string) {
        // Compare by content: '==' only matches the interned "" literal.
        if ("".equals(string)) {
            return "";
        }
        List<String> normwords = normalise(StringUtilities.mapToWords(string));
        Collections.sort(normwords);
        return StringUtilities.join(normwords, " ");
    }

    /**
     * Converts the input string to a string of normalised words in the original
     * order.
     *
     * @param string
     *            The input string to be normalised.
     * @return The normalised words in input order, joined by single spaces; the
     *         empty string if the input is empty.
     */
    public String normaliseInOrder(String string) {
        // Compare by content: '==' only matches the interned "" literal.
        if ("".equals(string)) {
            return "";
        }
        List<String> normwords = normalise(StringUtilities.mapToWords(string));
        return StringUtilities.join(normwords, " ");
    }

    /**
     * Save the cache as a text file with one <code>key=value</code> line per
     * entry, encoded as UTF-8 (the encoding {@link #loadCache(String)} reads).
     * Entries with a null value are not written, since a null value is treated
     * as "not cached" by {@link #normWord(String)}. I/O errors are reported on
     * standard error and otherwise ignored.
     *
     * @param filename
     *            The path and filename of the text file.
     */
    public void saveCache(String filename) {
        FileOutputStream file = null;
        BufferedWriter writer = null;
        try {
            file = new FileOutputStream(filename);
            writer = new BufferedWriter(new OutputStreamWriter(file, CACHE_FILE_ENCODING), IO_BUFFER_SIZE);
            for (Map.Entry<String, String> entry : cache.entrySet()) {
                if (entry.getValue() == null) {
                    continue;
                }
                writer.write(entry.getKey() + "=" + entry.getValue());
                writer.newLine();
            }
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the writer also closes the file; the second call is a no-op
            // unless the writer was never created.
            closeQuietly(writer);
            closeQuietly(file);
        }
    }

    /**
     * Save the cache as a binary file using Java object serialization. I/O
     * errors are reported on standard error and otherwise ignored.
     *
     * @param filename
     *            The path and filename of the binary file.
     */
    public void saveCacheBinary(String filename) {
        FileOutputStream file = null;
        ObjectOutputStream out = null;
        try {
            file = new FileOutputStream(filename);
            out = new ObjectOutputStream(file);
            out.writeObject(cache);
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(file);
        }
    }

    /**
     * Load the cache from a binary file written by
     * {@link #saveCacheBinary(String)}, replacing the current cache. I/O and
     * class-resolution errors are reported on standard error and leave the
     * current cache untouched.
     * <br>
     * <br>
     * NOTE: this uses Java native deserialization; only load files from a
     * trusted source.
     *
     * @param filename
     *            The path and filename of the binary file.
     */
    @SuppressWarnings("unchecked")
    public void loadCacheBinary(String filename) {
        FileInputStream file = null;
        ObjectInputStream in = null;
        try {
            file = new FileInputStream(filename);
            in = new ObjectInputStream(file);
            cache = (Map<String, String>) in.readObject();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
            closeQuietly(file);
        }
    }

    /**
     * Load the cache from a UTF-8 text file with one <code>key=value</code>
     * line per entry and add the entries to the current cache. Each line is
     * split at its first '='; lines without a '=' are skipped. I/O errors are
     * reported on standard error and otherwise ignored.
     *
     * @param filename
     *            The path and filename of the text file.
     */
    public void loadCache(String filename) {
        FileInputStream file = null;
        BufferedReader reader = null;
        try {
            file = new FileInputStream(filename);
            reader = new BufferedReader(new InputStreamReader(file, CACHE_FILE_ENCODING), IO_BUFFER_SIZE);
            String line;
            // readLine() returning null is the end-of-stream signal; ready() is not.
            while ((line = reader.readLine()) != null) {
                int separator = line.indexOf('=');
                if (separator < 0) {
                    continue; // malformed or blank line
                }
                cache.put(line.substring(0, separator), line.substring(separator + 1));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(file);
        }
    }

    /**
     * Normalise a single word.
     * <br>
     * <br>
     * Abbreviations are returned unchanged while
     * {@link #doNotNormaliseAbbreviations} is set, except that a trailing 's'
     * is dropped when {@link #convertPluralAbbreviations} is set and that 's'
     * is the only lowercase character. All other words are lowercased; words
     * containing a number are returned right after lowercasing; the remaining
     * words lose a trailing "'s" and are looked up in the cache, falling back to
     * {@link #externalnormalise(String)} on a miss.
     *
     * @param word
     *            The input word.
     * @return The normalised word.
     */
    public String normWord(String word) {
        if (doNotNormaliseAbbreviations && StringUtilities.isAbbr(word)) {
            boolean pluralAbbreviation = convertPluralAbbreviations
                    && word.length() > 1
                    && word.charAt(word.length() - 1) == 's'
                    && StringUtilities.countsCharactersInLowerCase(word) == 1;
            return pluralAbbreviation ? word.substring(0, word.length() - 1) : word;
        }

        // Locale-independent lowercasing, so cache keys do not depend on the
        // default locale of the JVM (e.g. the Turkish dotless i).
        String lowered = word.toLowerCase(java.util.Locale.ROOT);
        if (StringUtilities.containsNumber(lowered)) {
            return lowered;
        }

        String key = removePosessive(lowered);
        String cached = cache.get(key);
        if (cached != null) {
            return cached;
        }

        // Not in cache: delegate to the subclass.
        String normalised = externalnormalise(key);
        if (growCache) {
            cache.put(key, normalised);
        }
        return normalised;
    }

    /**
     * Normalises a single lowercased word that was not found in the cache.
     * Subclasses must implement this method.
     *
     * @param word
     *            The lowercased word, with a trailing "'s" already removed.
     * @return The normalised word.
     */
    protected abstract String externalnormalise(String word);

    /** Clear the cache by replacing it with a new, empty map. */
    public void clearCache() {
        cache = new HashMap<String, String>();
    }

    /**
     * Strips a trailing "'s" from a word of more than two characters.
     *
     * @param lcword
     *            The lowercased word.
     * @return The word without the possessive suffix, or the word itself if it
     *         has none.
     */
    private String removePosessive(String lcword) {
        if (lcword.length() > 2 && lcword.endsWith("'s")) {
            return lcword.substring(0, lcword.length() - 2);
        }
        return lcword;
    }

    /**
     * @return The number of entries currently in the cache.
     */
    public int getCacheSize() {
        return cache.size();
    }

    /**
     * Closes a resource, reporting (but not propagating) a failure to do so.
     *
     * @param resource
     *            The resource to close; may be null.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Custom serialized form: only the cache is written to the stream.
     */
    private void writeObject(ObjectOutputStream s) throws IOException {
        s.writeObject(cache);
    }

    /**
     * Counterpart of {@link #writeObject(ObjectOutputStream)}: reads the cache
     * from the stream.
     */
    @SuppressWarnings( { "unchecked"})
    private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
        cache = (Map<String, String>) s.readObject();
    }
}