package uk.ac.shef.dcs.jate.eval;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Loads gold-standard (GS) term lists from plain-text files for ATE evaluation.
 */
public class GSLoader {

    /**
     * Substrings that disqualify a raw GENIA entry. The parenthesised operators are
     * concept predicates rather than terms. "Blood cell receptor" is not included in
     * the content we use; it is actually a "second" title of article "MEDLINE:94077003".
     */
    private static final String[] GENIA_GS_STOP_WORDS = {
        "*", "(OR", "(NOT", "(TO", "(THAN", "(VERSUS", "(AND", "(BUT", "(AS", "(AND/OR",
        "Blood cell receptor"
    };

    /**
     * Loads the GENIA gold standard, dropping every entry that contains one of the
     * GENIA stop words (concept predicates and the like).
     *
     * @param file GENIA Goldstandard "concept.txt" file
     * @return a list of gs terms for the ATE evaluation, in file order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> loadGenia(String file) throws IOException {
        return filterByStopwords(readLines(file));
    }

    /**
     * Removes concept predicates (and other excluded entries) from the GENIA
     * 'concept' list. An entry is dropped when it contains any stop word as a substring.
     *
     * @param terms gs term list loaded directly from GENIA "concept.txt" file
     * @return a new list holding the surviving gs terms, in their original order
     */
    private static List<String> filterByStopwords(List<String> terms) {
        List<String> prunedGSTerms = new ArrayList<>();
        for (String rawGSTerm : terms) {
            if (!containsStopWord(rawGSTerm)) {
                prunedGSTerms.add(rawGSTerm);
            }
        }
        return prunedGSTerms;
    }

    /** Tells whether the raw GENIA entry contains any of the GENIA stop words. */
    private static boolean containsStopWord(String rawGSTerm) {
        for (String gsStopWord : GENIA_GS_STOP_WORDS) {
            if (rawGSTerm.contains(gsStopWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the ACL RD-TEC gold standard. The first line of the file is treated as a
     * header and skipped. Every other line is split on tabs:
     * <ul>
     * <li>a single-column line is taken as the term itself;</li>
     * <li>otherwise the term is the second column, and the row is skipped when the
     * third column equals "0" (presumably the "invalid term" flag - confirm against
     * the corpus documentation);</li>
     * <li>a row with a second column but no third column is kept, since nothing marks
     * it as invalid;</li>
     * <li>blank lines, lines consisting only of tabs, and rows whose term is empty
     * after trimming are skipped.</li>
     * </ul>
     * The number of loaded terms is printed to standard output.
     *
     * @param file ACL RD-TEC goldstandard term file (with invalid and valid terms)
     * @return list of goldstandard terms, trimmed, in file order
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> loadACLRD(String file) throws IOException {
        List<String> raw = readLines(file);
        // An empty file has no header line either; never ask for a negative capacity.
        List<String> terms = new ArrayList<>(Math.max(0, raw.size() - 1));
        boolean headerPending = true;
        for (String r : raw) {
            if (headerPending) {
                headerPending = false;
                continue;
            }
            // String.split drops trailing empty fields, so the array may have 0..n entries.
            String[] splits = r.split("\\t");
            if (splits.length == 0) {
                continue; // line made of tabs only
            }
            String term;
            if (splits.length == 1) {
                term = splits[0].trim();
            } else {
                if (splits.length > 2 && splits[2].equals("0")) {
                    continue; // flagged "0": not part of the gold standard
                }
                term = splits[1].trim();
            }
            if (term.isEmpty()) {
                continue; // blank line or empty term column
            }
            terms.add(term);
        }
        System.out.println("total="+terms.size());
        return terms;
    }

    /**
     * Reads all lines of a text file, without line terminators, using the platform
     * default charset.
     * NOTE(review): the corpus encoding is not stated anywhere in this class; pass an
     * explicit charset here once it is confirmed.
     *
     * @param file path of the file to read
     * @return the lines of the file, in order; empty for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readLines(String file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }
}