package de.berlin.hu.uima.ae.feature; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.u_compare.shared.semantic.NamedEntity; import org.u_compare.shared.syntactic.Sentence; import org.u_compare.shared.syntactic.Token; import org.uimafit.util.JCasUtil; import de.berlin.hu.chemspot.Mention; import de.berlin.hu.util.Constants; public class FeatureTokenGenerator { public enum Feature_Phase { PHASE1, // after all tagger components ran PHASE2, // after match expansion PHASE3, // after stopword filtering PHASE4 // after normalization }; public enum ChemSpot_Feature { CRF, DICTIONARY, SUM_TAGGER, ABBREV, CHEMSPOT, MATCH_EXPANSION, CRF_ME, DICTIONARY_ME, SUM_TAGGER_ME, ABBREV_ME, STOPWORD, CHID, CHEB, CAS, PUBC, PUBS, INCH, DRUG, HMBD, KEGG, KEGD, MESH, CHEB_MIN_DEPTH, CHEB_AVG_DEPTH, CHEB_MAX_DEPTH, CHEB_CHILDREN, CHEMICAL_PREFIX, CHEMICAL_SUFFIX; }; private Map<Integer, List<FeatureToken>> tokens = null; private Map<String, Integer> chebiMinDepth = null; private Map<String, Integer> chebiAvgDepth = null; private Map<String, Integer> chebiMaxDepth = null; private Map<String, Integer> nrChildNodes = null; private List<String> prefixes = null; private List<String> suffixes = null; private Map<List<String>, String> phareData = null; private Map<String, String> whoAtcList = null; private void loadChebiData(String file) throws IOException { chebiMinDepth = new HashMap<String, Integer>(); chebiAvgDepth = new HashMap<String, Integer>(); chebiMaxDepth = new HashMap<String, Integer>(); nrChildNodes = new HashMap<String, Integer>(); System.out.print("Loading chebi data from resource " + file + "... "); BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(file))); String line = null; reader.readLine(); while ((line = reader.readLine()) != null) { String[] chebi = line.split("\t"); String chebiId = chebi [0]; int children = Integer.valueOf(chebi[1]); String[] depths = chebi[2].split(","); int minDepth = Integer.MAX_VALUE; int avgDepth = 0; int maxDepth = 0; for (String depthString : depths) { int depth = Integer.valueOf(depthString.trim()); minDepth = depth < minDepth ? depth : minDepth; maxDepth = depth > maxDepth ? depth : maxDepth; avgDepth += depth; } avgDepth = Math.round((float)avgDepth / (float)depths.length); chebiMinDepth.put(chebiId, minDepth); chebiAvgDepth.put(chebiId, avgDepth); chebiMaxDepth.put(chebiId, maxDepth); nrChildNodes .put(chebiId, children); } System.out.println("Done."); reader.close(); } private void loadPrefixesSuffixes(String path) throws IOException { System.out.print("Loading prefixes and suffixes from resource directory " + path + "... "); prefixes = new ArrayList<String>(); suffixes = new ArrayList<String>(); BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(path + "prefixes.txt"))); String line = null; while ((line = reader.readLine()) != null) { prefixes.add(line.split("\t")[0]); } reader.close(); reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(path + "suffixes-filtered.txt"))); while ((line = reader.readLine()) != null) { suffixes.add(line.split("\t")[0]); } reader.close(); System.out.println("Done."); } private void loadPhareData(String file) throws IOException { System.out.print("Loading pharmagenomics relationship ontology data from resource " + file + "... "); phareData = new HashMap<List<String>, String>(); BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(file))); String line = null; while ((line = reader.readLine()) != null) { String[] phare = line.split("\t"); String label = phare[0]; List<String> terms = new ArrayList<String>(); for (String term : phare[1].split("\\|")) { terms.add(term); } phareData.put(terms, label); } reader.close(); System.out.println("Done."); } private void loadWHOATCData(String path) throws IOException { System.out.print("Loading WHO ATC list from resource directory " + path + "... "); whoAtcList = new HashMap<String, String>(); BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream(path))); String line = null; String identifier = null; int i = 0; while ((line = reader.readLine()) != null) { String term = null; line = line.trim(); if (line.contains(" - ")) { String[] split = line.split(" - "); identifier = split[0].trim(); term = split[1].trim(); i = 0; } else { term = line; i++; } String mapIdentifier = identifier + (i > 0 ? "/" + i : ""); whoAtcList.put(mapIdentifier, term); if (term.contains(",")) { whoAtcList.put(mapIdentifier + "/C", term.replaceAll(",.*", "")); } } reader.close(); System.out.println("Done."); } public FeatureTokenGenerator() { System.out.println(); System.out.println("Initializing feature generator."); tokens = new HashMap<Integer, List<FeatureToken>>(); if (chebiMinDepth == null) { try { loadChebiData("/resources/chebi/chebi_ontology_fulldepth.txt"); } catch (IOException e) { System.out.println("Error while loading chebi data"); e.printStackTrace(); } } if (prefixes == null) { try { loadPrefixesSuffixes("/resources/"); } catch (IOException e) { System.out.println("Error while loading prefixes and suffixes"); e.printStackTrace(); } } if (phareData == null) { try { loadPhareData("/resources/phare.txt"); } catch (IOException e) { System.out.println("Error while loading pharmagenomics relationship ontology data"); e.printStackTrace(); } } if (whoAtcList == null) { try { loadWHOATCData("/resources/who/WHO-ATC.txt"); } catch (IOException e) { System.out.println("Error while loading WHO ATC list"); e.printStackTrace(); } } System.out.println("Feature generator initialized."); System.out.println(); } public void process(JCas aJCas, Feature_Phase phase) throws AnalysisEngineProcessException { switch (phase) { case PHASE1: tokens.put(aJCas.getDocumentText().hashCode(), new ArrayList<FeatureToken>()); generateFeatureTokens(aJCas); checkNormalization(aJCas); break; case PHASE2: checkExpandedMentions(aJCas); break; case PHASE3: checkStopwords(aJCas); break; case PHASE4: checkNormalization(aJCas); checkPrefixesSuffixes(aJCas); checkPhareData(aJCas); checkWHOATC(aJCas); //printFeatureTokens(aJCas); break; } } public void clearFeatureTokens() { tokens.clear(); } private void generateFeatureTokens(JCas aJCas) { List<FeatureToken> tokens = getFeatureTokens(aJCas); for (Token token : JCasUtil.iterate(aJCas, Token.class)) { FeatureToken ft = new FeatureToken(aJCas, token.getBegin(), token.getEnd()); tokens.add(ft); } for (NamedEntity ne : JCasUtil.iterate(aJCas, NamedEntity.class)) { if (Constants.GOLDSTANDARD.equals(ne.getSource())) continue; for (FeatureToken token : getFeatureTokens(aJCas, ne)) { try { token.addFeature(ChemSpot_Feature.valueOf(ne.getSource().toUpperCase())); } catch (IllegalArgumentException e) { // do nothing } } } } public List<FeatureToken> getFeatureTokens(JCas aJCas) { return tokens.get(aJCas.getDocumentText().hashCode()); } public List<FeatureToken> removeFeatureTokens(JCas aJCas) { return tokens.remove(aJCas.getDocumentText().hashCode()); } public List<FeatureToken> getFeatureTokens(JCas aJCas, Annotation container) { List<FeatureToken> result = new ArrayList<FeatureToken>(); for (FeatureToken token : getFeatureTokens(aJCas)) { if (token.getBegin() > container.getEnd()) break; if (token.getBegin() >= container.getBegin() && token.getEnd() <= container.getEnd()) { result.add(token); } } return result; } private void checkExpandedMentions(JCas aJCas) { for (NamedEntity ne : JCasUtil.iterate(aJCas, NamedEntity.class)) { if (Constants.GOLDSTANDARD.equals(ne.getSource())) continue; boolean was_expanded = false; List<FeatureToken> tokens = getFeatureTokens(aJCas, ne); for (FeatureToken token : tokens) { try { ChemSpot_Feature feature = ChemSpot_Feature.valueOf(ne.getSource().toUpperCase()); if (!token.hasFeature(feature)) { token.addFeature(ChemSpot_Feature.valueOf(feature + "_ME")); token.addFeature(ChemSpot_Feature.MATCH_EXPANSION); was_expanded = true; } } catch (IllegalArgumentException e) { // do nothing } } if (was_expanded) { for (FeatureToken token : tokens) { try { ChemSpot_Feature feature = ChemSpot_Feature.valueOf(ne.getSource().toUpperCase()); if (!token.hasFeature(feature + "_ME")) { token.addFeature(ChemSpot_Feature.valueOf(feature + "_ME")); token.addFeature(ChemSpot_Feature.MATCH_EXPANSION); } } catch (IllegalArgumentException e) { // do nothing } } } } } private void checkStopwords(JCas aJCas) { List<FeatureToken> tokens = new ArrayList<FeatureToken>(getFeatureTokens(aJCas)); for (NamedEntity ne : JCasUtil.iterate(aJCas, NamedEntity.class)) { if (Constants.GOLDSTANDARD.equals(ne.getSource())) continue; for (FeatureToken token : getFeatureTokens(aJCas, ne)) { tokens.remove(token); } } for (FeatureToken token : tokens) { if (!token.getFeatures().isEmpty()) { token.addFeature(ChemSpot_Feature.STOPWORD); } } } private void checkNormalization(JCas aJCas) { for (NamedEntity ne : JCasUtil.iterate(aJCas, NamedEntity.class)) { if (Constants.GOLDSTANDARD.equals(ne.getSource())) continue; Mention mention = new Mention(ne); String[] ids = mention.getIds(); for (FeatureToken token : getFeatureTokens(aJCas, ne)) { token.addFeature(ChemSpot_Feature.CHEMSPOT); for (int i = 0; i < ids.length; i++) { if (ids[i] != null && !ids[i].isEmpty()) { token.addFeature(Constants.ChemicalID.values()[i].toString()); } } String chebiId = mention.getCHEB(); if (chebiId != null) { if (chebiAvgDepth.containsKey(chebiId)) { token.addFeature(ChemSpot_Feature.CHEB_AVG_DEPTH + "_" + chebiAvgDepth.get(chebiId)); } if (chebiMinDepth.containsKey(chebiId)) { token.addFeature(ChemSpot_Feature.CHEB_MIN_DEPTH + "_" + chebiMinDepth.get(chebiId)); } if (chebiMaxDepth.containsKey(chebiId)) { token.addFeature(ChemSpot_Feature.CHEB_MAX_DEPTH + "_" + chebiMaxDepth.get(chebiId)); } if (nrChildNodes.containsKey(chebiId)) { token.addFeature(ChemSpot_Feature.CHEB_CHILDREN + "_" + nrChildNodes.get(chebiId)); } } } } } private void checkPrefixesSuffixes(JCas aJCas) { for (FeatureToken token : getFeatureTokens(aJCas)) { for (String prefix : prefixes) { if (token.getCoveredText().toLowerCase().startsWith(prefix)) { token.addFeature(ChemSpot_Feature.CHEMICAL_PREFIX); token.addFeature(ChemSpot_Feature.CHEMICAL_PREFIX + "_" + prefix.toUpperCase()); } } for (String suffix : suffixes) { if (token.getCoveredText().toLowerCase().endsWith(suffix)) { token.addFeature(ChemSpot_Feature.CHEMICAL_SUFFIX); token.addFeature(ChemSpot_Feature.CHEMICAL_SUFFIX + "_" + suffix.toUpperCase()); } } } } private void checkPhareData(JCas aJCas) { for (Sentence sentence : JCasUtil.iterate(aJCas, Sentence.class)) { String sentenceString = sentence.getCoveredText().toLowerCase(); for (List<String> terms : phareData.keySet()) { for (String term : terms) { int index = sentenceString.indexOf(term.toLowerCase()); while (index != -1) { if ((index - 1 < 0 || !Character.isLetter(sentenceString.charAt(index-1))) && (index + term.length() >= sentenceString.length() || !Character.isLetter(sentenceString.charAt(index+term.length()))) ) { for (FeatureToken token : getFeatureTokens(aJCas, sentence)) { if (token.getBegin() >= sentence.getBegin() + index && token.getEnd() <= sentence.getBegin() + index + term.length()) { token.addFeature(phareData.get(terms).replaceAll("\\s+", "_").toUpperCase()); } } } index = sentenceString.indexOf(term.toLowerCase(), index + term.length()); } } } } } private void checkWHOATC(JCas aJCas) { for (Sentence sentence : JCasUtil.iterate(aJCas, Sentence.class)) { String sentenceString = sentence.getCoveredText().toLowerCase(); for (String identifier : whoAtcList.keySet()) { String term = whoAtcList.get(identifier); int index = sentenceString.indexOf(term.toLowerCase()); while (index != -1) { if ((index - 1 < 0 || !Character.isLetter(sentenceString.charAt(index-1))) && (index + term.length() >= sentenceString.length() || !Character.isLetter(sentenceString.charAt(index+term.length()))) ) { List<String> whoAtcFeatures = new ArrayList<String>(); whoAtcFeatures.add("WHO-ATC-"+ identifier + ":" + term.replaceAll("\\s+", "_").toUpperCase()); for (String identifier2 : whoAtcList.keySet()) { if (identifier.startsWith(identifier2)) { String term2 = whoAtcList.get(identifier2); whoAtcFeatures.add("WHO-ATC-"+ identifier2 + ":" + term2.replaceAll("\\s+", "_").toUpperCase()); } } for (FeatureToken token : getFeatureTokens(aJCas, sentence)) { if (token.getBegin() >= sentence.getBegin() + index && token.getEnd() <= sentence.getBegin() + index + term.length()) { token.getFeatures().addAll(whoAtcFeatures); } } } index = sentenceString.indexOf(term.toLowerCase(), index + term.length()); } } } } public void printFeatureTokens(JCas aJCas) { List<NamedEntity> nes = new ArrayList<NamedEntity>(JCasUtil.select(aJCas, NamedEntity.class)); for (NamedEntity ne : new ArrayList<NamedEntity>(nes)) { if (Constants.GOLDSTANDARD.equals(ne.getSource())) nes.remove(ne); } for (FeatureToken token : getFeatureTokens(aJCas)) { while (!nes.isEmpty() && nes.get(0).getEnd() < token.getBegin()) { nes.remove(0); } if (!nes.isEmpty() && nes.get(0).getBegin() <= token.getBegin() && nes.get(0).getEnd() >= token.getEnd()) { NamedEntity ne = nes.remove(0); System.out.println(); System.out.println(ne.getCoveredText()); } if (!token.getFeatures().isEmpty()) { System.out.println(" " + token.getCoveredText() + " -> " + token.getFeatures()); } } } }