package de.berlin.hu.uima.ae.tagger.simple;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.chemical.Chemical;

import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;

/**
 * Regex-based annotator that scans the document text for sum-formula-like tokens
 * (for example {@code H2O} or {@code (NH4)2SO4}) and adds a {@link Chemical}
 * annotation of entity type {@code ChemicalType.FORMULA} for every accepted candidate.
 *
 * <p>A candidate is first found by {@link #FORMULA_PATTERN} and then filtered by a
 * minimum length, {@link #MUST_CONTAIN} and {@link #DOES_NOT_MATCH}.
 */
public class ChemicalFormulaTagger extends JCasAnnotator_ImplBase {

    /** Alternation of the element symbols (plus the group abbreviations Me and Ph) accepted in a formula. */
    private static final String ELEMENT_PATTERN =
            "(Ac|Ag|Al|Am|Ar|As|At|Au|B|Ba|Be|Bh|Bi|Bk|Br|Ca|Cd|Ce|Cf|Cl|Cm|Cn|Co|Cr|Cs|Cu|C|Db|Ds|Dy|Er|Es|Eu|"
            + "Fe|Fm|Fr|F|Ga|Gd|Ge|He|Hf|Hg|Ho|Hs|H|In|Ir|I|Kr|K|La|Li|Lr|Lu|Md|Me|Mg|Mn|Mo|Mt|Na|Nb|Nd|Ne|Ni|No|Np|N|Os|O|Pa|Pb|Pd|Ph|Pm|Po|Pr|"
            + "Pt|Pu|P|Ra|Rb|Re|Rf|Rg|Rh|Rn|Ru|Sb|Sc|Se|Sg|Si|Sm|Sn|Sr|S|Ta|Tb|Tc|Te|Th|Ti|Tl|Tm|U|V|W|Xe|Yb|Y|Zn|Zr)";

    /** One element symbol followed by an optional count of up to two digits and an optional charge sign. */
    private static final String MOLECULE_PATTERN = "(" + ELEMENT_PATTERN + "\\d?\\d?[\\+\\-]?)";

    /**
     * Template of the candidate regex; every {@code %s} is substituted by {@link #MOLECULE_PATTERN}.
     *
     * <p>The surrounding context is checked with negative look-arounds so that a candidate
     * located at the very beginning or the very end of the text is still found.
     */
    private static final String FORMULA_TEMPLATE =
            "(?<![a-zA-Z0-9\\-\\+\\)])"
            + "((\\d\\d?|\\(\\d\\d?\\))?"
            + "(%s|\\((%s+|\\d?\\d?[\\+\\-]|\\d\\d?[\\+\\-]?)\\)[0-9a-z]?)+)"
            + "(?![a-zA-Z0-9\\-\\+\\(])";

    /** Finds formula candidates in running text. */
    private static final Pattern FORMULA_PATTERN =
            // Literal replacement: no replacement-string escaping rules apply here.
            Pattern.compile(FORMULA_TEMPLATE.replace("%s", MOLECULE_PATTERN));

    /** A candidate must contain at least one letter and at least one digit. */
    private static final Pattern MUST_CONTAIN = Pattern.compile("\\p{Alpha}.*[0-9]|[0-9].*\\p{Alpha}");

    /**
     * Candidates that match this pattern as a whole are rejected. The alternatives are:
     * two letters followed by one digit; any occurrence of 15-19; any two-digit number
     * starting with 2-9; a stand-alone {@code 1}; a single symbol followed by a count of
     * 5 or more (optionally parenthesised); and digits followed by a single symbol.
     */
    private static final Pattern DOES_NOT_MATCH = Pattern.compile(
            "\\p{Alpha}{2}[0-9]"
            + "|.*1[5-9].*"
            + "|.*[2-9]\\d.*"
            + "|.*(\\D|^)1(\\D|$).*"
            + "|[A-Z][a-z]?([5-9]|\\d{2,}|(\\(([5-9]|\\d{2,})\\)))"
            // The lower-case class was previously written with an escaped '[', which made
            // this alternative unmatchable.
            + "|\\d+[A-Z][a-z]?");

    /**
     * Annotates every accepted formula candidate found in the document text of the given CAS.
     *
     * @param aJCas the CAS whose document text is scanned and to which annotations are added
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        String text = aJCas.getDocumentText();
        if (text == null) {
            // Nothing to scan; Pattern.matcher(null) would throw a NullPointerException.
            return;
        }

        Matcher matcher = FORMULA_PATTERN.matcher(text);
        while (matcher.find()) {
            String formula = matcher.group();
            int begin = matcher.start();
            int end = matcher.end();

            // Drop one enclosing pair of parentheses, but only if the first '(' really
            // closes at the last character; "(NH4)(SO4)" must stay untouched.
            if (isEnclosedBySinglePair(formula)) {
                begin++;
                end--;
                formula = formula.substring(1, formula.length() - 1);
            }

            if (isAcceptedFormula(formula)) {
                createFormulaAnnotation(aJCas, begin, end, null);
            }
        }
    }

    /** Returns true if the text starts with '(' and that parenthesis is closed by the final character. */
    private static boolean isEnclosedBySinglePair(String candidate) {
        int last = candidate.length() - 1;
        if (last < 1 || candidate.charAt(0) != '(' || candidate.charAt(last) != ')') {
            return false;
        }
        int depth = 0;
        for (int i = 0; i <= last; i++) {
            char c = candidate.charAt(i);
            if (c == '(') {
                depth++;
            } else if (c == ')') {
                depth--;
                if (depth == 0 && i < last) {
                    // The opening parenthesis is closed before the end of the candidate.
                    return false;
                }
            }
        }
        return depth == 0;
    }

    /** Applies the length, must-contain and rejection filters to a candidate. */
    private static boolean isAcceptedFormula(String formula) {
        return countNonParenthesisChars(formula) > 2
                && MUST_CONTAIN.matcher(formula).find()
                && !DOES_NOT_MATCH.matcher(formula).matches();
    }

    /** Counts all characters of the text except '(' and ')'. */
    private static int countNonParenthesisChars(String text) {
        int count = 0;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c != '(' && c != ')') {
                count++;
            }
        }
        return count;
    }

    /**
     * Creates a {@link Chemical} annotation of entity type {@code ChemicalType.FORMULA} with source
     * {@code Constants.SUM_TAGGER} and adds it to the CAS indexes.
     *
     * @param aJCas the CAS the annotation belongs to
     * @param begin begin offset of the annotation
     * @param end end offset of the annotation
     * @param id identifier stored on the annotation; {@code process} passes {@code null}
     * @return the annotation that was added to the indexes
     */
    private Chemical createFormulaAnnotation(JCas aJCas, int begin, int end, String id) {
        Chemical formula = new Chemical(aJCas);
        formula.setBegin(begin);
        formula.setEnd(end);
        formula.setId(id);
        formula.setSource(Constants.SUM_TAGGER);
        formula.setEntityType(ChemicalType.FORMULA.toString());
        formula.addToIndexes();
        return formula;
    }
}