package org.jabref.logic.integrity; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jabref.logic.integrity.IntegrityCheck.Checker; import org.jabref.logic.l10n.Localization; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldProperty; import org.jabref.model.entry.InternalBibtexFields; public class HTMLCharacterChecker implements Checker { // Detect any HTML encoded character, private static final Pattern HTML_CHARACTER_PATTERN = Pattern.compile("&[#\\p{Alnum}]+;"); /** * Checks, if there are any HTML encoded characters in nonverbatim fields. */ @Override public List<IntegrityMessage> check(BibEntry entry) { List<IntegrityMessage> results = new ArrayList<>(); for (Map.Entry<String, String> field : entry.getFieldMap().entrySet()) { // skip verbatim fields if (InternalBibtexFields.getFieldProperties(field.getKey()).contains(FieldProperty.VERBATIM)) { continue; } Matcher characterMatcher = HTML_CHARACTER_PATTERN.matcher(field.getValue()); if (characterMatcher.find()) { results.add( new IntegrityMessage(Localization.lang("HTML encoded character found"), entry, field.getKey())); } } return results; } }