/**
* Copyright 2008 The University of North Carolina at Chapel Hill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.unc.lib.dl.xml;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.MODS_V3_NS;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.RDF_NS;
import static edu.unc.lib.dl.xml.JDOMNamespaceUtil.SKOS_NS;
import java.io.ByteArrayInputStream;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.filter.Filters;
import org.jdom2.input.SAXBuilder;
import org.jdom2.input.sax.XMLReaderSAX2Factory;
import org.jdom2.xpath.XPathExpression;
import org.jdom2.xpath.XPathFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Utility for looking up department name hierarchies using an index constructed from a SKOS ontology
*
* @author bbpennel
* @date Jun 23, 2014
*/
public class DepartmentOntologyUtil implements VocabularyHelper {
// Logger shared by every instance of this helper
private static final Logger log = LoggerFactory.getLogger(DepartmentOntologyUtil.class);
// Index of cleaned identifiers and labels mapped to the department concepts pulled from the ontology.
// Assigned by parseVocabulary, so it is null until vocabulary content has been supplied.
private Map<String, DepartmentConcept> departments;
// Lower-case university name searched for when classifying and trimming affiliations
private final String UNC_NAME = "university of north carolina";
// Used by determineStyle: comma separated text containing a number is treated as an address
private final Pattern addressPattern;
// Used by determineStyle: at least two comma terminated segments followed by one or two words
private final Pattern addressTrailingPattern;
// Splits an address style affiliation immediately before a location term (case-insensitive)
private final Pattern addressSplit;
// Leading punctuation, whitespace and the words "the ", "at ", "and " removed before lookup
private final Pattern trimLeading;
// Trailing punctuation, whitespace and the words "the ", "at " removed before lookup
private final Pattern trimTrailing;
// Separator used to break an affiliation that names several departments joined by "and"
private final Pattern deptSplitPlural;
// Trailing location term (optionally followed by of/for/in) removed to handle inverted names
private final Pattern trimSuffix;
// The words "unc" or "carolina" followed by whitespace, removed when a direct lookup fails
private final Pattern trimUNC;
// Runs of colons and parentheses used to split a simple affiliation into parts
private final Pattern splitSimple;
// XPath expression selecting mods:name elements; stays null if compilation fails in the constructor
private XPathExpression<Element> namePath;
// URI of the vocabulary, only stored and reported by this class
private String vocabularyURI;
// Alternation of words that introduce an organisational unit, embedded in several patterns above
private static final String locationTermPattern
= "dep(t\\.?|artment(s)?)|school|division|section(s)?|program in|center for|university";
public DepartmentOntologyUtil() {
addressPattern = Pattern.compile("([^,]+,)+\\s*[a-zA-Z ]*\\d+[a-zA-Z]*\\s*[^\\n]*");
addressTrailingPattern = Pattern.compile("([^,]+,){2,}\\s*([a-zA-Z]+ ?){1,2}\\s*");
addressSplit = Pattern.compile(
"(,? *(and *)?(?=" + locationTermPattern + ")(?= of)?)",
Pattern.CASE_INSENSITIVE);
trimLeading = Pattern.compile("^([.?;:*&^%$#@!\\-]|the |at |and |\\s)+");
trimTrailing = Pattern.compile("([.?;:*&^%$#@!\\-]|the |at |\\s)+$");
deptSplitPlural = Pattern
.compile("(and the |and (the )?(" + locationTermPattern + ")( of)?|and )");
trimSuffix = Pattern.compile("\\s*(" + locationTermPattern + ")( of| for| in)?$");
trimUNC = Pattern.compile("\\b(unc|carolina)\\s+");
splitSimple = Pattern.compile("(\\s*[:()]\\s*)+");
try {
XPathFactory xFactory = XPathFactory.instance();
namePath = xFactory.compile("mods:name", Filters.element(MODS_V3_NS), null, MODS_V3_NS);
} catch (Exception e) {
log.error("Failed to construct xpath", e);
}
}
/**
 * Resolves an affiliation to the authoritative department hierarchies that best match it.
 *
 * The outer list holds one entry per matching hierarchy, since a term may have several parents; each inner list
 * holds the steps of one hierarchy. The affiliation is first looked up verbatim (after label cleaning); if that
 * fails it is classified as an address or a simple name and broken into parts that are looked up individually.
 *
 * @param affiliation affiliation text to resolve
 * @return the matching hierarchies, or null if the affiliation is not applicable or nothing matched
 */
@Override
public List<List<String>> getAuthoritativeForm(String affiliation) {
    final String normalized = cleanLabel(affiliation);

    // A verbatim hit on the index needs no further interpretation
    final DepartmentConcept exactMatch = departments.get(normalized);
    if (exactMatch != null) {
        return buildHierarchy(exactMatch);
    }

    final AffiliationStyle style = this.determineStyle(normalized);

    if (style == AffiliationStyle.address) {
        // Prefer splitting before location terms, then fall back to splitting on commas
        final List<List<String>> byLocationTerm = getAddressDepartment(addressSplit.split(normalized));
        if (byLocationTerm != null) {
            return byLocationTerm;
        }
        return getAddressDepartment(normalized.split("\\s*,\\s*"));
    }

    if (style == AffiliationStyle.simple) {
        // Examine the parts from last to first and stop at the first one that resolves
        final String[] segments = splitSimple.split(normalized);
        int position = segments.length - 1;
        while (position >= 0) {
            final List<List<String>> found = getDepartment(segments[position]);
            if (found != null) {
                return found;
            }
            position--;
        }
    }

    // Either not applicable or nothing in the vocabulary matched
    return null;
}
/**
 * Collects the distinct, non-blank affiliation values found under the name children of the given element and
 * returns the authoritative hierarchies for all of them, with redundant paths collapsed.
 *
 * @param docElement element whose mods:name children are inspected
 * @return list of hierarchy paths, empty when no affiliation resolves
 * @throws JDOMException declared by the interface
 */
@Override
public List<List<String>> getAuthoritativeForms(Element docElement) throws JDOMException {
    final Set<String> distinctAffiliations = new HashSet<String>();
    for (Element nameEl : docElement.getChildren("name", MODS_V3_NS)) {
        for (Element affiliationEl : nameEl.getChildren("affiliation", MODS_V3_NS)) {
            final String text = affiliationEl.getValue();
            if (text == null || text.trim().isEmpty()) {
                continue;
            }
            distinctAffiliations.add(text);
        }
    }

    final List<List<String>> resolvedPaths = new ArrayList<List<String>>(distinctAffiliations.size());
    for (String text : distinctAffiliations) {
        final List<List<String>> paths = getAuthoritativeForm(text);
        if (paths != null) {
            resolvedPaths.addAll(paths);
        }
    }

    // Drop duplicates and paths that are covered by a more specific one
    collapsePaths(resolvedPaths);
    return resolvedPaths;
}
/**
 * Compares the affiliation values under each mods:name in the given element against the ontology. When
 * authoritative terms are found for an affiliation they are inserted next to it and the original is removed,
 * unless the original already is the most specific authoritative term. Only the first and last terms of each
 * hierarchy are added when it has more than one level.
 *
 * @param docElement element searched for mods:name elements
 * @return true if any affiliation was added to or removed from the document
 * @throws JDOMException declared by the interface
 */
@Override
public boolean updateDocumentTerms(Element docElement) throws JDOMException {
    boolean modified = false;

    for (Element nameEl : namePath.evaluate(docElement)) {
        List<Element> affiliationEls = nameEl.getChildren("affiliation", MODS_V3_NS);
        if (affiliationEls.isEmpty()) {
            continue;
        }

        // Collect the set of all affiliations for this name so that it can be used to detect duplicates
        Set<String> affiliationSet = new HashSet<String>();
        for (Element affiliationEl : affiliationEls) {
            affiliationSet.add(affiliationEl.getTextNormalize());
        }

        // Iterate over a snapshot because the live child list is modified below
        List<Element> snapshot = new ArrayList<Element>(affiliationEls);
        for (Element affiliationEl : snapshot) {
            String affiliation = affiliationEl.getTextNormalize();
            List<List<String>> deptPaths = getAuthoritativeForm(affiliation);
            if (deptPaths == null || deptPaths.isEmpty()) {
                continue;
            }

            Element parentEl = affiliationEl.getParentElement();
            int affilIndex = parentEl.indexOf(affiliationEl);
            boolean removeOriginal = true;

            // Add each path that matched the affiliation. There can be multiple if there were multiple parents
            for (List<String> deptPath : deptPaths) {
                String baseDept = deptPath.size() > 1 ? deptPath.get(0) : null;
                String topDept = deptPath.get(deptPath.size() - 1);

                // No need to remove the original if it is in the set of departments being added
                if (affiliation.equals(topDept)) {
                    removeOriginal = false;
                }

                modified = addAffiliation(baseDept, parentEl, affilIndex, affiliationSet) || modified;
                modified = addAffiliation(topDept, parentEl, affilIndex, affiliationSet) || modified;
            }

            // Remove the old affiliation unless it was already in the vocabulary. Removal changes the
            // document even when every replacement term was already present, so it must be reported.
            if (removeOriginal) {
                parentEl.removeContent(affiliationEl);
                modified = true;
            }
        }
    }

    return modified;
}
/**
 * Inserts a new affiliation element holding the given department into the parent, unless the department is null
 * or already recorded in affiliationSet.
 *
 * @param dept department name to add, may be null
 * @param parentEl element receiving the new affiliation; its namespace is used for the new element
 * @param affilIndex content index at which insertion is attempted
 * @param affiliationSet affiliations already present for this parent; updated when a department is added
 * @return True if an affiliation was added
 */
private boolean addAffiliation(String dept, Element parentEl, int affilIndex, Set<String> affiliationSet) {
    // Nothing to add, or the department is already listed
    if (dept == null || affiliationSet.contains(dept)) {
        return false;
    }

    final Element created = new Element("affiliation", parentEl.getNamespace());
    created.setText(dept);

    try {
        // Keep the new element close to the affiliation it was derived from
        parentEl.addContent(affilIndex, created);
    } catch (IndexOutOfBoundsException e) {
        // The index is not usable, so append instead
        parentEl.addContent(created);
    }

    affiliationSet.add(dept);
    return true;
}
/**
 * Looks up every part of a split address and gathers all department hierarchies that were found.
 *
 * @param addressParts components of an address style affiliation
 * @return the collapsed list of hierarchies, or null if no part matched a department
 */
private List<List<String>> getAddressDepartment(String[] addressParts) {
    final List<List<String>> matchedPaths = new ArrayList<List<String>>();
    for (String segment : addressParts) {
        final List<List<String>> segmentPaths = getDepartment(segment);
        if (segmentPaths != null) {
            matchedPaths.addAll(segmentPaths);
        }
    }

    if (matchedPaths.isEmpty()) {
        return null;
    }

    // Deduplicate the paths and remove entries which are subsets of a more exact path
    collapsePaths(matchedPaths);
    return matchedPaths;
}
/**
 * Tries progressively more aggressive normalizations of the given affiliation and returns the hierarchy of the
 * first variation found in the index. If no variation matches, the original text is split into multiple
 * departments, each of which is resolved recursively.
 *
 * @param affiliation affiliation text or fragment, may be null
 * @return matching hierarchies, or null if the input is null/empty or nothing matched
 */
private List<List<String>> getDepartment(String affiliation) {
    if (affiliation == null || affiliation.isEmpty()) {
        return null;
    }

    // Every ampersand is spelled out as "and"
    String candidate = affiliation.replace("&", "and");

    // Cut off the university name and whatever follows it, unless the text begins with it
    final int uncStart = candidate.indexOf(UNC_NAME);
    if (uncStart > 0) {
        candidate = candidate.substring(0, uncStart);
    }

    // Strip leading and trailing punctuation and articles, then expand the abbreviation
    candidate = trimLeading.matcher(candidate).replaceAll("");
    candidate = trimTrailing.matcher(candidate).replaceAll("");
    candidate = candidate.replaceAll("\\bdept\\b", "department");

    DepartmentConcept match = departments.get(candidate);

    if (match == null) {
        // Attempt without superfluous UNC's
        candidate = trimUNC.matcher(candidate).replaceFirst("");
        match = departments.get(candidate);
    }

    if (match == null) {
        // Handle inverted departments and slash/and mixups
        candidate = trimSuffix.matcher(candidate).replaceAll("");
        candidate = candidate.replaceAll("\\s*/\\s*", " and ").trim();
        if (candidate.endsWith(",")) {
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        match = departments.get(candidate);
    }

    if (match == null) {
        // Try to uninvert the name by moving the text before the first comma to the end
        final int commaAt = candidate.indexOf(',');
        if (commaAt != -1) {
            final String uninverted = candidate.substring(commaAt + 1).trim() + ' '
                    + candidate.substring(0, commaAt).trim();
            match = departments.get(uninverted);
        }
    }

    if (match != null) {
        return buildHierarchy(match);
    }

    // Check if there are multiple departments in the original affiliation
    final String[] pieces = deptSplitPlural.split(affiliation);
    if (pieces.length <= 1) {
        return null;
    }

    // Look each department up separately and combine whatever is found
    final List<List<String>> combined = new ArrayList<List<String>>();
    for (String piece : pieces) {
        final String singular = piece.trim().replace("departments", "department");
        final List<List<String>> piecePaths = getDepartment(singular);
        if (piecePaths != null) {
            combined.addAll(piecePaths);
        }
    }
    return combined.isEmpty() ? null : combined;
}
/**
 * Produces every hierarchy path that leads up to and includes the given department concept.
 *
 * @param dept concept at the bottom of the hierarchy
 * @return one list of department identifiers per path
 */
private List<List<String>> buildHierarchy(DepartmentConcept dept) {
    final List<List<String>> paths = new ArrayList<List<String>>();
    final ArrayDeque<String> trail = new ArrayDeque<String>();
    // The walk fills paths as it reaches the top of each chain
    walkHierarchy(dept, trail, paths);
    return paths;
}
/**
 * Recursively follows the broader references of dept and records one path per chain of ancestors. Each recorded
 * path is a copy of deptStack, which holds identifiers with the most distant ancestor first and the starting
 * department last.
 *
 * A chain ends when a concept has no broader references or when a broader reference is not present in the index.
 * If a concept is reached that is already on the current path, the broader references form a cycle; the path
 * collected so far is recorded and the walk stops instead of recursing without end.
 *
 * @param dept concept currently being visited
 * @param deptStack identifiers on the path currently being walked
 * @param deptPaths receives the completed paths
 */
private void walkHierarchy(DepartmentConcept dept, ArrayDeque<String> deptStack, List<List<String>> deptPaths) {
    // Guard against cyclic broader references in the vocabulary
    if (deptStack.contains(dept.identifier)) {
        log.warn("Cyclic broader reference detected at concept {}, truncating hierarchy", dept.identifier);
        deptPaths.add(new ArrayList<String>(deptStack));
        return;
    }

    deptStack.addFirst(dept.identifier);

    if (dept.broader != null && dept.broader.size() > 0) {
        // Follow every broader reference; one that is not indexed ends the path here
        for (String broader : dept.broader) {
            DepartmentConcept parentDept = departments.get(broader);
            if (parentDept == null) {
                deptPaths.add(new ArrayList<String>(deptStack));
            } else {
                walkHierarchy(parentDept, deptStack, deptPaths);
            }
        }
    } else {
        // Top of the hierarchy reached
        deptPaths.add(new ArrayList<String>(deptStack));
    }

    deptStack.removeFirst();
}
/**
 * Determines what style of affiliation the given text adheres to, or if it should not be processed.
 *
 * Text containing the UNC name is an address unless something other than "chapel hill" follows the name. Text
 * mentioning any other university is not applicable. Text shaped like an address is only processed when it
 * mentions chapel hill. Everything else is treated as a simple department name.
 *
 * @param affiliation cleaned (lower-case) affiliation text
 * @return the detected style
 */
private AffiliationStyle determineStyle(String affiliation) {
    String department = affiliation.trim();

    int indexUNC = department.indexOf(UNC_NAME);
    if (indexUNC != -1) {
        // Only the text following the university name decides which campus is meant. Starting the
        // substring at indexUNC would include the name itself and make the emptiness check meaningless.
        String afterUNC = department.substring(indexUNC + UNC_NAME.length());
        // make sure it is UNC chapel hill
        if (afterUNC.trim().length() > 0 && !afterUNC.contains("chapel hill")) {
            return AffiliationStyle.notApplicable;
        }
        // since it contains the university name, it is most likely an address
        return AffiliationStyle.address;
    }

    if (department.contains("university")) {
        // From another University, skip
        return AffiliationStyle.notApplicable;
    }

    boolean looksLikeAddress = addressPattern.matcher(department).matches()
            || addressTrailingPattern.matcher(department).matches();
    if (!looksLikeAddress) {
        return AffiliationStyle.simple;
    }

    // If the address is located in chapel hill, it is worth further processing
    return department.contains("chapel hill") ? AffiliationStyle.address : AffiliationStyle.notApplicable;
}
/**
 * Parses a SKOS XML vocabulary from the given bytes and rebuilds the lookup index of identifiers and alternative
 * labels referencing the authoritative concepts.
 *
 * The existing index is only replaced once the document has been parsed and all concepts have been extracted,
 * so a malformed document leaves the previously loaded vocabulary in place.
 *
 * @param content serialized SKOS document
 * @throws Exception if the content cannot be parsed or a concept lacks an rdf:about attribute
 */
private void parseVocabulary(byte[] content) throws Exception {
    log.debug("Parsing and building Department vocabulary from {}", getVocabularyURI());

    // NOTE(review): the reader factory is created with validation off; entity handling is left at the
    // parser defaults - confirm the vocabulary always comes from a trusted source.
    SAXBuilder sb = new SAXBuilder(new XMLReaderSAX2Factory(false));
    Document skosDoc = sb.build(new ByteArrayInputStream(content));

    // Extract all of the concepts and store them to a temporary index keyed by cleaned identifier
    List<Element> concepts = skosDoc.getRootElement().getChildren("Concept", SKOS_NS);
    Map<String, DepartmentConcept> tempDepts = new HashMap<String, DepartmentConcept>(concepts.size());
    for (Element conceptEl : concepts) {
        DepartmentConcept dept = new DepartmentConcept(conceptEl);
        tempDepts.put(cleanLabel(dept.getIdentifier()), dept);
    }

    // Parsing succeeded, so it is now safe to discard the previous index
    departments = new HashMap<String, DepartmentConcept>();

    // Expand out all the alternative labels into the index and resolve references
    for (DepartmentConcept dept : tempDepts.values()) {
        // Check if this concept should be ignored in favor of a preferred concept
        if (dept.prefLabel != null) {
            if (departments.containsKey(dept.prefLabel)) {
                // The preferred concept has already been indexed, grab extra labels from this concept and
                // reindex the preferred concept
                DepartmentConcept prefDept = departments.get(dept.prefLabel);
                prefDept.merge(dept);
                addLabels(prefDept);
            } else {
                // Since the preferred concept isn't indexed yet, just need to merge labels into it
                DepartmentConcept prefDept = tempDepts.get(dept.prefLabel);
                if (prefDept == null) {
                    log.warn("Preferred label {} referencing a concept which is not present", dept.prefLabel);
                } else {
                    prefDept.merge(dept);
                }
            }
            continue;
        }

        String identifier = cleanLabel(dept.identifier);
        if (departments.containsKey(identifier) && dept.identifier.equals(departments.get(identifier).identifier)) {
            log.error("Illegal state, multiple concepts share the identifier {}, ignoring duplicate", identifier);
        } else {
            departments.put(identifier, dept);
        }

        addLabels(dept);
    }
}
/**
 * Removes, in place, every path whose entries are all contained in another path of at least the same length.
 * This drops duplicates (one copy survives) as well as paths that are subsets of a more exact path.
 *
 * @param paths list of paths to reduce; modified by this call
 */
public static void collapsePaths(List<List<String>> paths) {
    for (Iterator<List<String>> pathIt = paths.iterator(); pathIt.hasNext();) {
        final List<String> candidate = pathIt.next();
        if (isCoveredByAnother(candidate, paths)) {
            pathIt.remove();
        }
    }
}

/**
 * Reports whether some other list instance in paths is at least as long as candidate and contains all of its
 * entries.
 */
private static boolean isCoveredByAnother(List<String> candidate, List<List<String>> paths) {
    for (List<String> other : paths) {
        // Skip the candidate itself and any path too short to cover it
        if (other == candidate || candidate.size() > other.size()) {
            continue;
        }
        if (other.containsAll(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Normalizes a label for use as an index key: lower-cases it and removes periods and apostrophes.
 *
 * Lower-casing uses Locale.ROOT so that keys do not depend on the JVM default locale (under a Turkish locale,
 * for example, "I" would otherwise not become "i").
 *
 * @param label label to normalize, must not be null
 * @return the normalized label
 */
private static String cleanLabel(String label) {
    return label.toLowerCase(Locale.ROOT).replaceAll("[.']+", "");
}
/**
 * Indexes each of the department's other labels that is not yet a key in the index. A label that is already
 * indexed is left untouched; a warning is logged when it belongs to a different department.
 *
 * @param dept department whose labels are added
 */
private void addLabels(DepartmentConcept dept) {
    if (dept.otherLabels == null) {
        return;
    }

    for (String label : dept.otherLabels) {
        if (!departments.containsKey(label)) {
            departments.put(label, dept);
            continue;
        }

        // The label is taken already; only report it when another department owns it
        if (departments.get(label) != dept) {
            log.warn("Label collision for key {}", label);
        }
    }
}
/**
 * Returns the affiliation values in the given MODS element that could not be resolved against the department
 * vocabulary.
 */
@Override
public Set<String> getInvalidTerms(Element modsRoot) throws JDOMException {
    return getInvalidTerms(modsRoot, false);
}

/**
 * Returns the unresolved affiliation values, each preceded by the invalid term prefix and a pipe character.
 */
@Override
public Set<String> getInvalidTermsWithPrefix(Element modsRoot) throws JDOMException {
    return getInvalidTerms(modsRoot, true);
}

/**
 * Gathers the affiliations under every mods:name of the given element for which no authoritative form exists.
 *
 * @param modsRoot element searched for mods:name elements
 * @param includePrefix when true, each term is reported as prefix + "|" + affiliation
 * @return set of unresolved affiliations, empty if all of them resolved
 * @throws JDOMException declared for callers; mirrors the interface methods
 */
public Set<String> getInvalidTerms(Element modsRoot, boolean includePrefix) throws JDOMException {
    final Set<String> unresolved = new HashSet<String>();

    for (Element nameEl : namePath.evaluate(modsRoot)) {
        for (Element affiliationEl : nameEl.getChildren("affiliation", MODS_V3_NS)) {
            final String term = affiliationEl.getTextNormalize();
            final List<List<String>> paths = getAuthoritativeForm(term);
            if (paths != null && !paths.isEmpty()) {
                // Found in the ontology, nothing to report
                continue;
            }
            unresolved.add(includePrefix ? getInvalidTermPrefix() + "|" + term : term);
        }
    }

    return unresolved;
}
/**
* Returns the index of cleaned identifiers and labels to department concepts. This is the map used internally,
* not a copy, and it is null until vocabulary content has been parsed.
*/
public Map<String, DepartmentConcept> getDepartments() {
return departments;
}
/**
* Selectors are not currently used for this helper, so the given value is ignored
*/
@Override
public void setSelector(String selector) {
}
/**
* Returns the XPath expression used to select mods:name elements, or null if it failed to compile
*/
public XPathExpression<Element> getNamePath() {
return namePath;
}
/**
 * Returns the distinct identifiers of all concepts reachable through the index. A concept indexed under several
 * labels is reported once.
 *
 * @see edu.unc.lib.dl.xml.VocabularyHelper#getVocabularyTerms()
 */
@Override
public Collection<String> getVocabularyTerms() {
    final Set<String> identifiers = new HashSet<>();
    final Iterator<DepartmentConcept> conceptIt = departments.values().iterator();
    while (conceptIt.hasNext()) {
        identifiers.add(conceptIt.next().getIdentifier());
    }
    return identifiers;
}
/**
* Returns the fixed prefix placed before invalid terms when a prefixed result is requested
*/
@Override
public String getInvalidTermPrefix() {
return "affiliation";
}
/*
* (non-Javadoc)
*
* Replaces the department index by parsing the given SKOS document bytes.
*
* @see edu.unc.lib.dl.xml.VocabularyHelper#setContent(byte[])
*/
@Override
public void setContent(byte[] content) throws Exception {
parseVocabulary(content);
}
/**
* Classification assigned to an affiliation by determineStyle: simple affiliations are split on colons and
* parentheses, address affiliations are split into address components, and notApplicable affiliations are not
* looked up at all.
*/
public static enum AffiliationStyle {
simple, address, notApplicable;
}
/**
 * Stores ontology information for one department concept: its identifier (the rdf:about value), an optional
 * cleaned preferred label naming the concept that should be used instead of this one, the cleaned references to
 * broader concepts, and the cleaned alternative and hidden labels.
 *
 * @author bbpennel
 * @date Jun 30, 2014
 */
public static class DepartmentConcept {
    private final String identifier;
    private String prefLabel;
    private List<String> broader;
    private final List<String> otherLabels;

    /**
     * Builds a concept from a skos:Concept element.
     *
     * @param conceptEl element describing the concept
     * @throws IllegalArgumentException if the element has no rdf:about attribute
     */
    public DepartmentConcept(Element conceptEl) throws IllegalArgumentException {
        String deptLabel = conceptEl.getAttributeValue("about", RDF_NS);
        if (deptLabel == null) {
            throw new IllegalArgumentException("Invalid concept without a rdf:about attribute found");
        }
        this.identifier = deptLabel;

        setBroader(conceptEl.getChildren("broader", SKOS_NS));

        this.prefLabel = conceptEl.getChildText("prefLabel", SKOS_NS);
        if (this.prefLabel != null) {
            this.prefLabel = cleanLabel(this.prefLabel);
        }

        this.otherLabels = new ArrayList<String>();
        addLabelsFromElements(conceptEl.getChildren("altLabel", SKOS_NS));
        addLabelsFromElements(conceptEl.getChildren("hiddenLabel", SKOS_NS));
    }

    /**
     * Copies the broader references and other labels of the incoming concept into this one, skipping values
     * that are already present.
     *
     * @param incoming concept whose references and labels are absorbed
     */
    public void merge(DepartmentConcept incoming) {
        if (incoming.broader != null) {
            for (String newBroader : incoming.broader) {
                String cleaned = cleanLabel(newBroader);
                if (!this.broader.contains(cleaned)) {
                    this.broader.add(cleaned);
                }
            }
        }

        if (incoming.otherLabels != null) {
            for (String newLabel : incoming.otherLabels) {
                String cleaned = cleanLabel(newLabel);
                if (!this.otherLabels.contains(cleaned)) {
                    this.otherLabels.add(cleaned);
                }
            }
        }
    }

    /**
     * Adds the cleaned, whitespace-normalized text of each given element to the other labels.
     *
     * @param labelEls label elements, may be null
     */
    public void addLabelsFromElements(List<?> labelEls) {
        if (labelEls == null) {
            return;
        }

        for (Object labelEl : labelEls) {
            otherLabels.add(cleanLabel(((Element) labelEl).getTextNormalize()));
        }
    }

    /**
     * Replaces the broader references with the rdf:resource values of the given elements.
     *
     * The values are cleaned the same way as the keys of the department index and as the values added by
     * merge, so that a reference to a concept whose identifier contains periods or apostrophes can be
     * resolved. Elements without an rdf:resource attribute are skipped with a warning.
     *
     * @param broaderEls skos:broader elements
     */
    public void setBroader(List<?> broaderEls) {
        broader = new ArrayList<String>(broaderEls.size());
        for (Object broaderEl : broaderEls) {
            String resource = ((Element) broaderEl).getAttributeValue("resource", RDF_NS);
            if (resource == null) {
                log.warn("Ignoring broader reference without a rdf:resource attribute in concept {}", identifier);
                continue;
            }
            broader.add(cleanLabel(resource));
        }
    }

    /** Returns the rdf:about value of the concept, unmodified. */
    public String getIdentifier() {
        return identifier;
    }

    /** Returns the cleaned preferred label, or null if the concept has none. */
    public String getPrefLabel() {
        return prefLabel;
    }

    /** Returns the internal list of cleaned broader references. */
    public List<String> getBroader() {
        return broader;
    }
}
/*
* (non-Javadoc)
*
* Returns the vocabulary URI previously supplied through setVocabularyURI, or null if none was set.
*
* @see edu.unc.lib.dl.xml.VocabularyHelper#getVocabularyURI()
*/
@Override
public String getVocabularyURI() {
return vocabularyURI;
}
/**
* Stores the URI identifying the vocabulary; within this class it is only reported back and logged.
*/
@Override
public void setVocabularyURI(String vocabularyURI) {
this.vocabularyURI = vocabularyURI;
}
/**
* No-op: the supplied namespaces are ignored by this helper.
*/
@Override
public void setSelectorNamespaces(Namespace[] namespaces) {
}
/**
* Returns the fixed XPath selector for affiliation elements. The setter for selectors is a no-op, so this value
* never changes.
*/
@Override
public String getSelector() {
return "//mods:name/mods:affiliation";
}
}