/******************************************************************************/
/* Copyright (C) 2010-2011, Sebastian Hellmann */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
/* See the License for the specific language governing permissions and */
/* limitations under the License. */
/******************************************************************************/
package org.nlp2rdf.core.urischemes;
import com.jamonapi.Monitor;
import com.jamonapi.MonitorFactory;
import org.apache.commons.codec.digest.DigestUtils;
import org.nlp2rdf.core.NIFNamespaces;
import org.nlp2rdf.core.Span;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
/**
* User: hellmann
* Date: 12.02.13
*/
public class ContextHashBasedString extends AbstractURIScheme implements URIScheme {
private static final Logger log = LoggerFactory.getLogger(ContextHashBasedString.class);
public static final int firstCharLength = 20;
public static final int defaultContextLength = 10;
public static final String IDENTIFIER = "hash";
public static final String BRA = "(";
public static final String KET = ")";
@Override
public String getOWLClassURI() {
return NIFNamespaces.NIF + "ContextHashBasedString";
}
@Override
public Span[] parse(String prefix, String uri, String context) throws NIFParserException {
String[] st = uri.substring(prefix.length()).split("_");
if (st.length < 5) {
throw new NIFParserException("Too few parameters in " + uri + " expected 5");
}
if (!IDENTIFIER.equals(st[0])) {
throw new NIFParserException("Wrong identifier for " + getOWLClassURI() + " expected " + IDENTIFIER + " found " + st[0] + " in " + uri);
}
int contextLength = 0;
int anchoredPartLength = 0;
String digest = null;
try {
contextLength = Integer.parseInt(st[1]);
anchoredPartLength = Integer.parseInt(st[2]);
digest = st[3];
} catch (NumberFormatException npe) {
throw new NIFParserException("The span could not be recognized correctly for scheme " + getOWLClassURI() + " expected int_int_MD5 , found " + st[1] + "_" + st[2] + "_" + st[3], npe);
}
StringBuilder addressedString = new StringBuilder();
addressedString.append(st[4]);
for (int x = 4; x < st.length; x++) {
addressedString.append(st[x]).append("_");
}
int offset = 0;
int index;
ArrayList<Span> spans = new ArrayList<Span>();
while ((index = context.indexOf(addressedString.toString(), offset)) != -1) {
StringBuilder message = new StringBuilder();
Span spanCandidate = new Span(index, index + anchoredPartLength);
//calculate the context boundaries
message.append(URISchemeHelper.getContextBefore(spanCandidate, context, contextLength));
message.append(BRA);
message.append(spanCandidate.getCoveredText(context));
message.append(KET);
message.append(URISchemeHelper.getContextAfter(spanCandidate, context, contextLength));
String digestNew = DigestUtils.md5Hex(message.toString());
if (digest.equals(digestNew)) {
spans.add(spanCandidate);
} else {
//try the next one
offset = index;
}
}
if (spans.isEmpty()) {
throw new NIFParserException("Could not calculate spans for uri " + uri + " of scheme " + getOWLClassURI() + " string not found in context");
}
return spans.toArray(new Span[spans.size()]);
}
@Override
public String generate(String prefix, String context, Span[] spans) {
return generate(prefix, context, spans, defaultContextLength);
}
public String generate(String prefix, String context, Span[] spans, int contextLength) {
if (spans.length != 1) {
log.debug(getOWLClassURI() + " scheme only takes the first span for generation of URIs, but the array contains " + spans.length);
}
Span span = spans[0];
//the substring
String anchoredPart = span.getCoveredText(context).toString();
StringBuilder message = new StringBuilder();
//calculate the context boundaries
message.append(URISchemeHelper.getContextBefore(span, context, contextLength));
message.append(BRA);
message.append(anchoredPart);
message.append(KET);
message.append(URISchemeHelper.getContextAfter(span, context, contextLength));
String digest = DigestUtils.md5Hex(message.toString());
String firstChars = URISchemeHelper.getFirstCharacters(anchoredPart, firstCharLength);
StringBuilder uri = new StringBuilder();
uri.append(prefix);
uri.append(IDENTIFIER).append("_");
uri.append(contextLength).append("_");
uri.append(anchoredPart.length()).append("_");
uri.append(digest).append("_");
uri.append(firstChars);
if (log.isTraceEnabled()) {
log.trace("Text (" + context.length() + " chars): " + context);
log.trace("Word (" + span.getCoveredText(context).length() + " chars): " + span.getCoveredText(context));
log.trace("Span: " + span.getStart() + "|" + span.getEnd());
//log.trace("Before|After: " + before + "|" + after);
log.trace("Context (" + contextLength + ") before: |" + URISchemeHelper.getContextBefore(span, context, contextLength));
log.trace("Context (" + contextLength + ") after: |" + URISchemeHelper.getContextAfter(span, context, contextLength) + "|");
log.trace("Message: |" + message.toString() + "|");
log.trace("URI: " + uri.toString());
}
return uri.toString();
}
@Override
public boolean validate(String prefix, String uri, String context) {
Span[] span = null;
try {
span = parse(prefix, uri, context);
} catch (NIFParserException npe) {
return false;
}
if (span.length > 1) {
log.warn(uri + "this uri addresses several strings in the context, validation failed, not unique");
return false;
}
return true;
}
public int calculateMinimalContextLength(String text, Set<Span> spans) {
Monitor mon = MonitorFactory.getTimeMonitor(this.getClass().getSimpleName() + "init").start();
int contextLength = 0;
contextLength = repeat(text, spans, contextLength);
log.info("Minimal context calculated: " + contextLength + " needed: " + mon.stop().getLastValue() + " ms. ");
return contextLength;
}
private int repeat(String text, Set<Span> allSpans, int contextLength) {
Set<String> collision = new HashSet<String>();
for (Span span : allSpans) {
if (false == collision.add(generate("", text, new Span[]{span}))) {
contextLength++;
return repeat(text, allSpans, contextLength);
}
}
return contextLength;
}
}