/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;

import java.io.IOException;
import java.sql.Timestamp;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;

import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision;

/**
 * Reads pairs of adjacent revisions of all articles.
 */
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig",
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class WikipediaRevisionPairReader
    extends WikipediaRevisionReaderBase
{
    public static final String REVISION_1 = "Revision1";
    public static final String REVISION_2 = "Revision2";

    /**
     * Restrict revision pairs to cases where the lengths of the two revisions differ by at least
     * this value (counted in characters).
     */
    public static final String PARAM_MIN_CHANGE = "MinChange";
    @ConfigurationParameter(name = PARAM_MIN_CHANGE, mandatory = true, defaultValue = "0")
    private int minChange;

    /**
     * Restrict revision pairs to cases where the lengths of the two revisions do not differ by
     * more than this value (counted in characters).
     */
    public static final String PARAM_MAX_CHANGE = "MaxChange";
    @ConfigurationParameter(name = PARAM_MAX_CHANGE, mandatory = true, defaultValue = "10000")
    private int maxChange;

    /** The number of revision pairs that should be skipped at the beginning. */
    public static final String PARAM_SKIP_FIRST_N_PAIRS = "SkipFirstNPairs";
    @ConfigurationParameter(name = PARAM_SKIP_FIRST_N_PAIRS, mandatory = false)
    protected int skipFirstNPairs;

    private Timestamp savedTimestamp;

    private int nrOfRevisionsProcessed;

    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        if (revisionIdFile != null || revisionIdParamArray != null) {
            this.getLogger().log(
                    Level.WARNING,
                    "Reading a predefined list of revisions is currently not supported by the "
                            + "WikipediaRevisionPairReader. Falling back to reading ALL revisions.");
            revisionIdFile = null;
            revisionIdParamArray = null;
            // TODO add support for reading a defined set of revisions (like the
            // WikipediaRevisionReader)
        }

        super.initialize(context);

        savedTimestamp = null;
        nrOfRevisionsProcessed = 0;
    }

    @Override
    public void getNext(JCas jcas)
        throws IOException, CollectionException
    {
        super.getNext(jcas);

        Timestamp currentTimestamp = timestampIter.next();

        if (currentTimestamp == null) {
            throw new CollectionException(new Throwable(
                    "Current timestamp is null. This should not happen."));
        }

        this.getLogger().log(Level.FINE, currentArticle.getPageId() + "-" + currentTimestamp);

        try {
            JCas revView1 = jcas.createView(REVISION_1);
            JCas revView2 = jcas.createView(REVISION_2);

            Revision revision1;
            Revision revision2;

            String text1 = "";
            String text2 = "";

            if (nrOfRevisionsProcessed < skipFirstNPairs) {
                if (nrOfRevisionsProcessed % 1000 == 0) {
                    this.getLogger().log(Level.INFO,
                            "Skipping " + nrOfRevisionsProcessed + "th revision.");
                }
                // create fake revisions
                revision1 = getRevision(null);
                revision2 = getRevision(null);
            }
            else {
                revision1 = getRevision(savedTimestamp);
                revision2 = getRevision(currentTimestamp);

                text1 = getText(revision1);
                text2 = getText(revision2);

                // keep only pairs whose length difference lies within [minChange, maxChange]
                int difference = Math.abs(text1.length() - text2.length());
                if (difference < minChange || difference > maxChange) {
                    text1 = "";
                    text2 = "";
                }
            }

            revView1.setDocumentText(text1);
            revView2.setDocumentText(text2);

            addDocumentMetaData(jcas, currentArticle.getPageId(), revision1.getRevisionID());
            addDocumentMetaData(revView1, currentArticle.getPageId(), revision1.getRevisionID());
            addDocumentMetaData(revView2, currentArticle.getPageId(), revision2.getRevisionID());

            addRevisionAnnotation(revView1, revision1);
            addRevisionAnnotation(revView2, revision2);

            savedTimestamp = currentTimestamp;
            if (!timestampIter.hasNext()) {
                savedTimestamp = null;
            }

            nrOfRevisionsProcessed++;
        }
        catch (WikiApiException e) {
            throw new CollectionException(e);
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }
    }

    // TODO Use SWEBLE
    private String getText(Revision rev)
    {
        String text = rev.getRevisionText();

        if (outputPlainText) {
            text = StringEscapeUtils.unescapeHtml(text);

            ParsedPage pp = parser.parse(text);
            if (pp == null) {
                return "";
            }
            text = pp.getText();

            // text = WikiUtils.mediaWikiMarkup2PlainText(text);

            // replace multiple white space with single white space
            text = WikiUtils.cleanText(text);
        }

        return text;
    }

    private Revision getRevision(Timestamp timestamp)
        throws CollectionException
    {
        Revision revision;
        if (timestamp != null) {
            try {
                revision = this.revisionApi.getRevision(currentArticle.getPageId(), timestamp);
            }
            catch (WikiApiException e) {
                throw new CollectionException(e);
            }
        }
        else {
            // placeholder revision used while skipping the first N pairs
            revision = new Revision(0);
            revision.setArticleID(currentArticle.getPageId());
            revision.setComment("");
            revision.setContributorName("");
            revision.setContributorId(null);
            revision.setRevisionID(0);
            revision.setRevisionText("");
            revision.setTimeStamp(timestamp);
            revision.setMinor(false);
        }
        return revision;
    }
}
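/*
 * Usage sketch (illustrative, not part of the original source): the reader would typically be
 * set up via uimaFIT (org.apache.uima.fit.factory.CollectionReaderFactory,
 * org.apache.uima.fit.pipeline.SimplePipeline). The database connection parameters referenced
 * below (PARAM_HOST, PARAM_DB, PARAM_USER, PARAM_PASSWORD, PARAM_LANGUAGE) are assumed to be
 * inherited from the WikipediaReaderBase superclass; check the base class for the exact names.
 * Host, database name, and credentials are placeholders.
 *
 * CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
 *         WikipediaRevisionPairReader.class,
 *         WikipediaRevisionPairReader.PARAM_HOST, "localhost",
 *         WikipediaRevisionPairReader.PARAM_DB, "wikipedia_db",
 *         WikipediaRevisionPairReader.PARAM_USER, "user",
 *         WikipediaRevisionPairReader.PARAM_PASSWORD, "password",
 *         WikipediaRevisionPairReader.PARAM_LANGUAGE, "english",
 *         WikipediaRevisionPairReader.PARAM_MIN_CHANGE, 10,
 *         WikipediaRevisionPairReader.PARAM_MAX_CHANGE, 5000);
 *
 * // Each CAS produced by the reader carries the two adjacent revisions in separate views.
 * for (JCas jcas : SimplePipeline.iteratePipeline(reader)) {
 *     JCas rev1 = jcas.getView(WikipediaRevisionPairReader.REVISION_1);
 *     JCas rev2 = jcas.getView(WikipediaRevisionPairReader.REVISION_2);
 *     // compare rev1.getDocumentText() with rev2.getDocumentText()
 * }
 */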