package org.wikipedia.miner.extract.steps.finalSummary;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.hadoop.util.AvroCharSequenceComparator;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.mapred.Pair;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.record.CsvRecordOutput;
import org.apache.hadoop.record.Record;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.struct.DbIntList;
import org.wikipedia.miner.db.struct.DbLabel;
import org.wikipedia.miner.db.struct.DbLabelForPage;
import org.wikipedia.miner.db.struct.DbLabelForPageList;
import org.wikipedia.miner.db.struct.DbLinkLocation;
import org.wikipedia.miner.db.struct.DbLinkLocationList;
import org.wikipedia.miner.db.struct.DbPage;
import org.wikipedia.miner.db.struct.DbSenseForLabel;
import org.wikipedia.miner.extract.model.struct.LabelOccurrences;
import org.wikipedia.miner.extract.model.struct.LabelSense;
import org.wikipedia.miner.extract.model.struct.LabelSenseList;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.LinkSummary;
import org.wikipedia.miner.extract.model.struct.PageDepthSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.model.struct.PrimaryLabels;
import org.wikipedia.miner.extract.steps.LocalStep;
import org.wikipedia.miner.extract.steps.Step;
import org.wikipedia.miner.extract.steps.labelOccurrences.LabelOccurrenceStep;
import org.wikipedia.miner.extract.steps.labelSenses.LabelSensesStep;
import org.wikipedia.miner.extract.steps.pageDepth.PageDepthStep;
import org.wikipedia.miner.extract.steps.primaryLabel.PrimaryLabelStep;
import org.wikipedia.miner.extract.steps.sortedPages.PageSortingStep;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.model.Page.PageType;
public class FinalSummaryStep extends LocalStep {
private static Logger logger = Logger.getLogger(FinalSummaryStep.class) ;
private PageSortingStep pageSortingStep ;
private PageDepthStep pageDepthStep ;
private PrimaryLabelStep primaryLabelStep ;
private LabelSensesStep labelSensesStep ;
private LabelOccurrenceStep labelOccurrenceStep ;
private Comparator<DbLabelForPage> labelComparator = new Comparator<DbLabelForPage>() {
public int compare(DbLabelForPage a, DbLabelForPage b) {
int cmp = new Long(b.getLinkOccCount()).compareTo(a.getLinkOccCount()) ;
if (cmp != 0)
return cmp ;
cmp = new Long(b.getLinkDocCount()).compareTo(a.getLinkDocCount()) ;
if (cmp != 0)
return cmp ;
return(a.getText().compareTo(b.getText())) ;
}
} ;
private AvroCharSequenceComparator<CharSequence> labelTextComparator = new AvroCharSequenceComparator<CharSequence>() ;
public FinalSummaryStep(Path workingDir, PageSortingStep pageSortingStep, PageDepthStep pageDepthStep, PrimaryLabelStep primaryLabelStep, LabelSensesStep labelSensesStep, LabelOccurrenceStep labelOccurrenceStep) throws IOException {
super(workingDir);
this.pageSortingStep = pageSortingStep ;
this.pageDepthStep = pageDepthStep ;
this.primaryLabelStep = primaryLabelStep ;
this.labelSensesStep = labelSensesStep ;
this.labelOccurrenceStep = labelOccurrenceStep ;
}
@Override
public int run() throws Exception {
logger.info("Starting final step");
if (isFinished()) {
logger.info(" - already completed");
return 0 ;
} else
reset() ;
finalizePageStuff() ;
finalizeLabelStuff() ;
finish() ;
return 0 ;
}
public void finalizePageStuff() throws IOException {
BufferedWriter pageWriter = createWriter("page.csv") ;
BufferedWriter articleParentsWriter = createWriter("articleParents.csv") ;
BufferedWriter categoryParentsWriter = createWriter("categoryParents.csv") ;
BufferedWriter childArticlesWriter = createWriter("childArticles.csv") ;
BufferedWriter childCategoriesWriter = createWriter("childCategories.csv") ;
BufferedWriter pageLabelWriter = createWriter("pageLabel.csv") ;
BufferedWriter pageLinkInWriter = createWriter("pageLinkIn.csv") ;
BufferedWriter pageLinkOutWriter = createWriter("pageLinkOut.csv") ;
BufferedWriter redirectSourcesByTargetWriter = createWriter("redirectSourcesByTarget.csv") ;
BufferedWriter redirectTargetsBySourceWriter = createWriter("redirectTargetsBySource.csv") ;
BufferedWriter sentenceSplitsWriter = createWriter("sentenceSplits.csv") ;
Path pageDetailPath = getMainAvroResultPath(pageSortingStep) ;
SeekableInput pageDetailInput = new FsInput(pageDetailPath, new Configuration());
Schema pageDetailSchema = Pair.getPairSchema(Schema.create(Type.INT),PageDetail.getClassSchema()) ;
DatumReader<Pair<Integer,PageDetail>> pageDetailDatumReader = new SpecificDatumReader<Pair<Integer,PageDetail>>(pageDetailSchema);
FileReader<Pair<Integer,PageDetail>> pageDetailReader = DataFileReader.openReader(pageDetailInput, pageDetailDatumReader) ;
Path pageDepthsPath = getMainAvroResultPath(pageDepthStep) ;
SeekableInput pageDepthsInput = new FsInput(pageDepthsPath, new Configuration());
Schema pageDepthsSchema = Pair.getPairSchema(Schema.create(Type.INT),PageDepthSummary.getClassSchema()) ;
DatumReader<Pair<Integer,PageDepthSummary>> pageDepthsDatumReader = new SpecificDatumReader<Pair<Integer,PageDepthSummary>>(pageDepthsSchema);
FileReader<Pair<Integer,PageDepthSummary>> pageDepthsReader = DataFileReader.openReader(pageDepthsInput, pageDepthsDatumReader) ;
Path primaryLabelPath = getMainAvroResultPath(primaryLabelStep) ;
SeekableInput primaryLabelInput = new FsInput(primaryLabelPath, new Configuration());
Schema primaryLabelSchema = Pair.getPairSchema(Schema.create(Type.INT),PrimaryLabels.getClassSchema()) ;
DatumReader<Pair<Integer,PrimaryLabels>> primaryLabelDatumReader = new SpecificDatumReader<Pair<Integer,PrimaryLabels>>(primaryLabelSchema);
FileReader<Pair<Integer,PrimaryLabels>> primaryLabelReader = DataFileReader.openReader(primaryLabelInput, primaryLabelDatumReader) ;
//read through pageDetail and pageDepth files simultaneously.
//both are sorted by id, but pageDepth will be missing many entries.
Pair<Integer,PageDetail> detailPair = null ;
Pair<Integer,PageDepthSummary> depthPair = null ;
Pair<Integer,PrimaryLabels> primaryLabelPair = null ;
while (pageDetailReader.hasNext()) {
detailPair = pageDetailReader.next();
PageDetail detail = detailPair.value() ;
//identify page depth summary, if there is one
while ((depthPair == null || depthPair.key() < detailPair.key()) && pageDepthsReader.hasNext())
depthPair = pageDepthsReader.next();
PageDepthSummary depth = null ;
if (depthPair.key().equals(detailPair.key()))
depth = depthPair.value() ;
//identify primary label summary, if there is one
while ((primaryLabelPair == null || primaryLabelPair.key() < detailPair.key()) && primaryLabelReader.hasNext())
primaryLabelPair = primaryLabelReader.next();
Set<CharSequence> primaryLabels = new HashSet<CharSequence>() ;
if (primaryLabelPair.key().equals(detailPair.key()))
primaryLabels.addAll(primaryLabelPair.value().getLabels()) ;
//now we definitely have a page. If we have a depth, then it is synchonized with page
DbPage page = buildPage(detail, depth) ;
if (page.getType() == PageType.invalid.ordinal())
continue ;
write(detail.getId(), page, pageWriter) ;
if (detail.getNamespace() == SiteInfo.MAIN_KEY) {
if (detail.getRedirectsTo() == null) {
//this is an article or disambig
DbIntList articleParents = buildIntList(detail.getParentCategories()) ;
write(detail.getId(),articleParents, articleParentsWriter) ;
DbLinkLocationList linksIn = buildLinkLocationList(detail.getLinksIn());
write(detail.getId(), linksIn, pageLinkInWriter) ;
DbLinkLocationList linksOut = buildLinkLocationList(detail.getLinksOut());
write(detail.getId(), linksOut, pageLinkOutWriter) ;
DbIntList redirectSources = buildIntList(detail.getRedirects()) ;
write(detail.getId(),redirectSources, redirectSourcesByTargetWriter) ;
DbIntList sentenceSplits = buildIntList(detail.getSentenceSplits()) ;
write(detail.getId(),sentenceSplits, sentenceSplitsWriter) ;
DbLabelForPageList labels = buildLabelList(detail, primaryLabels) ;
write(detail.getId(), labels, pageLabelWriter) ;
} else {
//this is a redirect
redirectTargetsBySourceWriter.write(detail.getId() + "," + detail.getRedirectsTo().getId() + "\n");
}
} else if (detail.getNamespace() == SiteInfo.CATEGORY_KEY) {
if (detail.getRedirectsTo() == null) {
DbIntList categoryParents = buildIntList(detail.getParentCategories()) ;
write(detail.getId(),categoryParents, categoryParentsWriter) ;
DbIntList childArticles = buildIntList(detail.getChildArticles()) ;
write(detail.getId(),childArticles, childArticlesWriter) ;
DbIntList childCategories = buildIntList(detail.getChildCategories()) ;
write(detail.getId(),childCategories, childCategoriesWriter) ;
} else {
//TODO: oops, no clean way of dealing with category redirects
}
}
}
pageWriter.close() ;
articleParentsWriter.close();
categoryParentsWriter.close() ;
childArticlesWriter.close() ;
childCategoriesWriter.close() ;
pageLinkInWriter.close();
pageLinkOutWriter.close();
redirectSourcesByTargetWriter.close() ;
redirectTargetsBySourceWriter.close() ;
sentenceSplitsWriter.close();
pageLabelWriter.close();
}
public void finalizeLabelStuff() throws IOException {
BufferedWriter labelWriter = createWriter("label.csv") ;
Path labelSensesPath = getMainAvroResultPath(labelSensesStep) ;
SeekableInput labelSensesInput = new FsInput(labelSensesPath, new Configuration());
Schema labelSensesSchema = Pair.getPairSchema(Schema.create(Type.STRING),LabelSenseList.getClassSchema()) ;
DatumReader<Pair<CharSequence,LabelSenseList>> labelSensesDatumReader = new SpecificDatumReader<Pair<CharSequence,LabelSenseList>>(labelSensesSchema);
FileReader<Pair<CharSequence,LabelSenseList>> labelSensesReader = DataFileReader.openReader(labelSensesInput, labelSensesDatumReader) ;
Path labelOccurrencesPath = getMainAvroResultPath(labelOccurrenceStep) ;
SeekableInput labelOccurrencesInput = new FsInput(labelOccurrencesPath, new Configuration());
Schema labelOccurrencesSchema = Pair.getPairSchema(Schema.create(Type.STRING),LabelOccurrences.getClassSchema()) ;
DatumReader<Pair<CharSequence,LabelOccurrences>> labelOccurrencesDatumReader = new SpecificDatumReader<Pair<CharSequence,LabelOccurrences>>(labelOccurrencesSchema);
FileReader<Pair<CharSequence,LabelOccurrences>> labelOccurrencesReader = DataFileReader.openReader(labelOccurrencesInput, labelOccurrencesDatumReader) ;
Pair<CharSequence,LabelSenseList> sensesPair = null ;
Pair<CharSequence,LabelOccurrences> occurrencesPair = null ;
while (labelSensesReader.hasNext()) {
sensesPair = labelSensesReader.next();
CharSequence label = sensesPair.key() ;
LabelSenseList senses = sensesPair.value() ;
while ((occurrencesPair == null || labelTextComparator.compare(occurrencesPair.key(), sensesPair.key()) < 0 ) && labelOccurrencesReader.hasNext())
occurrencesPair = labelOccurrencesReader.next();
LabelOccurrences occurrences = null ;
if (labelTextComparator.compare(occurrencesPair.key(), sensesPair.key()) == 0)
occurrences = occurrencesPair.value() ;
//now we definitely have a label and list of senses. If we have occurrences, then they are synchronised with label
ArrayList<DbSenseForLabel> dbSenses = new ArrayList<DbSenseForLabel>() ;
for (LabelSense sense:senses.getSenses()) {
DbSenseForLabel dbSense = new DbSenseForLabel() ;
dbSense.setId(sense.getId());
dbSense.setLinkDocCount(sense.getDocCount());
dbSense.setLinkOccCount(sense.getOccCount());
dbSense.setFromRedirect(sense.getFromRedirect());
dbSense.setFromTitle(sense.getFromTitle());
dbSenses.add(dbSense) ;
}
DbLabel dbLabel = new DbLabel() ;
dbLabel.setSenses(dbSenses);
if (occurrences != null) {
dbLabel.setLinkDocCount(occurrences.getLinkDocCount());
dbLabel.setLinkOccCount(occurrences.getLinkOccCount());
dbLabel.setTextDocCount(occurrences.getTextDocCount());
dbLabel.setTextOccCount(occurrences.getTextOccCount());
}
write(label, dbLabel, labelWriter) ;
}
labelWriter.close();
}
private void write(Integer id, Record record, BufferedWriter writer) throws IOException {
ByteArrayOutputStream outStream = new ByteArrayOutputStream() ;
CsvRecordOutput cro = new CsvRecordOutput(outStream) ;
cro.writeInt(id, "pageId") ;
record.serialize(cro) ;
writer.write(outStream.toString("UTF-8")) ;
}
private void write(CharSequence label, Record record, BufferedWriter writer) throws IOException {
ByteArrayOutputStream outStream = new ByteArrayOutputStream() ;
CsvRecordOutput cro = new CsvRecordOutput(outStream) ;
cro.writeString(label.toString(), "labelText");
record.serialize(cro) ;
writer.write(outStream.toString("UTF-8")) ;
}
private DbPage buildPage(PageDetail detail, PageDepthSummary depth) {
DbPage dbPage = new DbPage() ;
dbPage.setType(getType(detail).ordinal());
dbPage.setTitle(detail.getTitle().toString());
if (depth != null && depth.getDepth() != null)
dbPage.setDepth(depth.getDepth());
else
dbPage.setDepth(-1);
return dbPage ;
}
private DbIntList buildIntList(List<PageSummary> summaries) {
ArrayList<Integer> ids = new ArrayList<Integer>() ;
for (PageSummary summary:summaries)
ids.add(summary.getId()) ;
return new DbIntList(ids) ;
}
private DbIntList buildIntList(Collection<Integer> values) {
ArrayList<Integer> ints = new ArrayList<Integer>() ;
for (Integer value:values)
ints.add(value) ;
return new DbIntList(ints) ;
}
private DbLabelForPageList buildLabelList(PageDetail page, Set<CharSequence> primaryLabels) {
ArrayList<DbLabelForPage> dbLabels = new ArrayList<DbLabelForPage>() ;
Set<CharSequence> redirectTitles = new HashSet<CharSequence>() ;
for (PageSummary redirect:page.getRedirects())
redirectTitles.add(redirect.getTitle()) ;
for (Map.Entry<CharSequence, LabelSummary>e:page.getLabels().entrySet()) {
CharSequence text = e.getKey() ;
LabelSummary detail = e.getValue() ;
DbLabelForPage label = new DbLabelForPage() ;
label.setText(text.toString());
label.setLinkDocCount(detail.getDocCount());
label.setLinkOccCount(detail.getOccCount());
label.setFromRedirect(redirectTitles.contains(text));
label.setFromTitle(page.getTitle().equals(text));
label.setIsPrimary(primaryLabels.contains(text));
dbLabels.add(label) ;
}
Collections.sort(dbLabels, labelComparator) ;
return new DbLabelForPageList(dbLabels) ;
}
private DbLinkLocationList buildLinkLocationList(List<LinkSummary> summaries) {
ArrayList<DbLinkLocation> links = new ArrayList<DbLinkLocation>() ;
for (LinkSummary summary:summaries) {
DbLinkLocation link = new DbLinkLocation() ;
link.setLinkId(summary.getId());
ArrayList<Integer> sentenceIndexes = new ArrayList<Integer>() ;
sentenceIndexes.addAll(summary.getSentenceIndexes()) ;
link.setSentenceIndexes(sentenceIndexes);
links.add(link) ;
}
return new DbLinkLocationList(links) ;
}
private PageType getType(PageDetail detail) {
if (detail.getNamespace() == SiteInfo.MAIN_KEY) {
if (detail.getRedirectsTo() == null) {
if (detail.getIsDisambiguation())
return PageType.disambiguation ;
else
return PageType.article ;
} else {
return PageType.redirect ;
}
} else if (detail.getNamespace() == SiteInfo.CATEGORY_KEY) {
if (detail.getRedirectsTo() == null) {
return PageType.category ;
} else {
//TODO: oops, we don't have a good way to deal with redirects of categories
return PageType.invalid ;
}
} else if (detail.getNamespace() == SiteInfo.TEMPLATE_KEY) {
return PageType.template ;
} else {
return PageType.invalid ;
}
}
private BufferedWriter createWriter(String fileName) throws IOException {
FileSystem fs = getDir().getFileSystem(new Configuration()) ;
FSDataOutputStream stream = fs.create(new Path(getDir() + Path.SEPARATOR + fileName)) ;
OutputStreamWriter streamWriter = new OutputStreamWriter(stream) ;
return new BufferedWriter(streamWriter) ;
}
private Path getMainAvroResultPath(Step step) throws IOException {
FileSystem fs = step.getDir().getFileSystem(new Configuration()) ;
FileStatus[] fileStatuses = fs.listStatus(step.getDir(), new PathFilter() {
public boolean accept(Path path) {
return path.getName().startsWith("part-") ;
}
}) ;
if (fileStatuses.length == 0)
throw new IOException("Could not locate main result file in " + step.getDir()) ;
if (fileStatuses.length > 1)
throw new IOException("Too many result files (so too many reducers) in " + step.getDir()) ;
return fileStatuses[0].getPath() ;
}
}