package com.yahoo.glimmer.indexing.generator;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import com.yahoo.glimmer.indexing.RDFDocument;
import com.yahoo.glimmer.indexing.RDFDocumentFactory;
import com.yahoo.glimmer.indexing.generator.TermValue.Type;
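/**
 * Hadoop mapper that turns each RDF record into (TermKey, TermValue) pairs:
 * an OCCURRENCE value for every term position, a TERM_STATS value per
 * distinct term and document, a DOC_SIZE value per field for HORIZONTAL
 * indices, and an ALIGNMENT_INDEX entry per term/predicate pair for
 * VERTICAL indices.
 *
 * <p>A driver would typically wire it up roughly as below (a sketch; the
 * surrounding job setup is an assumption, not part of this class):
 * <pre>
 * job.setMapperClass(DocumentMapper.class);
 * job.setMapOutputKeyClass(TermKey.class);
 * job.setMapOutputValueClass(TermValue.class);
 * </pre>
 */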
public class DocumentMapper extends Mapper<LongWritable, Text, TermKey, TermValue> {
private static final Log LOG = LogFactory.getLog(DocumentMapper.class);
    public static final int ALIGNMENT_INDEX = -1; // Special index id for alignment records.
enum Counters {
FAILED_PARSING, INDEXED_OCCURRENCES, NUMBER_OF_RECORDS
}
    // Field (predicate) names, one per index, read from the job configuration.
    private String[] fields;
    // Reusable document instance, re-populated for each input record.
    private RDFDocument doc;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Load the per-index field names and build a reusable document
        // instance from the factory configured for this job.
        fields = RDFDocumentFactory.getFieldsFromConf(conf);
        doc = RDFDocumentFactory.buildFactory(conf).getDocument();
    }
@Override
public void map(LongWritable key, Text record, Context context) throws IOException, InterruptedException {
        doc.setContent(record.getBytes(), record.getLength());
        if (doc.getSubject() == null) {
            // Parsing failed: count the record and skip it. Note that doc is
            // never null here; it is created in setup().
            context.getCounter(Counters.FAILED_PARSING).increment(1);
            LOG.error("Document failed parsing");
            return;
        }
if (doc.getId() < 0) {
throw new IllegalStateException("Negative docId:" + doc.getId() + " subject:" + doc.getSubject());
}
        // Records, per term, the position of the last occurrence, and is used
        // to test whether the term's stats record has already been written.
Map<String, DocStat> termToDocStatMap = new HashMap<String, DocStat>();
// Iterate over all indices
for (int indexId = 0; indexId < fields.length; indexId++) {
String fieldName = fields[indexId];
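            // Fields whose names are prefixed with NOINDEX are skipped entirely.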
if (fieldName.startsWith("NOINDEX")) {
continue;
}
TermValue indexIdValue = new TermValue(Type.INDEX_ID, indexId);
            // Iterate over the words of this index's content.
MutableString term = new MutableString("");
MutableString nonWord = new MutableString("");
WordReader termReader = doc.content(indexId);
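            // Word position within this field, used for the emitted occurrence records.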
int position = 0;
while (termReader.next(term, nonWord)) {
                // Skip empty terms. The term MutableString is never null here;
                // WordReader.next() fills the instance passed in.
                if (term.length() > 0) {
String termString = term.toString();
                    // Report progress.
                    context.setStatus(fieldName + "=" + term.substring(0, Math.min(term.length(), 50)));
// Create an occurrence at the next position
TermValue occurrenceValue = new TermValue(Type.OCCURRENCE, doc.getId(), position);
context.write(new TermKey(termString, indexId, occurrenceValue), occurrenceValue);
DocStat docStat = termToDocStatMap.get(termString);
if (docStat == null) {
if (doc.getIndexType() == RDFDocumentFactory.IndexType.VERTICAL) {
                            // For the alignment index, write the predicate id
                            // (equal to the index id for a VERTICAL index) the
                            // first time we encounter a term. The alignment
                            // index has no counts or positions; the query
                            // parser uses it for optimization, essentially as
                            // a map from each term to the predicates it
                            // occurs in.
context.write(new TermKey(termString, ALIGNMENT_INDEX, indexIdValue), indexIdValue);
}
docStat = new DocStat();
docStat.last = position;
docStat.count = 1;
termToDocStatMap.put(termString, docStat);
} else {
docStat.last = position;
docStat.count++;
}
position++;
context.getCounter(Counters.INDEXED_OCCURRENCES).increment(1);
                } else {
                    LOG.info("Read an empty term");
                }
}
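            // For a HORIZONTAL index, emit one document-size record per field;
            // the final position count equals the number of words read from
            // this field.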
if (doc.getIndexType() == RDFDocumentFactory.IndexType.HORIZONTAL && position > 0) {
TermValue docSizeValue = new TermValue(Type.DOC_SIZE, doc.getId(), position);
context.write(new TermKey(TermKey.DOC_SIZE_TERM, indexId, docSizeValue), docSizeValue);
}
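            // Flush the per-term statistics gathered above: occurrence count
            // and the position of the last occurrence, one TERM_STATS record
            // per distinct term.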
            for (Map.Entry<String, DocStat> entry : termToDocStatMap.entrySet()) {
                DocStat docStat = entry.getValue();
                TermValue occurrenceCountValue = new TermValue(Type.TERM_STATS, docStat.count, docStat.last);
                context.write(new TermKey(entry.getKey(), indexId, occurrenceCountValue), occurrenceCountValue);
            }
termToDocStatMap.clear();
}
context.getCounter(Counters.NUMBER_OF_RECORDS).increment(1);
}
    /** Per-document statistics for one term. */
    private static class DocStat {
        int last; // Position of the term's last occurrence in the document.
        int count; // Number of occurrences in the document.
    }
// For testing
String[] getFields() {
return fields;
}
void setFields(String[] fields) {
this.fields = fields;
}
RDFDocument getDoc() {
return doc;
}
void setDoc(RDFDocument doc) {
this.doc = doc;
}
}