/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.collection.trecweb;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Preconditions;
import edu.umd.cloud9.collection.DocnoMapping;
/**
 * Bidirectional mapping between string docids of the form {@code GX243-38-13543987}
 * ({@code GX<dir>-<subdir>-<num>}) and sequentially-numbered integer docnos that start at 1.
 *
 * <p>Documents are grouped into buckets indexed by {@code dir * 100 + subdir}. For each bucket the
 * mapping stores the position in the mapping file of the bucket's first record and the
 * numerically-sorted array of document numbers in that bucket. A docno is
 * {@code offset + rank within the sorted bucket + 1}.
 *
 * <p>{@link #loadMapping(Path, FileSystem)} must be called before any lookup; lookups on an
 * unloaded instance fail with a {@link NullPointerException}.
 *
 * <p>NOTE: the shared {@link DecimalFormat} instances used by {@link #getDocid(int)} are not
 * synchronized, so concurrent calls to that method need external synchronization.
 */
public class Gov2DocnoMapping implements DocnoMapping {
  /** Number of buckets: directory numbers 0-272, each with subdirectory numbers 0-99. */
  private static final int NUM_BUCKETS = 273 * 100;

  private static final NumberFormat FormatW2 = new DecimalFormat("00");
  private static final NumberFormat FormatW3 = new DecimalFormat("000");
  private static final NumberFormat FormatW7 = new DecimalFormat("0000000");
  private static final NumberFormat FormatW8 = new DecimalFormat("00000000");

  /** Per bucket: sorted document numbers, or {@code null} if the bucket holds no documents. */
  private int[][] docids;

  /** Per bucket: index in the mapping file of the bucket's first record, or -1 if empty. */
  private int[] offsets;

  public Gov2DocnoMapping() {}

  /**
   * Returns the docno for a docid.
   *
   * @param docid docid of the form {@code GX243-38-13543987}; must not be {@code null}
   * @return the docno (1 or greater), or -1 if the docid is well-formed but not in the mapping
   * @throws NumberFormatException if the directory, subdirectory or number part is not an integer
   * @throws StringIndexOutOfBoundsException if the docid is too short to contain all three parts
   */
  @Override
  public int getDocno(String docid) {
    Preconditions.checkNotNull(docid);

    int dirNum = Integer.parseInt(docid.substring(2, 5));
    int subdirNum = Integer.parseInt(docid.substring(6, 8));
    int num = Integer.parseInt(docid.substring(9));

    int bucket = dirNum * 100 + subdirNum;
    // An out-of-range or unpopulated bucket cannot contain the document.
    if (bucket < 0 || bucket >= docids.length || docids[bucket] == null) {
      return -1;
    }

    // binarySearch returns a negative value when the key is absent; without this check that
    // value would be folded into the result and produce a bogus docno.
    int pos = Arrays.binarySearch(docids[bucket], num);
    if (pos < 0) {
      return -1;
    }

    return offsets[bucket] + pos + 1;
  }

  /**
   * Returns the docid for a docno.
   *
   * <p>The owning bucket is located with a linear scan over the offsets table, which relies on
   * the offsets of populated buckets increasing with the bucket index (presumably guaranteed by
   * the mapping file being ordered by directory and subdirectory; verify against the builder).
   *
   * @param docno docno to look up (docnos start at 1)
   * @return the docid, formatted as {@code GX<3-digit dir>-<2-digit subdir>-<7 or 8-digit num>}
   * @throws ArrayIndexOutOfBoundsException if the docno is outside the range of loaded documents
   */
  @Override
  public String getDocid(int docno) {
    // Docnos are 1-based, whereas the stored offsets are 0-based.
    docno--;

    // Find the first bucket that starts after the requested position...
    int i = 0;
    for (i = 0; i < docids.length; i++) {
      if (docno < offsets[i]) {
        break;
      }
    }

    // ...then step back to the closest populated bucket before it (empty buckets have offset -1).
    i--;
    while (offsets[i] == -1) {
      i--;
    }

    int subdirNum = i % 100;
    int dirNum = (i - subdirNum) / 100;
    int num = docids[i][docno - offsets[i]];

    // Numbers are padded to seven digits unless they already need eight.
    NumberFormat numFormat = (num >= 10000000) ? FormatW8 : FormatW7;
    return "GX" + FormatW3.format(dirNum) + "-" + FormatW2.format(subdirNum) + "-"
        + numFormat.format(num);
  }

  /**
   * Loads the mapping from a file that holds an int record count followed by that many
   * UTF-encoded docids (as read by {@code readInt} and {@code readUTF}).
   *
   * <p>Consecutive records that share a directory and subdirectory form one bucket. The new
   * tables are installed only after the whole file has been read, so a failed load leaves any
   * previously loaded mapping untouched. The input stream is closed in all cases.
   *
   * @param p path of the mapping file
   * @param fs file system used to open {@code p}
   * @throws IOException if the file cannot be opened or read
   */
  @Override
  public void loadMapping(Path p, FileSystem fs) throws IOException {
    FSDataInputStream in = fs.open(p);

    int[][] loadedDocids = new int[NUM_BUCKETS][];
    int[] loadedOffsets = new int[NUM_BUCKETS];
    Arrays.fill(loadedOffsets, -1);

    try {
      List<Integer> ids = null;
      int lastBucket = -1;
      int sz = in.readInt();

      for (int i = 0; i < sz; i++) {
        String docName = in.readUTF();

        // GX243-38-13543987
        int dirNum = Integer.parseInt(docName.substring(2, 5));
        int subdirNum = Integer.parseInt(docName.substring(6, 8));
        int num = Integer.parseInt(docName.substring(9));

        int curBucket = dirNum * 100 + subdirNum;
        if (curBucket != lastBucket) {
          // Bucket boundary: finish the previous bucket and start a new one at position i.
          if (ids != null) {
            loadedDocids[lastBucket] = toSortedArray(ids);
          }
          lastBucket = curBucket;
          ids = new ArrayList<Integer>();
          loadedOffsets[curBucket] = i;
        }
        ids.add(num);
      }

      // Finish the final bucket, if any record was read.
      if (ids != null) {
        loadedDocids[lastBucket] = toSortedArray(ids);
      }
    } finally {
      in.close();
    }

    docids = loadedDocids;
    offsets = loadedOffsets;
  }

  /** Copies the list into a new int array sorted in ascending order. */
  private static int[] toSortedArray(List<Integer> ids) {
    int[] idArray = new int[ids.size()];
    for (int j = 0; j < idArray.length; j++) {
      idArray[j] = ids.get(j);
    }
    Arrays.sort(idArray);
    return idArray;
  }

  @Override
  public Builder getBuilder() {
    return new TrecWebDocnoMappingBuilder();
  }

  /**
   * Simple program that provides access to the docno/docid mappings.
   *
   * @param args command ({@code getDocno} or {@code getDocid}), mapping file, and the docid or
   *     docno to look up
   * @throws IOException if the mapping file cannot be loaded
   */
  public static void main(String[] args) throws IOException {
    // Both commands read args[2], so all three arguments are required.
    if (args.length < 3) {
      System.out.println("usage: (getDocno|getDocid) [mapping-file] [docid/docno]");
      System.exit(-1);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    System.out.println("loading mapping file " + args[1]);
    Gov2DocnoMapping mapping = new Gov2DocnoMapping();
    mapping.loadMapping(new Path(args[1]), fs);

    if (args[0].equals("getDocno")) {
      System.out.println("looking up docno for \"" + args[2] + "\"");
      int idx;
      try {
        idx = mapping.getDocno(args[2]);
      } catch (RuntimeException e) {
        // A malformed docid is reported the same way as an unknown one.
        idx = -1;
      }
      if (idx > 0) {
        System.out.println(idx);
      } else {
        System.err.print("Invalid docid!");
      }
    } else if (args[0].equals("getDocid")) {
      try {
        System.out.println("looking up docid for " + args[2]);
        System.out.println(mapping.getDocid(Integer.parseInt(args[2])));
      } catch (RuntimeException e) {
        // Covers both a non-numeric argument and an out-of-range docno.
        System.err.print("Invalid docno!");
      }
    } else {
      System.out.println("Invalid command!");
      System.out.println("usage: (getDocno|getDocid) [mapping-file] [docid/docno]");
    }
  }
}