package com.yahoo.glimmer.query; /* * Copyright (c) 2012 Yahoo! Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed under the License is * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and limitations under the License. * See accompanying LICENSE file. */ import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.di.big.mg4j.document.Document; import it.unimi.di.big.mg4j.document.DocumentCollection; import it.unimi.di.big.mg4j.index.BitStreamIndex; import it.unimi.di.big.mg4j.index.Index; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; import java.util.Map; public class SetDocumentPriors { public static final String IMPORTANT = "2"; public static final String NEUTRAL = "1"; public static final String UNIMPORTANT = "0"; public static void main(String args[]) { try { Context context = new Context(args[0]); InputStreamReader priorRulesReader = new InputStreamReader(new FileInputStream(context.getDocumentPriorsRules())); Map<String, Integer> rules = readPriorRules(priorRulesReader); RDFIndex index = new RDFIndex("", context); Index fieldIndex = (BitStreamIndex) index.getField(context.getDocumentPriorsField()); calculatePriors(fieldIndex.numberOfDocuments, index.getCollection(), rules, new FileOutputStream(context.getDocumentPriorsFile())); } catch (Exception e) { e.printStackTrace(); } } public static Map<String, Integer> readPriorRules(Reader priorRulesReader) throws NumberFormatException, IOException { HashMap<String, Integer> rules = new HashMap<String, Integer>(); BufferedReader br = new BufferedReader(priorRulesReader); String line; while ((line = br.readLine()) != null) { if (!line.trim().equals("")) { String parts[] = line.split("="); rules.put(parts[0], Integer.parseInt(parts[1])); } } return rules; } public static void calculatePriors(long numberOfDocuments, DocumentCollection collection, Map<String, Integer> hostToWeightMap, OutputStream documetPriorsOutputStream) { HashMap<Integer, Integer> priors = new HashMap<Integer, Integer>(); Document d; try { for (int i = 0; i < numberOfDocuments; i++) { d = collection.document(i); URI uri = null; String host = null; try { uri = new URI(d.title().toString()); if (uri != null) { host = uri.getHost(); } } catch (URISyntaxException use) { } if (host != null) { for (String rule : hostToWeightMap.keySet()) { if (rule.contains(host)) priors.put(i, hostToWeightMap.get(rule)); } } d.close(); } System.out.print("Serializing priors..."); BinIO.storeObject(priors, documetPriorsOutputStream); System.out.println("done"); } catch (IOException e) { e.printStackTrace(); } } }