/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.frontier.precedence; import static org.archive.modules.CoreAttributeConstants.A_PRECALC_PRECEDENCE; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import org.archive.modules.recrawl.PersistProcessor; import org.archive.util.ArchiveUtils; import org.archive.util.FileUtils; import org.archive.util.bdbje.EnhancedEnvironment; import org.archive.util.iterator.LineReadingIterator; import com.sleepycat.bind.serial.SerialBinding; import com.sleepycat.bind.serial.StoredClassCatalog; import com.sleepycat.bind.tuple.StringBinding; import com.sleepycat.collections.StoredSortedMap; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseException; /** * Utility class for loading externally-created URI-precedence values * into the URI-history database. * * TODO: refactor code relied on in PersistProcessor for easier reuse here * (and elsewhere) * * @author gojomo */ public class PrecedenceLoader { public PrecedenceLoader() { } /** * Utility main for importing a text file (first argument) with lines of * the form: * * URI [whitespace] precedence * * into a BDB-JE environment (second argument, created if necessary). * * @param args command-line arguments * @throws DatabaseException * @throws IOException */ public static void main(String[] args) throws DatabaseException, IOException { if(args.length==2) { main2args(args); } else { System.out.println("Arguments: "); System.out.println(" source target"); System.out.println( "...where source is a file of lines 'URI precedence' "); System.out.println( "and target is a BDB env dir (created if necessary). "); return; } } /** * Merge the precalculated precedence information in the first argument * file to the environment in the second environment (path; environment * will be created if it does not already exist). * * @param args command-line arguments * @throws DatabaseException * @throws FileNotFoundException * @throws UnsupportedEncodingException * @throws IOException */ private static void main2args(String[] args) throws DatabaseException, FileNotFoundException, UnsupportedEncodingException, IOException { File source = new File(args[0]); File env = new File(args[1]); FileUtils.ensureWriteableDirectory(env); // setup target environment EnhancedEnvironment targetEnv = PersistProcessor.setupCopyEnvironment(env); StoredClassCatalog classCatalog = targetEnv.getClassCatalog(); Database historyDB = targetEnv.openDatabase( null, PersistProcessor.URI_HISTORY_DBNAME, PersistProcessor.HISTORY_DB_CONFIG.toDatabaseConfig()); @SuppressWarnings({ "rawtypes", "unchecked" }) StoredSortedMap<String, Object> historyMap = new StoredSortedMap<String, Object>(historyDB, new StringBinding(), new SerialBinding(classCatalog, Map.class), true); int count = 0; if(source.isFile()) { // scan log, writing to database BufferedReader br = ArchiveUtils.getBufferedReader(source); Iterator<String> iter = new LineReadingIterator(br); while(iter.hasNext()) { String line = (String) iter.next(); String[] splits = line.split("\\s"); String uri = splits[0]; if(!uri.matches("\\w+:.*")) { // prepend "http://" uri = "http://"+uri; } String key = PersistProcessor.persistKeyFor(uri); int precedence = Integer.parseInt(splits[1]); @SuppressWarnings("unchecked") Map<String, Object> map = (Map<String, Object>)historyMap.get(key); if (map==null) { map = new HashMap<String, Object>(); } map.put(A_PRECALC_PRECEDENCE, precedence); historyMap.put(key,map); count++; if(count % 100000 == 0) { System.out.print(count+"... "); } } br.close(); System.out.println(); System.out.println(count+" entries loaded"); } else { // error System.err.println("unacceptable source file"); return; } // cleanup historyDB.sync(); historyDB.close(); targetEnv.close(); System.out.println(count+" records imported from "+source+" to BDB env "+env); } }