/******************************************************************************* * Copyright 2013 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.csniper.resbuild.stuff; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createAggregateDescription; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createPrimitiveDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createCollectionReader; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.collection.CollectionReader; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.fit.pipeline.SimplePipeline; import de.tudarmstadt.ukp.csniper.resbuild.ProgressLogger; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod; import de.tudarmstadt.ukp.dkpro.core.io.bincas.SerializedCasWriter; import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser; /** * @author Erik-Lân Do Dinh */ public class BncLocalCorpusBuilder { private static final String COLLECTION_ID = "BNC"; private static final String HADOOP_USER_HOME = "D:/ukp/data"; private static final String INPUT_BNC_PATH = "jar:file:" + HADOOP_USER_HOME + "/BNC.zip!"; private static final String INCLUSION_FILE = HADOOP_USER_HOME + "/inclusions.txt"; private static final String EXCLUSION_FILE = HADOOP_USER_HOME + "/exclusions.txt"; // $dir will automatically be replaced by the user home dir on the hdfs private static final String OUTPUT_SER_CAS_PATH = HADOOP_USER_HOME + "/output2/" + COLLECTION_ID + "/serialized/"; private static final String OUTPUT_CSV_PATH = HADOOP_USER_HOME + "/output2/" + COLLECTION_ID + "/csv/"; private static final String CLASS_NAME = BncLocalCorpusBuilder.class.getSimpleName(); public static CollectionReader buildCollectionReader() throws ResourceInitializationException { List<String> patterns = new ArrayList<String>(); try { patterns.addAll(read(INCLUSION_FILE, "[+]**/")); System.out.println("Including documents specified in [" + INCLUSION_FILE + "]."); } catch (IOException e) { patterns.add("[+]**/*.xml"); System.out.println("No inclusions specified, parsing all BNC documents."); } try { patterns.addAll(read(EXCLUSION_FILE, "[-]**/")); System.out.println("Excluding documents specified in [" + EXCLUSION_FILE + "]."); } catch (IOException e) { System.out.println("No exclusions specified, parsing all specified BNC documents."); } CollectionReader reader = createCollectionReader(BncReaderReloaded.class, BncReaderReloaded.PARAM_PATH, INPUT_BNC_PATH, BncReaderReloaded.PARAM_PATTERNS, patterns.toArray(new String[0]), BncReaderReloaded.PARAM_LANGUAGE, "en"); return reader; } private static List<String> read(String aFile, String aPatternPrefix) throws IOException { List<String> patterns = new ArrayList<String>(); for (String s : FileUtils.readLines(new File(aFile), "UTF-8")) { patterns.add(aPatternPrefix + s); } return patterns; } public static AnalysisEngineDescription buildMapperEngine() throws ResourceInitializationException { // rename collectionId to BNC (from the path where BNC is located) AnalysisEngineDescription rn = createPrimitiveDescription(Renamer.class, Renamer.PARAM_COLLECTION_ID, COLLECTION_ID); // parse AnalysisEngineDescription sp = createPrimitiveDescription(StanfordParser.class, StanfordParser.PARAM_WRITE_PENN_TREE, true, StanfordParser.PARAM_LANGUAGE, "en", StanfordParser.PARAM_VARIANT, "factored", StanfordParser.PARAM_QUOTE_BEGIN, new String[] { "‘" }, StanfordParser.PARAM_QUOTE_END, new String[] { "’" }); // output as serialized cas AnalysisEngineDescription scw = createPrimitiveDescription(SerializedCasWriter.class, SerializedCasWriter.PARAM_COMPRESSION, CompressionMethod.XZ, SerializedCasWriter.PARAM_TARGET_LOCATION, OUTPUT_SER_CAS_PATH, SerializedCasWriter.PARAM_STRIP_EXTENSION, true); // output as csv for fast db import AnalysisEngineDescription csvw = createPrimitiveDescription(PennTreesToCsvWriter.class, PennTreesToCsvWriter.PARAM_PATH, OUTPUT_CSV_PATH); AnalysisEngineDescription log = createPrimitiveDescription(ProgressLogger.class, ProgressLogger.PARAM_BRIEF_OUTPUT, true); return createAggregateDescription(rn, sp, scw, csvw, log); } public static void main(String[] args) { try { SimplePipeline.runPipeline(buildCollectionReader(), buildMapperEngine()); } catch (Exception e) { e.printStackTrace(); } } }