package edu.umd.hooka;

import org.apache.hadoop.fs.Path;

/**
 * Describes where the files belonging to a named bitext corpus live.
 *
 * <p>Each concrete subclass supplies a base directory and a base file name;
 * every other path exposed by this class is derived from those two strings by
 * appending a fixed suffix.
 */
public abstract class CorpusInfo {

  /** Identifiers of the corpora that {@link #getCorpus(Corpus)} can resolve. */
  public enum Corpus {
    HANSARDS,
    ARABIC_SMALL,
    ARABIC_10k,
    ARABIC_50k,
    ARABIC_150k,
    ARABIC_500k,
    ARABIC_1000k,
    ARABIC_1500k,
    ARABIC_5000k,
    ARABIC_LARGE,
    CZECH_WMT08,
    GERMAN_TINY
  }

  /**
   * Creates the descriptor for the requested corpus.
   *
   * @param corpus the corpus identifier
   * @return a new descriptor instance, or {@code null} if no case matches
   */
  public static CorpusInfo getCorpus(Corpus corpus) {
    switch (corpus) {
      case HANSARDS:
        return new Hansards();
      case CZECH_WMT08:
        return new CzechWMT08();
      case ARABIC_SMALL:
        return new ArabicSmall();
      case ARABIC_10k:
        return new Arabic10k();
      case ARABIC_50k:
        return new Arabic50k();
      case ARABIC_150k:
        return new Arabic150k();
      case ARABIC_500k:
        return new Arabic500k();
      case ARABIC_1000k:
        return new Arabic1000k();
      case ARABIC_1500k:
        return new Arabic1500k();
      case ARABIC_LARGE:
      case ARABIC_5000k:
        // Both identifiers resolve to the same descriptor.
        return new ArabicLarge();
      case GERMAN_TINY:
        return new GermanTiny();
    }
    // Only reachable for a constant that has no case above.
    return null;
  }

  /** Returns the directory that holds this corpus's files. */
  protected abstract String getBasePath();

  /** Returns the file-name stem shared by this corpus's files. */
  protected abstract String getBaseName();

  /** Returns {@code <basePath>/<baseName>.bitext}. */
  public Path getBitext() {
    String dir = getBasePath();
    String file = getBaseName() + ".bitext";
    return new Path(dir + Path.SEPARATOR + file);
  }

  /** Returns {@code <basePath>/<baseName>.bitext-aligned}. */
  public Path getAlignedBitext() {
    String dir = getBasePath();
    String file = getBaseName() + ".bitext-aligned";
    return new Path(dir + Path.SEPARATOR + file);
  }

  /** Returns the path from {@link #getLocalTTable()} placed under the base directory. */
  public Path getCanonicalTTable() {
    String dir = getBasePath();
    Path local = getLocalTTable();
    return new Path(dir + Path.SEPARATOR + local);
  }

  /**
   * Returns the path from {@link #getLocalTTable(String)} placed under the base directory.
   *
   * @param type suffix appended after {@code ".ttable-"}
   */
  public Path getCanonicalTTable(String type) {
    String dir = getBasePath();
    Path local = getLocalTTable(type);
    return new Path(dir + Path.SEPARATOR + local);
  }

  /** Returns the relative path {@code <baseName>.ttable}. */
  public Path getLocalTTable() {
    String file = getBaseName() + ".ttable";
    return new Path(file);
  }

  /**
   * Returns the relative path {@code <baseName>.ttable-<type>}.
   *
   * @param type suffix appended after {@code ".ttable-"}
   */
  public Path getLocalTTable(String type) {
    String file = getBaseName() + ".ttable-" + type;
    return new Path(file);
  }

  /** Returns the relative path {@code <baseName>.atable}. */
  public Path getLocalATable() {
    String file = getBaseName() + ".atable";
    return new Path(file);
  }

  /** Returns the relative path {@code <baseName>.ptable}. */
  public Path getLocalPhraseTable() {
    String file = getBaseName() + ".ptable";
    return new Path(file);
  }

  /** Returns {@code <basePath>/<baseName>.test}. */
  public Path getTestSubset() {
    String dir = getBasePath();
    String file = getBaseName() + ".test";
    return new Path(dir + Path.SEPARATOR + file);
  }

  /** Descriptor returned for {@link Corpus#HANSARDS}. */
  static class Hansards extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/hansards.fr-en";
    }

    @Override
    public String getBaseName() {
      return "hansards.aachen";
    }
  }

  /** Descriptor returned for {@link Corpus#CZECH_WMT08}. */
  static class CzechWMT08 extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/cs-en.wmt08";
    }

    @Override
    public String getBaseName() {
      return "cs-en";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_SMALL}. */
  static class ArabicSmall extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/small.ar-en.ldc";
    }

    @Override
    public String getBaseName() {
      return "small.ar-en";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_LARGE} and {@link Corpus#ARABIC_5000k}. */
  static class ArabicLarge extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/large.ar-en.ldc";
    }

    @Override
    public String getBaseName() {
      return "large.ar-en";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_10k}. */
  static class Arabic10k extends CorpusInfo {
    @Override
    public String getBasePath() {
      // NOTE(review): the directory ends in "10k2" while the base name uses "10k";
      // looks deliberate but confirm against the actual file layout.
      return "/shared/bitexts/ar-en.ldc.10k2";
    }

    @Override
    public String getBaseName() {
      return "ar-en.10k";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_50k}. */
  static class Arabic50k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.50k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.50k";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_150k}. */
  static class Arabic150k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.150k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.150k";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_500k}. */
  static class Arabic500k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.500k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.500k";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_1500k}. */
  static class Arabic1500k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.1500k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.1500k";
    }
  }

  /** Descriptor returned for {@link Corpus#ARABIC_1000k}. */
  static class Arabic1000k extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/ar-en.ldc.1000k";
    }

    @Override
    public String getBaseName() {
      return "ar-en.1000k";
    }
  }

  /** Descriptor returned for {@link Corpus#GERMAN_TINY}. */
  static class GermanTiny extends CorpusInfo {
    @Override
    public String getBasePath() {
      return "/shared/bitexts/tiny.de-en";
    }

    @Override
    public String getBaseName() {
      return "tiny-deen";
    }
  }
}