package joshua.corpus.suffix_array; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import joshua.corpus.Corpus; import joshua.corpus.Phrase; import joshua.corpus.vocab.SymbolTable; import joshua.util.IntegerPair; import joshua.util.Pair; public class PhrasePairCollocations { private final Map<Pair<Phrase,Phrase>,List<IntegerPair>> phrasePairCollocations = new HashMap<Pair<Phrase,Phrase>,List<IntegerPair>>(); private final Corpus corpus; public PhrasePairCollocations(Corpus corpus) { this.corpus = corpus; } void record(Phrase phrase1, Phrase phrase2, int position1, int position2) { Pair<Phrase,Phrase> phrasePair = new Pair<Phrase,Phrase>(phrase1,phrase2); if (! phrasePairCollocations.containsKey(phrasePair)) { phrasePairCollocations.put(phrasePair, new ArrayList<IntegerPair>()); } List<IntegerPair> locations = phrasePairCollocations.get(phrasePair); locations.add(new IntegerPair(position1, position2)); } List<HierarchicalPhrases> getHierarchicalPhrases() { SymbolTable vocab = corpus.getVocabulary(); int X = vocab.addNonterminal("X"); List<HierarchicalPhrases> result = new ArrayList<HierarchicalPhrases>(); for (Map.Entry<Pair<Phrase,Phrase>,List<IntegerPair>> entry : phrasePairCollocations.entrySet()) { Pair<Phrase,Phrase> phrasePair = entry.getKey(); int[] phrase1 = phrasePair.first.getWordIDs(); int[] phrase2 = phrasePair.second.getWordIDs(); int[] phrasePairTokens = new int[phrase1.length+1+phrase2.length]; { for (int index=0; index<phrase1.length; index++) { phrasePairTokens[index] = phrase1[index]; } phrasePairTokens[phrase1.length] = X; for (int start=phrase1.length+1, index=start, end=start+phrase2.length; index<end; index++) { phrasePairTokens[index] = phrase2[index - start]; } } List<IntegerPair> locations = entry.getValue(); int[] startPositions = new int[locations.size()*2]; { int index=0; for (IntegerPair location : locations) { startPositions[index++] = location.first; startPositions[index++] = location.second; } } int[] sentenceNumbers = new int[locations.size()]; { int index=0; for (IntegerPair location : locations) { sentenceNumbers[index++] = corpus.getSentenceIndex(location.first); } } Pattern hierarchicalPhrase = new Pattern(vocab, phrasePairTokens); HierarchicalPhrases phraseLocations = new HierarchicalPhrases(hierarchicalPhrase,startPositions,sentenceNumbers); result.add(phraseLocations); } return result; } }