Skip to content

Commit

Permalink
Updated with memory mapped arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
pakozm committed Jul 31, 2015
1 parent a16e75d commit 360e6c0
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 48 deletions.
69 changes: 59 additions & 10 deletions packages/nnlm.corpora/c_src/nnlm_corpora.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,37 @@ using AprilUtils::constString;
////////////////////////////////////////////////////////////////////////////

namespace LanguageModels {

/// Builds a sentence view from a pair of pointers; the pointers are stored
/// as given and nothing is copied.
NNLMCorpora::Sentence::Sentence(uint32_t *first, uint32_t *last)
  : b(first),
    e(last)
{}

/// Copy constructor: the new view refers to the same pointer pair as @p src.
NNLMCorpora::Sentence::Sentence(const NNLMCorpora::Sentence &src)
  : b(src.b),
    e(src.e)
{}

const uint32_t *NNLMCorpora::Sentence::begin() const {
return b;
}

const uint32_t *NNLMCorpora::Sentence::end() const {
return e;
}

/// Number of elements between the begin and end pointers.
size_t NNLMCorpora::Sentence::size() const {
  return static_cast<size_t>(e - b);
}

/// Read-only access to the i-th element counted from the begin pointer.
/// No bounds checking is performed.
const uint32_t &NNLMCorpora::Sentence::operator[](size_t i) const {
  return *(b + i);
}

/// Copy assignment: rebinds this view to the pointer pair held by @p rhs.
NNLMCorpora::Sentence &NNLMCorpora::Sentence::operator=(const Sentence &rhs) {
  b = rhs.b;
  e = rhs.e;
  return *this;
}

/////////////////////////////////////////

NNLMCorpora::NNLMCorpora(const char *filename,
AprilUtils::LexClass *lex,
Expand All @@ -37,39 +68,57 @@ namespace LanguageModels {
ERROR_EXIT2(1, "Error reading mmapped file %s: %s\n",
filename, strerror(errno));
}
constString line, word;
// Two traversals: 1. counting sentences and words, 2. extracting tokens.
//
// 1. COUNTING SENTENCES AND WORDS
num_words = 0;
first_sentence_word.push_back(0u);
constString input = constString(file_mmapped, file_size);
constString line;
unsigned int i = 0;
while( (line = input.extract_line()) ) {
while( (word = line.extract_token("\n\r\t ")) ) ++num_words;
first_sentence_word.push_back(num_words);
}
//
// 2. EXTRACTING TOKENS
words_size = sizeof(uint32_t) * num_words;
if ((words = (uint32_t*)mmap(NULL, words_size,
PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED,
-1, 0)) == MAP_FAILED) {
ERROR_EXIT2(1, "Error creating anonymous mmap of size %lu: %s\n",
words_size, strerror(errno));
}
input = constString(file_mmapped, file_size);
char word_str[MAX_WORD_LEN + 1];
size_t i = 0;
while( (line = input.extract_line()) ) {
constString word;
sentences.emplace_back();
while( (word = line.extract_token("\n\r\t ")) ) {
strncpy(word_str, (const char *)word, word.len());
word_str[word.len()] = '\0';
uint32_t wid;
if (!lex->getWordId(word_str, wid)) wid = unk_id;
sentences[i].push_back(wid);
words[i++] = wid;
}
++i;
}
close(file_descriptor);
munmap(file_mmapped, file_size);
}

/// Unmaps the memory region that holds the word identifiers.
NNLMCorpora::~NNLMCorpora() {
  ::munmap(this->words, this->words_size);
}

const std::vector<uint32_t> &NNLMCorpora::getSentence(size_t i) const {
return sentences[i];
NNLMCorpora::Sentence NNLMCorpora::getSentence(size_t i) const {
return Sentence(words + first_sentence_word[i],
words + first_sentence_word[i+1]);
}

size_t NNLMCorpora::getSentenceLength(size_t i) const {
return sentences[i].size();
return first_sentence_word[i+1] - first_sentence_word[i];
}

size_t NNLMCorpora::getNumberOfSentences() const {
return sentences.size();
return first_sentence_word.size() - 1u;
}

size_t NNLMCorpora::getVocabSize() {
Expand Down
21 changes: 19 additions & 2 deletions packages/nnlm.corpora/c_src/nnlm_corpora.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ namespace LanguageModels {
* Once it is loaded, the class allows indexing sentences as vectors.
*/
class NNLMCorpora : public Referenced {
class Sentence; // forward declaration
public:
NNLMCorpora(const char *filename,
AprilUtils::LexClass *lex,
const uint32_t unk_id);
~NNLMCorpora();

const std::vector<uint32_t> &getSentence(size_t i) const;
NNLMCorpora::Sentence getSentence(size_t i) const;
size_t getSentenceLength(size_t i) const;
size_t getNumberOfSentences() const;
size_t getVocabSize();
Expand All @@ -28,7 +29,23 @@ namespace LanguageModels {
const char *filename;
AprilUtils::SharedPtr<AprilUtils::LexClass> lex;
const uint32_t unk_id;
std::vector< std::vector<uint32_t> > sentences;
std::vector<size_t> first_sentence_word;
size_t num_words, words_size;
uint32_t *words; // mmapped

/// Array wrapper: exposes the words delimited by a begin/end pointer pair.
/// The class declares no destructor, so it does not release the memory the
/// pointers refer to.
class Sentence {
public:
  /// Wraps the given begin and end pointers.
  Sentence(uint32_t *first, uint32_t *last);
  /// Copy construction and copy assignment.
  Sentence(const Sentence &other);
  Sentence &operator=(const Sentence &other);

  /// Read-only iteration support.
  const uint32_t *begin() const;
  const uint32_t *end() const;

  /// Length and element access.
  size_t size() const;
  const uint32_t &operator[](size_t i) const;

private:
  const uint32_t *b; ///< begin pointer
  const uint32_t *e; ///< end pointer
};
};

} // namespace LanguageModels
Expand Down
74 changes: 38 additions & 36 deletions packages/nnlm.dataset/c_src/nnlm_dataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ using Basics::TokenSparseMatrixFloat;

////////////////////////////////////////////////////////////////////////////

typedef std::vector<pair<int,int> > PairVector;

namespace LanguageModels {

// TODO: Implement in a generic way that allows ignoring context cues,
// for instance, by giving initial_word=0 and/or final_word=0
NNLMDataSetToken::NNLMDataSetToken(const size_t offset, const size_t length,
const uint32_t initial_word,
const uint32_t final_word,
Expand Down Expand Up @@ -49,34 +53,48 @@ namespace LanguageModels {
}

/// Maps a pattern index to a (sentence index, word index) pair.
///
/// The word index is relative to the start of the sentence and already
/// includes @c offset; it may be negative or reach past the sentence length.
pair<int,int> NNLMDataSetToken::getSentenceWordPair(int index) const {
  // upper_bound finds the first entry of first_word greater than index;
  // stepping back one entry gives the sentence that contains the pattern.
  auto owner = upper_bound(first_word.begin(), first_word.end(),
                           static_cast<uint32_t>(index));
  --owner;
  // position of that entry inside first_word == sentence number
  const int sent_idx = static_cast<int>(owner - first_word.begin());
  // Out-of-range word positions are left as they are here; the token builders
  // replace them with the context cues.
  const int word_idx = index - static_cast<int>(*owner) + offset;
  return pair<int,int>(sent_idx, word_idx);
}


/// Builds the sparse-matrix token for position @p i of a pattern.
///
/// One row is produced per entry of @p sentence_word_pairs. Each row holds a
/// single value 1.0 whose index is the word identifier minus one. Positions
/// before the sentence start use @c initial_word and positions at or past
/// the sentence end use @c final_word.
///
/// @param i                    displacement added to every word index.
/// @param sentence_word_pairs  (sentence index, word index) pairs.
/// @return a newly allocated token wrapping the sparse matrix.
TokenSparseMatrixFloat *NNLMDataSetToken::
buildSparseMatrixFloatToken(int i, const PairVector &sentence_word_pairs) const {
  const unsigned int rows = sentence_word_pairs.size();
  // NOTE(review): the three blocks look like CSR values / column indices /
  // row offsets — confirm against the SparseMatrixFloat constructor.
  SharedPtr<FloatGPUMirroredMemoryBlock> values = new FloatGPUMirroredMemoryBlock(rows);
  SharedPtr<Int32GPUMirroredMemoryBlock> columns = new Int32GPUMirroredMemoryBlock(rows);
  SharedPtr<Int32GPUMirroredMemoryBlock> row_ends = new Int32GPUMirroredMemoryBlock(rows+1u);
  (*row_ends)[0] = 0;
  for (int row = 0; row < static_cast<int>(rows); ++row) {
    const pair<int,int> &sw = sentence_word_pairs[row];
    auto sentence_begin = corpora->getSentence(sw.first).begin();
    const int w_pos = sw.second + i;
    // exactly one entry per row
    (*values)[row] = 1.0f;
    (*row_ends)[row+1] = row+1;
    if (w_pos < 0) {
      // before the first word of the sentence
      (*columns)[row] = initial_word - 1;
    }
    else if (w_pos < static_cast<int>(corpora->getSentenceLength(sw.first))) {
      // regular word inside the sentence
      (*columns)[row] = sentence_begin[w_pos] - 1;
    }
    else {
      // at or past the end of the sentence
      (*columns)[row] = final_word - 1;
    }
  }
  SparseMatrixFloat *matrix = new SparseMatrixFloat(rows,
                                                    static_cast<int>(lex_size),
                                                    values.get(), columns.get(),
                                                    row_ends.get());
  return new TokenSparseMatrixFloat(matrix);
}

Token *NNLMDataSetToken::getPattern(int index) {
SharedPtr<TokenBunchVector> pat = new TokenBunchVector(length);
auto sentence_word_pair = getSentenceWordPair(index);
auto s_it = corpora->getSentence(sentence_word_pair.first).begin();
for (int i=0; i<static_cast<int>(length); ++i) {
SharedPtr<FloatGPUMirroredMemoryBlock> ones = new FloatGPUMirroredMemoryBlock(1u);
SharedPtr<Int32GPUMirroredMemoryBlock> indices = new Int32GPUMirroredMemoryBlock(1u);
SharedPtr<Int32GPUMirroredMemoryBlock> first_index = new Int32GPUMirroredMemoryBlock(2u);
int w_pos = sentence_word_pair.second + i;
(*ones)[0] = 1.0f;
(*first_index)[0] = 0;
(*first_index)[1] = 1;
if (w_pos < 0) (*indices)[0] = initial_word - 1;
else if (w_pos >= static_cast<int>(corpora->getSentenceLength(sentence_word_pair.first))) (*indices)[0] = final_word - 1;
else (*indices)[0] = s_it[w_pos] - 1;
SparseMatrixFloat *mat = new SparseMatrixFloat(1,
static_cast<int>(lex_size),
ones.get(), indices.get(),
first_index.get());
Token *tk = new TokenSparseMatrixFloat(mat);
if (length == 1) return tk; // FIXME: refactor this code
Token *tk = buildSparseMatrixFloatToken(i, PairVector(1u, sentence_word_pair));
// WARNING flow break here
if (length == 1) return tk;
(*pat)[i] = tk;
}
return pat.weakRelease();
Expand All @@ -90,25 +108,9 @@ namespace LanguageModels {
}
SharedPtr<TokenBunchVector> pat = new TokenBunchVector(length);
for (int i=0; i<static_cast<int>(length); ++i) {
SharedPtr<FloatGPUMirroredMemoryBlock> ones = new FloatGPUMirroredMemoryBlock(bunch_size);
SharedPtr<Int32GPUMirroredMemoryBlock> indices = new Int32GPUMirroredMemoryBlock(bunch_size);
SharedPtr<Int32GPUMirroredMemoryBlock> first_index = new Int32GPUMirroredMemoryBlock(bunch_size+1u);
(*first_index)[0] = 0;
for (int j=0; j<static_cast<int>(bunch_size); ++j) {
auto s_it = corpora->getSentence(sentence_word_pairs[j].first).begin();
int w_pos = sentence_word_pairs[j].second + i;
(*ones)[j] = 1.0f;
(*first_index)[j+1] = j+1;
if (w_pos < 0) (*indices)[j] = initial_word - 1;
else if (w_pos >= static_cast<int>(corpora->getSentenceLength(sentence_word_pairs[j].first))) (*indices)[j] = final_word - 1;
else (*indices)[j] = s_it[w_pos] - 1;
}
SparseMatrixFloat *mat = new SparseMatrixFloat(bunch_size,
static_cast<int>(lex_size),
ones.get(), indices.get(),
first_index.get());
Token *tk = new TokenSparseMatrixFloat(mat);
if (length == 1) return tk; // FIXME: refactor this code
Token *tk = buildSparseMatrixFloatToken(i, sentence_word_pairs);
// WARNING flow break here
if (length == 1) return tk;
(*pat)[i] = tk;
}
return pat.weakRelease();
Expand Down
11 changes: 11 additions & 0 deletions packages/nnlm.dataset/c_src/nnlm_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@

namespace LanguageModels {

/**
* @brief Basics::DataSetToken for NNLMs training.
*
* This class behaves differently depending on the parameters @c offset and @c
* length . When @c length=1 every pattern is an AprilUtils::SparseMatrixFloat
* instance. When @c length>1 every pattern is a Basics::TokenBunchVector
* with @c length AprilUtils::SparseMatrixFloat instances.
*/
class NNLMDataSetToken : public Basics::DataSetToken {
public:
NNLMDataSetToken(const size_t offset, const size_t length,
Expand All @@ -33,6 +41,9 @@ namespace LanguageModels {
std::vector<uint32_t> first_word; // corpora.getNumberOfSentences() + 1

std::pair<int,int> getSentenceWordPair(int index) const;
Basics::TokenSparseMatrixFloat *
buildSparseMatrixFloatToken(int i, const std::vector<std::pair<int,int> > &
sentence_word_pairs) const;
};

} // namespace LanguageModels
Expand Down

0 comments on commit 360e6c0

Please sign in to comment.