Updated with memory mapped arrays

april-org · Jul 31, 2015 · 360e6c0 · 360e6c0
1 parent a16e75d
commit 360e6c0
Show file tree

Hide file tree

Showing 4 changed files with 127 additions and 48 deletions.
diff --git a/packages/nnlm.corpora/c_src/nnlm_corpora.cc b/packages/nnlm.corpora/c_src/nnlm_corpora.cc
@@ -13,6 +13,37 @@ using AprilUtils::constString;
 ////////////////////////////////////////////////////////////////////////////
 
 namespace LanguageModels {
+
+  NNLMCorpora::Sentence::Sentence(uint32_t *b, uint32_t *e) : b(b), e(e) {
+  }
+
+  NNLMCorpora::Sentence::Sentence(const NNLMCorpora::Sentence &other) :
+    b(other.b), e(other.e) {
+  }
+
+  const uint32_t *NNLMCorpora::Sentence::begin() const {
+    return b;
+  }
+
+  const uint32_t *NNLMCorpora::Sentence::end() const {
+    return e;
+  }
+
+  size_t NNLMCorpora::Sentence::size() const {
+    return e - b;
+  }
+
+  const uint32_t &NNLMCorpora::Sentence::operator[](size_t i) const {
+    return b[i];
+  }
+
+  NNLMCorpora::Sentence &NNLMCorpora::Sentence::operator=(const Sentence &other) {
+    this->b = other.b;
+    this->e = other.e;
+    return *this;
+  }
+
+  /////////////////////////////////////////
 
   NNLMCorpora::NNLMCorpora(const char *filename,
                            AprilUtils::LexClass *lex,
@@ -37,39 +68,57 @@ namespace LanguageModels {
       ERROR_EXIT2(1, "Error reading mmapped file %s: %s\n",
                   filename, strerror(errno));
     }
+    constString line, word;
+    // Two traversals: 1. counting sentences and words, 2. extracting tokens.
+    //
+    // 1. COUNTING SENTENCES AND WORDS
+    num_words = 0;
+    first_sentence_word.push_back(0u);
     constString input = constString(file_mmapped, file_size);
-    constString line;
-    unsigned int i = 0;
+    while( (line = input.extract_line()) ) {
+      while( (word = line.extract_token("\n\r\t ")) ) ++num_words;
+      first_sentence_word.push_back(num_words);
+    }
+    //
+    // 2. EXTRACTING TOKENS
+    words_size = sizeof(uint32_t) * num_words;
+    if ((words = (uint32_t*)mmap(NULL, words_size,
+                                 PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED,
+                                 -1, 0)) == MAP_FAILED) {
+      ERROR_EXIT2(1, "Error creating anonymous mmap of size %lu: %s\n",
+                  words_size, strerror(errno));
+    }
+    input = constString(file_mmapped, file_size);
     char word_str[MAX_WORD_LEN + 1];
+    size_t i = 0;
     while( (line = input.extract_line()) ) {
-      constString word;
-      sentences.emplace_back();
       while( (word = line.extract_token("\n\r\t ")) ) {
         strncpy(word_str, (const char *)word, word.len());
         word_str[word.len()] = '\0';
         uint32_t wid;
         if (!lex->getWordId(word_str, wid)) wid = unk_id;
-        sentences[i].push_back(wid);
+        words[i++] = wid;
       }
-      ++i;
     }
     close(file_descriptor);
     munmap(file_mmapped, file_size);
   }
 
   NNLMCorpora::~NNLMCorpora() {
+    munmap(words, words_size);
   }
 
-  const std::vector<uint32_t> &NNLMCorpora::getSentence(size_t i) const {
-    return sentences[i];
+  NNLMCorpora::Sentence NNLMCorpora::getSentence(size_t i) const {
+    return Sentence(words + first_sentence_word[i],
+                    words + first_sentence_word[i+1]);
   }
 
   size_t NNLMCorpora::getSentenceLength(size_t i) const {
-    return sentences[i].size();
+    return first_sentence_word[i+1] - first_sentence_word[i];
   }
 
   size_t NNLMCorpora::getNumberOfSentences() const {
-    return sentences.size();
+    return first_sentence_word.size() - 1u;
   }
 
   size_t NNLMCorpora::getVocabSize() {

diff --git a/packages/nnlm.corpora/c_src/nnlm_corpora.h b/packages/nnlm.corpora/c_src/nnlm_corpora.h
@@ -12,13 +12,14 @@ namespace LanguageModels {
    * Once it is loaded, the class allows to index sentences as vectors.
    */
   class NNLMCorpora : public Referenced {
+    class Sentence; // forward declaration
   public:
     NNLMCorpora(const char *filename,
                 AprilUtils::LexClass *lex,
                 const uint32_t unk_id);
     ~NNLMCorpora();
 
-    const std::vector<uint32_t> &getSentence(size_t i) const;
+    NNLMCorpora::Sentence getSentence(size_t i) const;
     size_t getSentenceLength(size_t i) const;
     size_t getNumberOfSentences() const;
     size_t getVocabSize();
@@ -28,7 +29,23 @@ namespace LanguageModels {
     const char *filename;
     AprilUtils::SharedPtr<AprilUtils::LexClass> lex;
     const uint32_t unk_id;
-    std::vector< std::vector<uint32_t> > sentences;
+    std::vector<size_t> first_sentence_word;
+    size_t num_words, words_size;
+    uint32_t *words; // mmapped
+
+    /// This class is an array wrapper
+    class Sentence {
+    public:
+      Sentence(uint32_t *b, uint32_t *e);
+      Sentence(const Sentence &other);
+      const uint32_t *begin() const;
+      const uint32_t *end() const;
+      size_t size() const;
+      const uint32_t &operator[](size_t i) const;
+      Sentence &operator=(const Sentence &other); 
+    private:
+      const uint32_t *b, *e; // begin and end pointers
+    };
   };
 
 } // namespace LanguageModels

diff --git a/packages/nnlm.dataset/c_src/nnlm_dataset.cc b/packages/nnlm.dataset/c_src/nnlm_dataset.cc
@@ -19,8 +19,12 @@ using Basics::TokenSparseMatrixFloat;
 
 ////////////////////////////////////////////////////////////////////////////
 
+typedef std::vector<pair<int,int> > PairVector;
+
 namespace LanguageModels {
 
+  // TODO: Implement in a generic way which allows ignoring context cues,
+  // for instance, giving initial_word=0 and/or final_word=0
   NNLMDataSetToken::NNLMDataSetToken(const size_t offset, const size_t length,
                                      const uint32_t initial_word,
                                      const uint32_t final_word,
@@ -49,34 +53,48 @@ namespace LanguageModels {
   }
 
   pair<int,int> NNLMDataSetToken::getSentenceWordPair(int index) const {
+    // binary search of the sentence which contains the given pattern index
     auto sentence_it = upper_bound(first_word.begin(), first_word.end(),
                                    static_cast<uint32_t>(index)) - 1u;
+    // index of the sentence
     int s_index = static_cast<int>(sentence_it - first_word.begin());
+    // index of the word inside the sentence: it can be w_index<0 or
+    // w_index>=sentence_size, and these cases will be filled with context cues.
     int w_index = index - static_cast<int>(*sentence_it) + offset;
     return make_pair(s_index,w_index);
   }
-
+
+  TokenSparseMatrixFloat *NNLMDataSetToken::
+  buildSparseMatrixFloatToken(int i, const PairVector &sentence_word_pairs) const {
+    unsigned int bunch_size = sentence_word_pairs.size();
+    SharedPtr<FloatGPUMirroredMemoryBlock> ones = new FloatGPUMirroredMemoryBlock(bunch_size);
+    SharedPtr<Int32GPUMirroredMemoryBlock> indices = new Int32GPUMirroredMemoryBlock(bunch_size);
+    SharedPtr<Int32GPUMirroredMemoryBlock> first_index = new Int32GPUMirroredMemoryBlock(bunch_size+1u);
+    (*first_index)[0] = 0;
+    for (int j=0; j<static_cast<int>(bunch_size); ++j) {
+      auto s_it = corpora->getSentence(sentence_word_pairs[j].first).begin();
+      int w_pos = sentence_word_pairs[j].second + i;
+      (*ones)[j] = 1.0f;
+      (*first_index)[j+1] = j+1;
+      if (w_pos < 0) (*indices)[j] = initial_word - 1;
+      else if (w_pos >= static_cast<int>(corpora->getSentenceLength(sentence_word_pairs[j].first))) (*indices)[j] = final_word - 1;
+      else (*indices)[j] = s_it[w_pos] - 1;
+    }
+    SparseMatrixFloat *mat = new SparseMatrixFloat(bunch_size,
+                                                   static_cast<int>(lex_size),
+                                                   ones.get(), indices.get(),
+                                                   first_index.get());
+    return new TokenSparseMatrixFloat(mat);
+  }
+
   Token *NNLMDataSetToken::getPattern(int index) {
     SharedPtr<TokenBunchVector> pat = new TokenBunchVector(length);
     auto sentence_word_pair = getSentenceWordPair(index);
     auto s_it = corpora->getSentence(sentence_word_pair.first).begin();
     for (int i=0; i<static_cast<int>(length); ++i) {
-      SharedPtr<FloatGPUMirroredMemoryBlock> ones = new FloatGPUMirroredMemoryBlock(1u);
-      SharedPtr<Int32GPUMirroredMemoryBlock> indices = new Int32GPUMirroredMemoryBlock(1u);
-      SharedPtr<Int32GPUMirroredMemoryBlock> first_index = new Int32GPUMirroredMemoryBlock(2u);
-      int w_pos = sentence_word_pair.second + i;
-      (*ones)[0] = 1.0f;
-      (*first_index)[0] = 0;
-      (*first_index)[1] = 1;
-      if (w_pos < 0) (*indices)[0] = initial_word - 1;
-      else if (w_pos >= static_cast<int>(corpora->getSentenceLength(sentence_word_pair.first))) (*indices)[0] = final_word - 1;
-      else (*indices)[0] = s_it[w_pos] - 1;
-      SparseMatrixFloat *mat = new SparseMatrixFloat(1,
-                                                     static_cast<int>(lex_size),
-                                                     ones.get(), indices.get(),
-                                                     first_index.get());
-      Token *tk = new TokenSparseMatrixFloat(mat);
-      if (length == 1) return tk; // FIXME: refactor this code
+      Token *tk = buildSparseMatrixFloatToken(i, PairVector(1u, sentence_word_pair));
+      // WARNING: flow break here (early return of the bare token when length == 1)
+      if (length == 1) return tk;
       (*pat)[i] = tk;
     }
     return pat.weakRelease();
@@ -90,25 +108,9 @@ namespace LanguageModels {
     }
     SharedPtr<TokenBunchVector> pat = new TokenBunchVector(length);    
     for (int i=0; i<static_cast<int>(length); ++i) {
-      SharedPtr<FloatGPUMirroredMemoryBlock> ones = new FloatGPUMirroredMemoryBlock(bunch_size);
-      SharedPtr<Int32GPUMirroredMemoryBlock> indices = new Int32GPUMirroredMemoryBlock(bunch_size);
-      SharedPtr<Int32GPUMirroredMemoryBlock> first_index = new Int32GPUMirroredMemoryBlock(bunch_size+1u);
-      (*first_index)[0] = 0;
-      for (int j=0; j<static_cast<int>(bunch_size); ++j) {
-        auto s_it = corpora->getSentence(sentence_word_pairs[j].first).begin();
-        int w_pos = sentence_word_pairs[j].second + i;
-        (*ones)[j] = 1.0f;
-        (*first_index)[j+1] = j+1;
-        if (w_pos < 0) (*indices)[j] = initial_word - 1;
-        else if (w_pos >= static_cast<int>(corpora->getSentenceLength(sentence_word_pairs[j].first))) (*indices)[j] = final_word - 1;
-        else (*indices)[j] = s_it[w_pos] - 1;
-      }
-      SparseMatrixFloat *mat = new SparseMatrixFloat(bunch_size,
-                                                     static_cast<int>(lex_size),
-                                                     ones.get(), indices.get(),
-                                                     first_index.get());
-      Token *tk = new TokenSparseMatrixFloat(mat);
-      if (length == 1) return tk; // FIXME: refactor this code
+      Token *tk = buildSparseMatrixFloatToken(i, sentence_word_pairs);
+      // WARNING: flow break here (early return of the bare token when length == 1)
+      if (length == 1) return tk;
       (*pat)[i] = tk;
     }
     return pat.weakRelease();

diff --git a/packages/nnlm.dataset/c_src/nnlm_dataset.h b/packages/nnlm.dataset/c_src/nnlm_dataset.h
@@ -10,6 +10,14 @@
 
 namespace LanguageModels {
 
+  /**
+   * @brief Basics::DataSetToken for NNLMs training.
+   *
+   * This class behaves differently depending on the parameters @c offset and
+   * @c length . When @c length=1 every pattern is an AprilUtils::SparseMatrixFloat
+   * instance. When @c length>1 every pattern is a Basics::TokenBunchVector
+   * with @c length AprilUtils::SparseMatrixFloat instances.
+   */
   class NNLMDataSetToken : public Basics::DataSetToken {
   public:
     NNLMDataSetToken(const size_t offset, const size_t length,
@@ -33,6 +41,9 @@ namespace LanguageModels {
     std::vector<uint32_t> first_word; // corpora.getNumberOfSentences() + 1
 
     std::pair<int,int> getSentenceWordPair(int index) const;
+    Basics::TokenSparseMatrixFloat *
+    buildSparseMatrixFloatToken(int i, const std::vector<std::pair<int,int> > &
+                                sentence_word_pairs) const;
   };
 
 } // namespace LanguageModels