From 6b8d76685a579fb8f144db9de09b26b5a0f2889d Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Thu, 5 Jul 2018 09:11:16 +0900
Subject: [PATCH] Tiny refactoring

---
 src/postings/segment_postings.rs |   7 +--
 src/postings/serializer.rs       | 105 ++++++++++++++++++++++---------
 2 files changed, 77 insertions(+), 35 deletions(-)

diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index 6d7bc902e8..13f11d105b 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -4,7 +4,6 @@ use DocId;
 use common::BitSet;
 use common::HasLen;
 use compression::compressed_block_size;
-use directory::ReadOnlySource;
 use docset::{DocSet, SkipResult};
 use fst::Streamer;
 use postings::serializer::PostingsSerializer;
@@ -88,11 +87,9 @@ impl SegmentPostings {
                 .close_term()
                 .expect("In memory Serialization should never fail.");
         }
-
-        let data = ReadOnlySource::from(buffer);
         let block_segment_postings = BlockSegmentPostings::from_data(
             docs.len(),
-            OwnedRead::new(data),
+            OwnedRead::new(buffer),
             FreqReadingOption::NoFreq,
         );
         SegmentPostings::from_block_postings(block_segment_postings, None)
@@ -447,7 +444,7 @@ impl BlockSegmentPostings {
             freq_decoder: BlockDecoder::with_val(1),
             freq_reading_option: FreqReadingOption::NoFreq,
 
-            remaining_data: OwnedRead::new(ReadOnlySource::empty()),
+            remaining_data: OwnedRead::new(vec![]),
             doc_offset: 0,
             doc_freq: 0,
         }
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index 5368fc38f5..86584f2911 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -239,13 +239,60 @@ impl<'a> FieldSerializer<'a> {
     }
 }
 
+struct Block {
+    doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
+    term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
+    len: usize
+}
+
+impl Block {
+    fn new() -> Self {
+        Block {
+            doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
+            term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
+            len: 0
+        }
+    }
+
+    fn doc_ids(&self) -> &[DocId] {
+        &self.doc_ids[..self.len]
+    }
+
+    fn term_freqs(&self) -> &[u32] {
+        &self.term_freqs[..self.len]
+    }
+
+    fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    fn append_doc(&mut self, doc: DocId, term_freq: u32) {
+        let len = self.len;
+        self.doc_ids[len] = doc;
+        self.term_freqs[len] = term_freq;
+        self.len = len + 1;
+    }
+
+    fn is_full(&self) -> bool {
+        self.len == COMPRESSION_BLOCK_SIZE
+    }
+
+    fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    fn last_doc(&self) -> DocId {
+        assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
+        self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
+    }
+}
+
 pub struct PostingsSerializer<W: Write> {
     postings_write: CountingWriter<W>,
     last_doc_id_encoded: u32,
 
     block_encoder: BlockEncoder,
-    doc_ids: Vec<DocId>,
-    term_freqs: Vec<u32>,
+    block: Box<Block>,
 
     termfreq_enabled: bool,
 }
@@ -256,41 +303,41 @@ impl<W: Write> PostingsSerializer<W> {
             postings_write: CountingWriter::wrap(write),
 
             block_encoder: BlockEncoder::new(),
-            doc_ids: vec![],
-            term_freqs: vec![],
+            block: Box::new(Block::new()),
 
             last_doc_id_encoded: 0u32,
             termfreq_enabled,
         }
     }
 
-    pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
-        self.doc_ids.push(doc_id);
+    fn write_block(&mut self) -> io::Result<()> {
+        {
+            // encode the doc ids
+            let block_encoded: &[u8] = self.block_encoder
+                .compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
+            self.last_doc_id_encoded = self.block.last_doc();
+            self.postings_write.write_all(block_encoded)?;
+        }
         if self.termfreq_enabled {
-            self.term_freqs.push(term_freq as u32);
+            // encode the term_freqs
+            let block_encoded: &[u8] =
+                self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
+            self.postings_write.write_all(block_encoded)?;
         }
-        if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
-            {
-                // encode the doc ids
-                let block_encoded: &[u8] = self.block_encoder
-                    .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
-                self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
-                self.postings_write.write_all(block_encoded)?;
-            }
-            if self.termfreq_enabled {
-                // encode the term_freqs
-                let block_encoded: &[u8] =
-                    self.block_encoder.compress_block_unsorted(&self.term_freqs);
-                self.postings_write.write_all(block_encoded)?;
-                self.term_freqs.clear();
-            }
-            self.doc_ids.clear();
+        self.block.clear();
+        Ok(())
+    }
+
+    pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
+        self.block.append_doc(doc_id, term_freq);
+        if self.block.is_full() {
+            self.write_block()?;
         }
         Ok(())
     }
 
     pub fn close_term(&mut self) -> io::Result<()> {
-        if !self.doc_ids.is_empty() {
+        if !self.block.is_empty() {
             // we have doc ids waiting to be written
             // this happens when the number of doc ids is
             // not a perfect multiple of our block size.
@@ -299,17 +346,16 @@ impl<W: Write> PostingsSerializer<W> {
             // using variable int encoding.
             {
                 let block_encoded = self.block_encoder
-                    .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
+                    .compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
                 self.postings_write.write_all(block_encoded)?;
-                self.doc_ids.clear();
             }
             // ... Idem for term frequencies
             if self.termfreq_enabled {
                 let block_encoded = self.block_encoder
-                    .compress_vint_unsorted(&self.term_freqs[..]);
+                    .compress_vint_unsorted(self.block.term_freqs());
                 self.postings_write.write_all(block_encoded)?;
-                self.term_freqs.clear();
             }
+            self.block.clear();
         }
         Ok(())
     }
@@ -323,8 +369,7 @@ impl<W: Write> PostingsSerializer<W> {
     }
 
     fn clear(&mut self) {
-        self.doc_ids.clear();
-        self.term_freqs.clear();
+        self.block.clear();
         self.last_doc_id_encoded = 0;
     }
 }