Skip to content

Commit

Permalink
Tiny refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
fulmicoton committed Jul 5, 2018
1 parent ce5683f commit 6b8d766
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 35 deletions.
7 changes: 2 additions & 5 deletions src/postings/segment_postings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use DocId;
use common::BitSet;
use common::HasLen;
use compression::compressed_block_size;
use directory::ReadOnlySource;
use docset::{DocSet, SkipResult};
use fst::Streamer;
use postings::serializer::PostingsSerializer;
Expand Down Expand Up @@ -88,11 +87,9 @@ impl SegmentPostings {
.close_term()
.expect("In memory Serialization should never fail.");
}

let data = ReadOnlySource::from(buffer);
let block_segment_postings = BlockSegmentPostings::from_data(
docs.len(),
OwnedRead::new(data),
OwnedRead::new(buffer),
FreqReadingOption::NoFreq,
);
SegmentPostings::from_block_postings(block_segment_postings, None)
Expand Down Expand Up @@ -447,7 +444,7 @@ impl BlockSegmentPostings {
freq_decoder: BlockDecoder::with_val(1),
freq_reading_option: FreqReadingOption::NoFreq,

remaining_data: OwnedRead::new(ReadOnlySource::empty()),
remaining_data: OwnedRead::new(vec![]),
doc_offset: 0,
doc_freq: 0,
}
Expand Down
105 changes: 75 additions & 30 deletions src/postings/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,13 +239,60 @@ impl<'a> FieldSerializer<'a> {
}
}

/// Fixed-capacity buffer holding one compression block worth of postings:
/// parallel arrays of doc ids and term frequencies, plus the count of
/// slots currently in use.
struct Block {
    doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
    term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
    // Number of (doc id, term freq) pairs currently stored.
    len: usize,
}

impl Block {
    /// Creates an empty block with zero-initialized storage.
    fn new() -> Self {
        Block {
            doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
            term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
            len: 0,
        }
    }

    /// Doc ids appended so far (the filled prefix only).
    fn doc_ids(&self) -> &[DocId] {
        &self.doc_ids[..self.len]
    }

    /// Term frequencies appended so far (the filled prefix only).
    fn term_freqs(&self) -> &[u32] {
        &self.term_freqs[..self.len]
    }

    /// Marks the block as empty. The underlying arrays are not zeroed;
    /// stale entries are simply overwritten by later `append_doc` calls.
    fn clear(&mut self) {
        self.len = 0;
    }

    /// Appends one (doc id, term frequency) pair.
    ///
    /// The caller must flush the block before it overflows; appending to
    /// a full block is a bug.
    fn append_doc(&mut self, doc: DocId, term_freq: u32) {
        debug_assert!(!self.is_full(), "append_doc called on a full block");
        let len = self.len;
        self.doc_ids[len] = doc;
        self.term_freqs[len] = term_freq;
        self.len = len + 1;
    }

    /// Returns true if the block holds `COMPRESSION_BLOCK_SIZE` pairs.
    fn is_full(&self) -> bool {
        self.len == COMPRESSION_BLOCK_SIZE
    }

    /// Returns true if the block holds no pairs.
    fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Last doc id of a *full* block, used as the delta base for the
    /// next block of sorted doc ids. Calling this on a partially-filled
    /// block is a bug, hence the assertion.
    fn last_doc(&self) -> DocId {
        assert_eq!(self.len, COMPRESSION_BLOCK_SIZE);
        self.doc_ids[COMPRESSION_BLOCK_SIZE - 1]
    }
}

/// Serializes the postings (doc ids and, optionally, term frequencies)
/// of one term, compressing them one `COMPRESSION_BLOCK_SIZE`-sized
/// block at a time.
pub struct PostingsSerializer<W: Write> {
    postings_write: CountingWriter<W>,
    // Last doc id emitted in a bit-packed block; it is the delta base
    // for the next sorted block of doc ids.
    last_doc_id_encoded: u32,

    block_encoder: BlockEncoder,
    // (doc id, term freq) pairs buffered until a full block can be flushed.
    block: Box<Block>,

    // When false, term frequencies are buffered but never serialized.
    termfreq_enabled: bool,
}
Expand All @@ -256,41 +303,41 @@ impl<W: Write> PostingsSerializer<W> {
postings_write: CountingWriter::wrap(write),

block_encoder: BlockEncoder::new(),
doc_ids: vec![],
term_freqs: vec![],
block: Box::new(Block::new()),

last_doc_id_encoded: 0u32,
termfreq_enabled,
}
}

pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.doc_ids.push(doc_id);
fn write_block(&mut self) -> io::Result<()> {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.postings_write.write_all(block_encoded)?;
}
if self.termfreq_enabled {
self.term_freqs.push(term_freq as u32);
// encode the term_freqs
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
}
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.doc_ids.clear();
self.block.clear();
Ok(())
}

pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.block.append_doc(doc_id, term_freq);
if self.block.is_full() {
self.write_block()?;
}
Ok(())
}

pub fn close_term(&mut self) -> io::Result<()> {
if !self.doc_ids.is_empty() {
if !self.block.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
Expand All @@ -299,17 +346,16 @@ impl<W: Write> PostingsSerializer<W> {
// using variable int encoding.
{
let block_encoded = self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.block.clear();
}
Ok(())
}
Expand All @@ -323,8 +369,7 @@ impl<W: Write> PostingsSerializer<W> {
}

fn clear(&mut self) {
self.doc_ids.clear();
self.term_freqs.clear();
self.block.clear();
self.last_doc_id_encoded = 0;
}
}
Expand Down

0 comments on commit 6b8d766

Please sign in to comment.